diff --git a/.env b/.env index fd07742b..92f7904b 100644 --- a/.env +++ b/.env @@ -4,7 +4,7 @@ ENVIRONMENT=dev DB_HOST=localhost DB_USER=postgres DB_PORT=5432 -DB_PASSWORD=admin +DB_PASSWORD=quick DB_NAME=QuickGpt SUPER_ADMIN_EMAIL=superadmin@email.com @@ -21,4 +21,5 @@ SMTP_SENDER_EMAIL=shresthasaurab030@outlook.com SMTP_USERNAME=shresthasaurab030 SMTP_PASSWORD=huurxwxeorxjorzw -LDAP_SERVER=ldap://192.168.101.111 \ No newline at end of file +LDAP_SERVER=ldap://192.168.101.111 +LDAP_ENABLE=False \ No newline at end of file diff --git a/private_gpt/components/ocr_components/table_ocr_api.py b/private_gpt/components/ocr_components/table_ocr_api.py index 339cb365..da53fd85 100644 --- a/private_gpt/components/ocr_components/table_ocr_api.py +++ b/private_gpt/components/ocr_components/table_ocr_api.py @@ -1,64 +1,87 @@ -from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request -from fastapi.responses import FileResponse -from pydantic import BaseModel -from docx import Document import os import fitz import requests +from docx import Document + +from fastapi import HTTPException, status, File, UploadFile, APIRouter, Request, Security, Depends +from sqlalchemy.orm import Session + +from private_gpt.users import models +from private_gpt.users.api import deps +from private_gpt.users.constants.role import Role from private_gpt.components.ocr_components.TextExtraction import ImageToTable from private_gpt.components.ocr_components.table_ocr import GetOCRText -from private_gpt.server.ingest.ingest_router import ingest_file -upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads" +from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse +from private_gpt.constants import OCR_UPLOAD + + +pdf_router = APIRouter(prefix="/pdf", tags=["ocr"]) -pdf_router = APIRouter(prefix="/pdf", tags=["auth"]) @pdf_router.post("/pdf_ocr") -async def get_pdf_ocr(request: Request, file: UploadFile = File(...)): - UPLOAD_DIR = upload_dir +async def get_pdf_ocr( + request: Request, + db: Session = Depends(deps.get_db), + file: UploadFile = File(...), + current_user: models.User = Security( + deps.get_current_user, + scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"]], + ) +): + UPLOAD_DIR = OCR_UPLOAD try: contents = await file.read() - except Exception: - return {"message": "There was an error uploading the file"} - - # Save the uploaded file to the dir + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"There was an error reading the file: {e}" + ) + file_path = os.path.join(UPLOAD_DIR, file.filename) with open(file_path, "wb") as f: f.write(contents) doc = Document() - ocr = GetOCRText() + ocr = GetOCRText() img_tab = ImageToTable() pdf_doc = fitz.open(file_path) - for page_index in range(len(pdf_doc)): # iterate over pdf pages - page = pdf_doc[page_index] # get the page + # try: + for page_index in range(len(pdf_doc)): + page = pdf_doc[page_index] image_list = page.get_images() - for image_index, img in enumerate(image_list, start=1): # enumerate the image list + if not image_list: + continue + + for image_index, img in enumerate(image_list, start=1): xref = img[0] pix = fitz.Pixmap(pdf_doc, xref) if pix.n - pix.alpha > 3: - pix = fitz.Pixmap(fitz.csRGB, pix)("RGB", [pix.width, pix.height], pix.samples) - image_path = "page_%s-image_%s.png" % (page_index, image_index) - pix.save("page_%s-image_%s.png" % (page_index, image_index)) # save the image as png - pixs = None - extracted_text = ocr.extract_text(image_file=True, file_path=image_path) + pix = fitz.Pixmap(fitz.csRGB, pix)( + "RGB", [pix.width, pix.height], pix.samples) + + image_path = f"page_{page_index}-image_{image_index}.png" + pix.save(image_path) + extracted_text = ocr.extract_text( + image_file=True, file_path=image_path) doc.add_paragraph(extracted_text) table_data = img_tab.table_to_csv(image_path) - print(table_data) doc.add_paragraph(table_data) - # remove image file + os.remove(image_path) - save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx") + save_path = os.path.join( + UPLOAD_DIR, f"{file.filename.replace('.pdf', '_ocr.docx')}") doc.save(save_path) - with open(save_path,'rb') as f: + with open(save_path, 'rb') as f: file_content = f.read() - starfleet_data = { - "filename": f.name, - "file_content": file_content, - "file_type": "multipart/form-data" - } - requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"}) - return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf") - + if not file_content: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Empty file content after processing OCR" + ) + ingested_documents = await common_ingest_logic( + request=request,db=db, ocr_file=save_path, current_user=current_user + ) + return IngestResponse(object="list", model="private-gpt", data=ingested_documents) diff --git a/private_gpt/constants.py b/private_gpt/constants.py index fc1e4f87..d9ca9d1c 100644 --- a/private_gpt/constants.py +++ b/private_gpt/constants.py @@ -3,4 +3,6 @@ from pathlib import Path PROJECT_ROOT_PATH: Path = Path(__file__).parents[1] script_dir = os.path.dirname(os.path.abspath(__file__)) -UPLOAD_DIR = os.path.join(script_dir, "static") +UPLOAD_DIR = os.path.join(script_dir, "static") # Actual upload path for uploaded file + +OCR_UPLOAD = os.path.join(script_dir, 'uploads') # temporary upload path for scanned pdf file diff --git a/private_gpt/server/ingest/ingest_router.py b/private_gpt/server/ingest/ingest_router.py index 871527c2..731ffce8 100644 --- a/private_gpt/server/ingest/ingest_router.py +++ b/private_gpt/server/ingest/ingest_router.py @@ -223,48 +223,48 @@ def ingest_file( ) - - -def ingest_pdf_file( - request: Request, - db: Session = Depends(deps.get_db), - file: UploadFile = File(...), -) -> IngestResponse: - """Ingests and processes a file, storing its chunks to be used as context.""" +async def common_ingest_logic( + request: Request, + db: Session, + ocr_file, + current_user, +): service = request.state.injector.get(IngestService) - try: - file_ingested = crud.documents.get_by_filename(db, file_name=file.filename) - if file_ingested: - raise HTTPException( - status_code=status.HTTP_409_CONFLICT, - detail="File already exists. Choose a different file.", - ) - - if file.filename is None: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail="No file name provided", - ) + with open(ocr_file, 'rb') as file: + file_name = Path(ocr_file).name + upload_path = Path(f"{UPLOAD_DIR}/{file_name}") - try: - docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id) + file_ingested = crud.documents.get_by_filename( + db, file_name=file_name) + if file_ingested: + raise HTTPException( + status_code=409, + detail="File already exists. Choose a different file.", + ) + + if file_name is None: + raise HTTPException( + status_code=400, + detail="No file name provided", + ) + + docs_in = schemas.DocumentCreate( + filename=file_name, uploaded_by=current_user.id) crud.documents.create(db=db, obj_in=docs_in) - except Exception as e: - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="Unable to upload file.", - ) - upload_path = Path(f"{UPLOAD_DIR}/{file.filename}") - with open(upload_path, "wb") as f: - f.write(file.file.read()) + with open(upload_path, "wb") as f: + f.write(file.read()) - with open(upload_path, "rb") as f: - ingested_documents = service.ingest_bin_data(file.filename, f) - logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.") + # Ingest binary data + file.seek(0) # Move the file pointer back to the beginning + ingested_documents = service.ingest_bin_data(file_name, file) + + logger.info( + f"{file_name} is uploaded by the {current_user.fullname}.") + + return ingested_documents - return IngestResponse(object="list", model="private-gpt", data=ingested_documents) except HTTPException: raise @@ -272,6 +272,6 @@ def ingest_pdf_file( logger.error(f"There was an error uploading the file(s): {str(e)}") print("ERROR: ", e) raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + status_code=500, detail="Internal Server Error: Unable to ingest file.", ) diff --git a/private_gpt/users/api/v1/routers/auth.py b/private_gpt/users/api/v1/routers/auth.py index feadb1fe..bf58744f 100644 --- a/private_gpt/users/api/v1/routers/auth.py +++ b/private_gpt/users/api/v1/routers/auth.py @@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas from private_gpt.users.utils import send_registration_email, Ldap LDAP_SERVER = settings.LDAP_SERVER -LDAP_ENABLE = False +LDAP_ENABLE = settings.LDAP_ENABLE router = APIRouter(prefix="/auth", tags=["auth"]) diff --git a/private_gpt/users/core/config.py b/private_gpt/users/core/config.py index ebf21b46..34d668cc 100644 --- a/private_gpt/users/core/config.py +++ b/private_gpt/users/core/config.py @@ -8,7 +8,7 @@ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://{username}:{password}@{host}:{p port='5432', db_name='QuickGpt', username='postgres', - password="admin", + password="quick", ) class Settings(BaseSettings): @@ -38,6 +38,8 @@ class Settings(BaseSettings): SMTP_PASSWORD: str LDAP_SERVER: str + LDAP_ENABLE: str + @property def SQLALCHEMY_DATABASE_URI(self) -> str: return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"