Solved the ingestion issue for OCR'd PDFs

2025-06-29 16:58:00 +00:00 · 2024-02-17 10:23:57 +05:45 · 2024-02-17 10:23:57 +05:45 · c8b39c898c
commit c8b39c898c
parent d849ee76f4
6 changed files with 103 additions and 75 deletions
--- a/.env
+++ b/.env
@ -4,7 +4,7 @@ ENVIRONMENT=dev
 DB_HOST=localhost
 DB_USER=postgres
 DB_PORT=5432
-DB_PASSWORD=admin
+DB_PASSWORD=quick
 DB_NAME=QuickGpt

 SUPER_ADMIN_EMAIL=superadmin@email.com
@ -21,4 +21,5 @@ SMTP_SENDER_EMAIL=shresthasaurab030@outlook.com
 SMTP_USERNAME=shresthasaurab030
 SMTP_PASSWORD=huurxwxeorxjorzw

-LDAP_SERVER=ldap://192.168.101.111
+LDAP_SERVER=ldap://192.168.101.111
+LDAP_ENABLE=False
--- a/private_gpt/components/ocr_components/table_ocr_api.py
+++ b/private_gpt/components/ocr_components/table_ocr_api.py
@ -1,64 +1,87 @@
-from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
-from fastapi.responses import FileResponse
-from pydantic import BaseModel
-from docx import Document
 import os
 import fitz
 import requests
+from docx import Document
+
+from fastapi import HTTPException, status, File, UploadFile, APIRouter, Request, Security, Depends
+from sqlalchemy.orm import Session
+
+from private_gpt.users import models
+from private_gpt.users.api import deps
+from private_gpt.users.constants.role import Role
 from private_gpt.components.ocr_components.TextExtraction import ImageToTable
 from private_gpt.components.ocr_components.table_ocr import GetOCRText
-from private_gpt.server.ingest.ingest_router import ingest_file
-upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
+from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
+from private_gpt.constants import OCR_UPLOAD
+
+
+pdf_router = APIRouter(prefix="/pdf", tags=["ocr"])

-pdf_router = APIRouter(prefix="/pdf", tags=["auth"])

@pdf_router.post("/pdf_ocr")
-async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
-    UPLOAD_DIR = upload_dir
+async def get_pdf_ocr(
+    request: Request,
+    db: Session = Depends(deps.get_db),
+    file: UploadFile = File(...),
+    current_user: models.User = Security(
+        deps.get_current_user,
+        scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"]],
+    )
+):
+    UPLOAD_DIR = OCR_UPLOAD
    try:
        contents = await file.read()
-    except Exception:
-        return {"message": "There was an error uploading the file"}
-    
-    # Save the uploaded file to the dir
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error reading the file: {e}"
+        )
+
    file_path = os.path.join(UPLOAD_DIR, file.filename)
    with open(file_path, "wb") as f:
        f.write(contents)

    doc = Document()
-    ocr = GetOCRText() 
+    ocr = GetOCRText()
    img_tab = ImageToTable()
    pdf_doc = fitz.open(file_path)
-    for page_index in range(len(pdf_doc)): # iterate over pdf pages
-        page = pdf_doc[page_index] # get the page
+    # try:
+    for page_index in range(len(pdf_doc)):
+        page = pdf_doc[page_index]
        image_list = page.get_images()

-        for image_index, img in enumerate(image_list, start=1): # enumerate the image list
+        if not image_list:
+            continue
+
+        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]
            pix = fitz.Pixmap(pdf_doc, xref)

            if pix.n - pix.alpha > 3:
-                pix = fitz.Pixmap(fitz.csRGB, pix)("RGB", [pix.width, pix.height], pix.samples)
-            image_path = "page_%s-image_%s.png" % (page_index, image_index)
-            pix.save("page_%s-image_%s.png" % (page_index, image_index)) # save the image as png
-            pixs = None
-            extracted_text = ocr.extract_text(image_file=True, file_path=image_path)
+                pix = fitz.Pixmap(fitz.csRGB, pix)(
+                    "RGB", [pix.width, pix.height], pix.samples)
+
+            image_path = f"page_{page_index}-image_{image_index}.png"
+            pix.save(image_path)
+            extracted_text = ocr.extract_text(
+                image_file=True, file_path=image_path)
            doc.add_paragraph(extracted_text)
            table_data = img_tab.table_to_csv(image_path)
-            print(table_data)
            doc.add_paragraph(table_data)
-            # remove image file
+            os.remove(image_path) 

-    save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
+    save_path = os.path.join(
+        UPLOAD_DIR, f"{file.filename.replace('.pdf', '_ocr.docx')}")
    doc.save(save_path)

-    with open(save_path,'rb') as f:
+    with open(save_path, 'rb') as f:
        file_content = f.read()
-        starfleet_data = {
-            "filename": f.name,
-            "file_content": file_content,
-            "file_type": "multipart/form-data"
-        }
-    requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
-    return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
-
+        if not file_content:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail="Empty file content after processing OCR"
+            )
+    ingested_documents = await common_ingest_logic(
+        request=request,db=db, ocr_file=save_path, current_user=current_user
+    )
+    return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
--- a/private_gpt/constants.py
+++ b/private_gpt/constants.py
@ -3,4 +3,6 @@ from pathlib import Path

 PROJECT_ROOT_PATH: Path = Path(__file__).parents[1]
 script_dir = os.path.dirname(os.path.abspath(__file__))
-UPLOAD_DIR = os.path.join(script_dir, "static")
+UPLOAD_DIR = os.path.join(script_dir, "static")  # Actual upload path for uploaded file
+
+OCR_UPLOAD = os.path.join(script_dir, 'uploads') # temporary upload path for scanned pdf file
--- a/private_gpt/server/ingest/ingest_router.py
+++ b/private_gpt/server/ingest/ingest_router.py
@ -223,48 +223,48 @@ def ingest_file(
        )


-
-
-def ingest_pdf_file(
-        request: Request,
-        db: Session = Depends(deps.get_db),
-        file: UploadFile = File(...),
-) -> IngestResponse:
-    """Ingests and processes a file, storing its chunks to be used as context."""
+async def common_ingest_logic(
+    request: Request,
+    db: Session,
+    ocr_file,
+    current_user,
+):
    service = request.state.injector.get(IngestService)
-
    try:
-        file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
-        if file_ingested:
-            raise HTTPException(
-                status_code=status.HTTP_409_CONFLICT,
-                detail="File already exists. Choose a different file.",
-            )
-        
-        if file.filename is None:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail="No file name provided",
-            )
+        with open(ocr_file, 'rb') as file:
+            file_name = Path(ocr_file).name
+            upload_path = Path(f"{UPLOAD_DIR}/{file_name}")

-        try:
-            docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
+            file_ingested = crud.documents.get_by_filename(
+                db, file_name=file_name)
+            if file_ingested:
+                raise HTTPException(
+                    status_code=409,
+                    detail="File already exists. Choose a different file.",
+                )
+
+            if file_name is None:
+                raise HTTPException(
+                    status_code=400,
+                    detail="No file name provided",
+                )
+
+            docs_in = schemas.DocumentCreate(
+                filename=file_name, uploaded_by=current_user.id)
            crud.documents.create(db=db, obj_in=docs_in)
-        except Exception as e:
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail="Unable to upload file.",
-            )
-        upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")

-        with open(upload_path, "wb") as f:
-            f.write(file.file.read())
+            with open(upload_path, "wb") as f:
+                f.write(file.read())

-        with open(upload_path, "rb") as f:
-            ingested_documents = service.ingest_bin_data(file.filename, f)
-        logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
+            # Ingest binary data
+            file.seek(0)  # Move the file pointer back to the beginning
+            ingested_documents = service.ingest_bin_data(file_name, file)
+
+        logger.info(
+            f"{file_name} is uploaded by the {current_user.fullname}.")
+
+        return ingested_documents

-        return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
    except HTTPException:
        raise

@ -272,6 +272,6 @@ def ingest_pdf_file(
        logger.error(f"There was an error uploading the file(s): {str(e)}")
        print("ERROR: ", e)
        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            status_code=500,
            detail="Internal Server Error: Unable to ingest file.",
        )
--- a/private_gpt/users/api/v1/routers/auth.py
+++ b/private_gpt/users/api/v1/routers/auth.py
@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
 from private_gpt.users.utils import send_registration_email, Ldap

 LDAP_SERVER = settings.LDAP_SERVER
-LDAP_ENABLE = False
+LDAP_ENABLE = settings.LDAP_ENABLE

 router = APIRouter(prefix="/auth", tags=["auth"])

--- a/private_gpt/users/core/config.py
+++ b/private_gpt/users/core/config.py
@ -8,7 +8,7 @@ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://{username}:{password}@{host}:{p
    port='5432',
    db_name='QuickGpt',
    username='postgres',
-    password="admin",
+    password="quick",
 )

 class Settings(BaseSettings):
@ -38,6 +38,8 @@ class Settings(BaseSettings):
    SMTP_PASSWORD: str

    LDAP_SERVER: str
+    LDAP_ENABLE: str
+
    @property
    def SQLALCHEMY_DATABASE_URI(self) -> str:
        return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"