Solved the ingestion issue for OCR'd PDF files

2025-08-15 14:13:47 +00:00 · 2024-02-17 10:23:57 +05:45 · 2024-02-17 10:23:57 +05:45 · c8b39c898c
commit c8b39c898c
parent d849ee76f4
6 changed files with 103 additions and 75 deletions
--- a/.env
+++ b/.env
@ -4,7 +4,7 @@ ENVIRONMENT=dev
 DB_HOST=localhost
 DB_USER=postgres
 DB_PORT=5432
-DB_PASSWORD=admin
+DB_PASSWORD=quick
 DB_NAME=QuickGpt
 SUPER_ADMIN_EMAIL=superadmin@email.com
@ -21,4 +21,5 @@ SMTP_SENDER_EMAIL=shresthasaurab030@outlook.com
 SMTP_USERNAME=shresthasaurab030
 SMTP_PASSWORD=huurxwxeorxjorzw
-LDAP_SERVER=ldap://192.168.101.111
+LDAP_SERVER=ldap://192.168.101.111
 LDAP_ENABLE=False
--- a/private_gpt/components/ocr_components/table_ocr_api.py
+++ b/private_gpt/components/ocr_components/table_ocr_api.py
@ -1,64 +1,87 @@
 from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 from docx import Document
 import os
 import fitz
 import requests
 from docx import Document
 from fastapi import HTTPException, status, File, UploadFile, APIRouter, Request, Security, Depends
 from sqlalchemy.orm import Session
 from private_gpt.users import models
 from private_gpt.users.api import deps
 from private_gpt.users.constants.role import Role
 from private_gpt.components.ocr_components.TextExtraction import ImageToTable
 from private_gpt.components.ocr_components.table_ocr import GetOCRText
-from private_gpt.server.ingest.ingest_router import ingest_file
+from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
-upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
+from private_gpt.constants import OCR_UPLOAD
 pdf_router = APIRouter(prefix="/pdf", tags=["ocr"])
 pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
@pdf_router.post("/pdf_ocr")
-async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
+async def get_pdf_ocr(
-    UPLOAD_DIR = upload_dir
+    request: Request,
    db: Session = Depends(deps.get_db),
    file: UploadFile = File(...),
    current_user: models.User = Security(
        deps.get_current_user,
        scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"]],
    )
 ):
    UPLOAD_DIR = OCR_UPLOAD
    try:
        contents = await file.read()
-    except Exception:
+    except Exception as e:
-        return {"message": "There was an error uploading the file"}
+        raise HTTPException(
-    
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-    # Save the uploaded file to the dir
+            detail=f"There was an error reading the file: {e}"
        )
    file_path = os.path.join(UPLOAD_DIR, file.filename)
    with open(file_path, "wb") as f:
        f.write(contents)
    doc = Document()
-    ocr = GetOCRText() 
+    ocr = GetOCRText()
    img_tab = ImageToTable()
    pdf_doc = fitz.open(file_path)
-    for page_index in range(len(pdf_doc)): # iterate over pdf pages
+    # try:
-        page = pdf_doc[page_index] # get the page
+    for page_index in range(len(pdf_doc)):
        page = pdf_doc[page_index]
        image_list = page.get_images()
-        for image_index, img in enumerate(image_list, start=1): # enumerate the image list
+        if not image_list:
            continue
        for image_index, img in enumerate(image_list, start=1):
            xref = img[0]
            pix = fitz.Pixmap(pdf_doc, xref)
            if pix.n - pix.alpha > 3:
-                pix = fitz.Pixmap(fitz.csRGB, pix)("RGB", [pix.width, pix.height], pix.samples)
+                pix = fitz.Pixmap(fitz.csRGB, pix)(
-            image_path = "page_%s-image_%s.png" % (page_index, image_index)
+                    "RGB", [pix.width, pix.height], pix.samples)
-            pix.save("page_%s-image_%s.png" % (page_index, image_index)) # save the image as png
+
-            pixs = None
+            image_path = f"page_{page_index}-image_{image_index}.png"
-            extracted_text = ocr.extract_text(image_file=True, file_path=image_path)
+            pix.save(image_path)
            extracted_text = ocr.extract_text(
                image_file=True, file_path=image_path)
            doc.add_paragraph(extracted_text)
            table_data = img_tab.table_to_csv(image_path)
            print(table_data)
            doc.add_paragraph(table_data)
-            # remove image file
+            os.remove(image_path) 
-    save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
+    save_path = os.path.join(
        UPLOAD_DIR, f"{file.filename.replace('.pdf', '_ocr.docx')}")
    doc.save(save_path)
-    with open(save_path,'rb') as f:
+    with open(save_path, 'rb') as f:
        file_content = f.read()
-        starfleet_data = {
+        if not file_content:
-            "filename": f.name,
+            raise HTTPException(
-            "file_content": file_content,
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            "file_type": "multipart/form-data"
+                detail="Empty file content after processing OCR"
-        }
+            )
-    requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
+    ingested_documents = await common_ingest_logic(
-    return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
+        request=request,db=db, ocr_file=save_path, current_user=current_user
-
+    )
    return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
--- a/private_gpt/constants.py
+++ b/private_gpt/constants.py
@ -3,4 +3,6 @@ from pathlib import Path
 PROJECT_ROOT_PATH: Path = Path(__file__).parents[1]
 script_dir = os.path.dirname(os.path.abspath(__file__))
-UPLOAD_DIR = os.path.join(script_dir, "static")
+UPLOAD_DIR = os.path.join(script_dir, "static")  # Actual upload path for uploaded file
 OCR_UPLOAD = os.path.join(script_dir, 'uploads') # temporary upload path for scanned pdf file
--- a/private_gpt/server/ingest/ingest_router.py
+++ b/private_gpt/server/ingest/ingest_router.py
@ -223,48 +223,48 @@ def ingest_file(
        )
-
+async def common_ingest_logic(
-
+    request: Request,
-def ingest_pdf_file(
+    db: Session,
-        request: Request,
+    ocr_file,
-        db: Session = Depends(deps.get_db),
+    current_user,
-        file: UploadFile = File(...),
+):
 ) -> IngestResponse:
    """Ingests and processes a file, storing its chunks to be used as context."""
    service = request.state.injector.get(IngestService)
    try:
-        file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
+        with open(ocr_file, 'rb') as file:
-        if file_ingested:
+            file_name = Path(ocr_file).name
-            raise HTTPException(
+            upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
                status_code=status.HTTP_409_CONFLICT,
                detail="File already exists. Choose a different file.",
            )
        if file.filename is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="No file name provided",
            )
-        try:
+            file_ingested = crud.documents.get_by_filename(
-            docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
+                db, file_name=file_name)
            if file_ingested:
                raise HTTPException(
                    status_code=409,
                    detail="File already exists. Choose a different file.",
                )
            if file_name is None:
                raise HTTPException(
                    status_code=400,
                    detail="No file name provided",
                )
            docs_in = schemas.DocumentCreate(
                filename=file_name, uploaded_by=current_user.id)
            crud.documents.create(db=db, obj_in=docs_in)
        except Exception as e:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Unable to upload file.",
            )
        upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
-        with open(upload_path, "wb") as f:
+            with open(upload_path, "wb") as f:
-            f.write(file.file.read())
+                f.write(file.read())
-        with open(upload_path, "rb") as f:
+            # Ingest binary data
-            ingested_documents = service.ingest_bin_data(file.filename, f)
+            file.seek(0)  # Move the file pointer back to the beginning
-        logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
+            ingested_documents = service.ingest_bin_data(file_name, file)
        logger.info(
            f"{file_name} is uploaded by the {current_user.fullname}.")
        return ingested_documents
        return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
    except HTTPException:
        raise
@ -272,6 +272,6 @@ def ingest_pdf_file(
        logger.error(f"There was an error uploading the file(s): {str(e)}")
        print("ERROR: ", e)
        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            status_code=500,
            detail="Internal Server Error: Unable to ingest file.",
        )
--- a/private_gpt/users/api/v1/routers/auth.py
+++ b/private_gpt/users/api/v1/routers/auth.py
@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
 from private_gpt.users.utils import send_registration_email, Ldap
 LDAP_SERVER = settings.LDAP_SERVER
-LDAP_ENABLE = False
+LDAP_ENABLE = settings.LDAP_ENABLE
 router = APIRouter(prefix="/auth", tags=["auth"])
--- a/private_gpt/users/core/config.py
+++ b/private_gpt/users/core/config.py
@ -8,7 +8,7 @@ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://{username}:{password}@{host}:{p
    port='5432',
    db_name='QuickGpt',
    username='postgres',
-    password="admin",
+    password="quick",
 )
 class Settings(BaseSettings):
@ -38,6 +38,8 @@ class Settings(BaseSettings):
    SMTP_PASSWORD: str
    LDAP_SERVER: str
    LDAP_ENABLE: str
    @property
    def SQLALCHEMY_DATABASE_URI(self) -> str:
        return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"