Solved the ingestion issue for OCR'd PDFs

This commit is contained in:
Saurab-Shrestha 2024-02-17 10:23:57 +05:45
parent d849ee76f4
commit c8b39c898c
6 changed files with 103 additions and 75 deletions

5
.env
View File

@ -4,7 +4,7 @@ ENVIRONMENT=dev
DB_HOST=localhost DB_HOST=localhost
DB_USER=postgres DB_USER=postgres
DB_PORT=5432 DB_PORT=5432
DB_PASSWORD=admin DB_PASSWORD=quick
DB_NAME=QuickGpt DB_NAME=QuickGpt
SUPER_ADMIN_EMAIL=superadmin@email.com SUPER_ADMIN_EMAIL=superadmin@email.com
@ -21,4 +21,5 @@ SMTP_SENDER_EMAIL=shresthasaurab030@outlook.com
SMTP_USERNAME=shresthasaurab030 SMTP_USERNAME=shresthasaurab030
SMTP_PASSWORD=huurxwxeorxjorzw SMTP_PASSWORD=huurxwxeorxjorzw
LDAP_SERVER=ldap://192.168.101.111 LDAP_SERVER=ldap://192.168.101.111
LDAP_ENABLE=False

View File

@ -1,64 +1,87 @@
from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
from docx import Document
import os import os
import fitz import fitz
import requests import requests
from docx import Document
from fastapi import HTTPException, status, File, UploadFile, APIRouter, Request, Security, Depends
from sqlalchemy.orm import Session
from private_gpt.users import models
from private_gpt.users.api import deps
from private_gpt.users.constants.role import Role
from private_gpt.components.ocr_components.TextExtraction import ImageToTable from private_gpt.components.ocr_components.TextExtraction import ImageToTable
from private_gpt.components.ocr_components.table_ocr import GetOCRText from private_gpt.components.ocr_components.table_ocr import GetOCRText
from private_gpt.server.ingest.ingest_router import ingest_file from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads" from private_gpt.constants import OCR_UPLOAD
pdf_router = APIRouter(prefix="/pdf", tags=["ocr"])
pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
@pdf_router.post("/pdf_ocr") @pdf_router.post("/pdf_ocr")
async def get_pdf_ocr(request: Request, file: UploadFile = File(...)): async def get_pdf_ocr(
UPLOAD_DIR = upload_dir request: Request,
db: Session = Depends(deps.get_db),
file: UploadFile = File(...),
current_user: models.User = Security(
deps.get_current_user,
scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"]],
)
):
UPLOAD_DIR = OCR_UPLOAD
try: try:
contents = await file.read() contents = await file.read()
except Exception: except Exception as e:
return {"message": "There was an error uploading the file"} raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
# Save the uploaded file to the dir detail=f"There was an error reading the file: {e}"
)
file_path = os.path.join(UPLOAD_DIR, file.filename) file_path = os.path.join(UPLOAD_DIR, file.filename)
with open(file_path, "wb") as f: with open(file_path, "wb") as f:
f.write(contents) f.write(contents)
doc = Document() doc = Document()
ocr = GetOCRText() ocr = GetOCRText()
img_tab = ImageToTable() img_tab = ImageToTable()
pdf_doc = fitz.open(file_path) pdf_doc = fitz.open(file_path)
for page_index in range(len(pdf_doc)): # iterate over pdf pages # try:
page = pdf_doc[page_index] # get the page for page_index in range(len(pdf_doc)):
page = pdf_doc[page_index]
image_list = page.get_images() image_list = page.get_images()
for image_index, img in enumerate(image_list, start=1): # enumerate the image list if not image_list:
continue
for image_index, img in enumerate(image_list, start=1):
xref = img[0] xref = img[0]
pix = fitz.Pixmap(pdf_doc, xref) pix = fitz.Pixmap(pdf_doc, xref)
if pix.n - pix.alpha > 3: if pix.n - pix.alpha > 3:
pix = fitz.Pixmap(fitz.csRGB, pix)("RGB", [pix.width, pix.height], pix.samples) pix = fitz.Pixmap(fitz.csRGB, pix)(
image_path = "page_%s-image_%s.png" % (page_index, image_index) "RGB", [pix.width, pix.height], pix.samples)
pix.save("page_%s-image_%s.png" % (page_index, image_index)) # save the image as png
pixs = None image_path = f"page_{page_index}-image_{image_index}.png"
extracted_text = ocr.extract_text(image_file=True, file_path=image_path) pix.save(image_path)
extracted_text = ocr.extract_text(
image_file=True, file_path=image_path)
doc.add_paragraph(extracted_text) doc.add_paragraph(extracted_text)
table_data = img_tab.table_to_csv(image_path) table_data = img_tab.table_to_csv(image_path)
print(table_data)
doc.add_paragraph(table_data) doc.add_paragraph(table_data)
# remove image file os.remove(image_path)
save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx") save_path = os.path.join(
UPLOAD_DIR, f"{file.filename.replace('.pdf', '_ocr.docx')}")
doc.save(save_path) doc.save(save_path)
with open(save_path,'rb') as f: with open(save_path, 'rb') as f:
file_content = f.read() file_content = f.read()
starfleet_data = { if not file_content:
"filename": f.name, raise HTTPException(
"file_content": file_content, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
"file_type": "multipart/form-data" detail="Empty file content after processing OCR"
} )
requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"}) ingested_documents = await common_ingest_logic(
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf") request=request,db=db, ocr_file=save_path, current_user=current_user
)
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)

View File

@ -3,4 +3,6 @@ from pathlib import Path
PROJECT_ROOT_PATH: Path = Path(__file__).parents[1] PROJECT_ROOT_PATH: Path = Path(__file__).parents[1]
script_dir = os.path.dirname(os.path.abspath(__file__)) script_dir = os.path.dirname(os.path.abspath(__file__))
UPLOAD_DIR = os.path.join(script_dir, "static") UPLOAD_DIR = os.path.join(script_dir, "static") # Actual upload path for uploaded file
OCR_UPLOAD = os.path.join(script_dir, 'uploads') # temporary upload path for scanned pdf file

View File

@ -223,48 +223,48 @@ def ingest_file(
) )
async def common_ingest_logic(
request: Request,
def ingest_pdf_file( db: Session,
request: Request, ocr_file,
db: Session = Depends(deps.get_db), current_user,
file: UploadFile = File(...), ):
) -> IngestResponse:
"""Ingests and processes a file, storing its chunks to be used as context."""
service = request.state.injector.get(IngestService) service = request.state.injector.get(IngestService)
try: try:
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename) with open(ocr_file, 'rb') as file:
if file_ingested: file_name = Path(ocr_file).name
raise HTTPException( upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
status_code=status.HTTP_409_CONFLICT,
detail="File already exists. Choose a different file.",
)
if file.filename is None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No file name provided",
)
try: file_ingested = crud.documents.get_by_filename(
docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id) db, file_name=file_name)
if file_ingested:
raise HTTPException(
status_code=409,
detail="File already exists. Choose a different file.",
)
if file_name is None:
raise HTTPException(
status_code=400,
detail="No file name provided",
)
docs_in = schemas.DocumentCreate(
filename=file_name, uploaded_by=current_user.id)
crud.documents.create(db=db, obj_in=docs_in) crud.documents.create(db=db, obj_in=docs_in)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Unable to upload file.",
)
upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
with open(upload_path, "wb") as f: with open(upload_path, "wb") as f:
f.write(file.file.read()) f.write(file.read())
with open(upload_path, "rb") as f: # Ingest binary data
ingested_documents = service.ingest_bin_data(file.filename, f) file.seek(0) # Move the file pointer back to the beginning
logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.") ingested_documents = service.ingest_bin_data(file_name, file)
logger.info(
f"{file_name} is uploaded by the {current_user.fullname}.")
return ingested_documents
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
except HTTPException: except HTTPException:
raise raise
@ -272,6 +272,6 @@ def ingest_pdf_file(
logger.error(f"There was an error uploading the file(s): {str(e)}") logger.error(f"There was an error uploading the file(s): {str(e)}")
print("ERROR: ", e) print("ERROR: ", e)
raise HTTPException( raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=500,
detail="Internal Server Error: Unable to ingest file.", detail="Internal Server Error: Unable to ingest file.",
) )

View File

@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
from private_gpt.users.utils import send_registration_email, Ldap from private_gpt.users.utils import send_registration_email, Ldap
LDAP_SERVER = settings.LDAP_SERVER LDAP_SERVER = settings.LDAP_SERVER
LDAP_ENABLE = False LDAP_ENABLE = settings.LDAP_ENABLE
router = APIRouter(prefix="/auth", tags=["auth"]) router = APIRouter(prefix="/auth", tags=["auth"])

View File

@ -8,7 +8,7 @@ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://{username}:{password}@{host}:{p
port='5432', port='5432',
db_name='QuickGpt', db_name='QuickGpt',
username='postgres', username='postgres',
password="admin", password="quick",
) )
class Settings(BaseSettings): class Settings(BaseSettings):
@ -38,6 +38,8 @@ class Settings(BaseSettings):
SMTP_PASSWORD: str SMTP_PASSWORD: str
LDAP_SERVER: str LDAP_SERVER: str
LDAP_ENABLE: str
@property @property
def SQLALCHEMY_DATABASE_URI(self) -> str: def SQLALCHEMY_DATABASE_URI(self) -> str:
return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}" return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"