Solved the ingestion issue for OCR'd PDFs

This commit is contained in:
Saurab-Shrestha 2024-02-17 10:23:57 +05:45
parent d849ee76f4
commit c8b39c898c
6 changed files with 103 additions and 75 deletions

5
.env
View File

@ -4,7 +4,7 @@ ENVIRONMENT=dev
DB_HOST=localhost
DB_USER=postgres
DB_PORT=5432
DB_PASSWORD=admin
DB_PASSWORD=quick
DB_NAME=QuickGpt
SUPER_ADMIN_EMAIL=superadmin@email.com
@ -21,4 +21,5 @@ SMTP_SENDER_EMAIL=shresthasaurab030@outlook.com
SMTP_USERNAME=shresthasaurab030
SMTP_PASSWORD=huurxwxeorxjorzw
LDAP_SERVER=ldap://192.168.101.111
LDAP_SERVER=ldap://192.168.101.111
LDAP_ENABLE=False

View File

@ -1,64 +1,87 @@
from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
from docx import Document
import os
import fitz
import requests
from docx import Document
from fastapi import HTTPException, status, File, UploadFile, APIRouter, Request, Security, Depends
from sqlalchemy.orm import Session
from private_gpt.users import models
from private_gpt.users.api import deps
from private_gpt.users.constants.role import Role
from private_gpt.components.ocr_components.TextExtraction import ImageToTable
from private_gpt.components.ocr_components.table_ocr import GetOCRText
from private_gpt.server.ingest.ingest_router import ingest_file
upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
from private_gpt.constants import OCR_UPLOAD
pdf_router = APIRouter(prefix="/pdf", tags=["ocr"])
pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
@pdf_router.post("/pdf_ocr")
async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
UPLOAD_DIR = upload_dir
async def get_pdf_ocr(
request: Request,
db: Session = Depends(deps.get_db),
file: UploadFile = File(...),
current_user: models.User = Security(
deps.get_current_user,
scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"]],
)
):
UPLOAD_DIR = OCR_UPLOAD
try:
contents = await file.read()
except Exception:
return {"message": "There was an error uploading the file"}
# Save the uploaded file to the dir
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"There was an error reading the file: {e}"
)
file_path = os.path.join(UPLOAD_DIR, file.filename)
with open(file_path, "wb") as f:
f.write(contents)
doc = Document()
ocr = GetOCRText()
ocr = GetOCRText()
img_tab = ImageToTable()
pdf_doc = fitz.open(file_path)
for page_index in range(len(pdf_doc)): # iterate over pdf pages
page = pdf_doc[page_index] # get the page
# try:
for page_index in range(len(pdf_doc)):
page = pdf_doc[page_index]
image_list = page.get_images()
for image_index, img in enumerate(image_list, start=1): # enumerate the image list
if not image_list:
continue
for image_index, img in enumerate(image_list, start=1):
xref = img[0]
pix = fitz.Pixmap(pdf_doc, xref)
if pix.n - pix.alpha > 3:
pix = fitz.Pixmap(fitz.csRGB, pix)("RGB", [pix.width, pix.height], pix.samples)
image_path = "page_%s-image_%s.png" % (page_index, image_index)
pix.save("page_%s-image_%s.png" % (page_index, image_index)) # save the image as png
pixs = None
extracted_text = ocr.extract_text(image_file=True, file_path=image_path)
pix = fitz.Pixmap(fitz.csRGB, pix)(
"RGB", [pix.width, pix.height], pix.samples)
image_path = f"page_{page_index}-image_{image_index}.png"
pix.save(image_path)
extracted_text = ocr.extract_text(
image_file=True, file_path=image_path)
doc.add_paragraph(extracted_text)
table_data = img_tab.table_to_csv(image_path)
print(table_data)
doc.add_paragraph(table_data)
# remove image file
os.remove(image_path)
save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
save_path = os.path.join(
UPLOAD_DIR, f"{file.filename.replace('.pdf', '_ocr.docx')}")
doc.save(save_path)
with open(save_path,'rb') as f:
with open(save_path, 'rb') as f:
file_content = f.read()
starfleet_data = {
"filename": f.name,
"file_content": file_content,
"file_type": "multipart/form-data"
}
requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
if not file_content:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Empty file content after processing OCR"
)
ingested_documents = await common_ingest_logic(
request=request,db=db, ocr_file=save_path, current_user=current_user
)
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)

View File

@ -3,4 +3,6 @@ from pathlib import Path
PROJECT_ROOT_PATH: Path = Path(__file__).parents[1]
script_dir = os.path.dirname(os.path.abspath(__file__))
UPLOAD_DIR = os.path.join(script_dir, "static")
UPLOAD_DIR = os.path.join(script_dir, "static") # Actual upload path for uploaded file
OCR_UPLOAD = os.path.join(script_dir, 'uploads') # temporary upload path for scanned pdf file

View File

@ -223,48 +223,48 @@ def ingest_file(
)
def ingest_pdf_file(
request: Request,
db: Session = Depends(deps.get_db),
file: UploadFile = File(...),
) -> IngestResponse:
"""Ingests and processes a file, storing its chunks to be used as context."""
async def common_ingest_logic(
request: Request,
db: Session,
ocr_file,
current_user,
):
service = request.state.injector.get(IngestService)
try:
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
if file_ingested:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail="File already exists. Choose a different file.",
)
if file.filename is None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No file name provided",
)
with open(ocr_file, 'rb') as file:
file_name = Path(ocr_file).name
upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
try:
docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
file_ingested = crud.documents.get_by_filename(
db, file_name=file_name)
if file_ingested:
raise HTTPException(
status_code=409,
detail="File already exists. Choose a different file.",
)
if file_name is None:
raise HTTPException(
status_code=400,
detail="No file name provided",
)
docs_in = schemas.DocumentCreate(
filename=file_name, uploaded_by=current_user.id)
crud.documents.create(db=db, obj_in=docs_in)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Unable to upload file.",
)
upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
with open(upload_path, "wb") as f:
f.write(file.file.read())
with open(upload_path, "wb") as f:
f.write(file.read())
with open(upload_path, "rb") as f:
ingested_documents = service.ingest_bin_data(file.filename, f)
logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
# Ingest binary data
file.seek(0) # Move the file pointer back to the beginning
ingested_documents = service.ingest_bin_data(file_name, file)
logger.info(
f"{file_name} is uploaded by the {current_user.fullname}.")
return ingested_documents
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
except HTTPException:
raise
@ -272,6 +272,6 @@ def ingest_pdf_file(
logger.error(f"There was an error uploading the file(s): {str(e)}")
print("ERROR: ", e)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
status_code=500,
detail="Internal Server Error: Unable to ingest file.",
)

View File

@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
from private_gpt.users.utils import send_registration_email, Ldap
LDAP_SERVER = settings.LDAP_SERVER
LDAP_ENABLE = False
LDAP_ENABLE = settings.LDAP_ENABLE
router = APIRouter(prefix="/auth", tags=["auth"])

View File

@ -8,7 +8,7 @@ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://{username}:{password}@{host}:{p
port='5432',
db_name='QuickGpt',
username='postgres',
password="admin",
password="quick",
)
class Settings(BaseSettings):
@ -38,6 +38,8 @@ class Settings(BaseSettings):
SMTP_PASSWORD: str
LDAP_SERVER: str
LDAP_ENABLE: str
@property
def SQLALCHEMY_DATABASE_URI(self) -> str:
return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"