mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-06-29 16:58:00 +00:00
Solved the ingestion issue for OCR'd PDFs
This commit is contained in:
parent
d849ee76f4
commit
c8b39c898c
5
.env
5
.env
@ -4,7 +4,7 @@ ENVIRONMENT=dev
|
||||
DB_HOST=localhost
|
||||
DB_USER=postgres
|
||||
DB_PORT=5432
|
||||
DB_PASSWORD=admin
|
||||
DB_PASSWORD=quick
|
||||
DB_NAME=QuickGpt
|
||||
|
||||
SUPER_ADMIN_EMAIL=superadmin@email.com
|
||||
@ -21,4 +21,5 @@ SMTP_SENDER_EMAIL=shresthasaurab030@outlook.com
|
||||
SMTP_USERNAME=shresthasaurab030
|
||||
SMTP_PASSWORD=huurxwxeorxjorzw
|
||||
|
||||
LDAP_SERVER=ldap://192.168.101.111
|
||||
LDAP_SERVER=ldap://192.168.101.111
|
||||
LDAP_ENABLE=False
|
@ -1,64 +1,87 @@
|
||||
from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
from docx import Document
|
||||
import os
|
||||
import fitz
|
||||
import requests
|
||||
from docx import Document
|
||||
|
||||
from fastapi import HTTPException, status, File, UploadFile, APIRouter, Request, Security, Depends
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from private_gpt.users import models
|
||||
from private_gpt.users.api import deps
|
||||
from private_gpt.users.constants.role import Role
|
||||
from private_gpt.components.ocr_components.TextExtraction import ImageToTable
|
||||
from private_gpt.components.ocr_components.table_ocr import GetOCRText
|
||||
from private_gpt.server.ingest.ingest_router import ingest_file
|
||||
upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
|
||||
from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
|
||||
from private_gpt.constants import OCR_UPLOAD
|
||||
|
||||
|
||||
pdf_router = APIRouter(prefix="/pdf", tags=["ocr"])
|
||||
|
||||
pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
|
||||
|
||||
@pdf_router.post("/pdf_ocr")
|
||||
async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
|
||||
UPLOAD_DIR = upload_dir
|
||||
async def get_pdf_ocr(
|
||||
request: Request,
|
||||
db: Session = Depends(deps.get_db),
|
||||
file: UploadFile = File(...),
|
||||
current_user: models.User = Security(
|
||||
deps.get_current_user,
|
||||
scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"]],
|
||||
)
|
||||
):
|
||||
UPLOAD_DIR = OCR_UPLOAD
|
||||
try:
|
||||
contents = await file.read()
|
||||
except Exception:
|
||||
return {"message": "There was an error uploading the file"}
|
||||
|
||||
# Save the uploaded file to the dir
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"There was an error reading the file: {e}"
|
||||
)
|
||||
|
||||
file_path = os.path.join(UPLOAD_DIR, file.filename)
|
||||
with open(file_path, "wb") as f:
|
||||
f.write(contents)
|
||||
|
||||
doc = Document()
|
||||
ocr = GetOCRText()
|
||||
ocr = GetOCRText()
|
||||
img_tab = ImageToTable()
|
||||
pdf_doc = fitz.open(file_path)
|
||||
for page_index in range(len(pdf_doc)): # iterate over pdf pages
|
||||
page = pdf_doc[page_index] # get the page
|
||||
# try:
|
||||
for page_index in range(len(pdf_doc)):
|
||||
page = pdf_doc[page_index]
|
||||
image_list = page.get_images()
|
||||
|
||||
for image_index, img in enumerate(image_list, start=1): # enumerate the image list
|
||||
if not image_list:
|
||||
continue
|
||||
|
||||
for image_index, img in enumerate(image_list, start=1):
|
||||
xref = img[0]
|
||||
pix = fitz.Pixmap(pdf_doc, xref)
|
||||
|
||||
if pix.n - pix.alpha > 3:
|
||||
pix = fitz.Pixmap(fitz.csRGB, pix)("RGB", [pix.width, pix.height], pix.samples)
|
||||
image_path = "page_%s-image_%s.png" % (page_index, image_index)
|
||||
pix.save("page_%s-image_%s.png" % (page_index, image_index)) # save the image as png
|
||||
pixs = None
|
||||
extracted_text = ocr.extract_text(image_file=True, file_path=image_path)
|
||||
pix = fitz.Pixmap(fitz.csRGB, pix)(
|
||||
"RGB", [pix.width, pix.height], pix.samples)
|
||||
|
||||
image_path = f"page_{page_index}-image_{image_index}.png"
|
||||
pix.save(image_path)
|
||||
extracted_text = ocr.extract_text(
|
||||
image_file=True, file_path=image_path)
|
||||
doc.add_paragraph(extracted_text)
|
||||
table_data = img_tab.table_to_csv(image_path)
|
||||
print(table_data)
|
||||
doc.add_paragraph(table_data)
|
||||
# remove image file
|
||||
os.remove(image_path)
|
||||
|
||||
save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
|
||||
save_path = os.path.join(
|
||||
UPLOAD_DIR, f"{file.filename.replace('.pdf', '_ocr.docx')}")
|
||||
doc.save(save_path)
|
||||
|
||||
with open(save_path,'rb') as f:
|
||||
with open(save_path, 'rb') as f:
|
||||
file_content = f.read()
|
||||
starfleet_data = {
|
||||
"filename": f.name,
|
||||
"file_content": file_content,
|
||||
"file_type": "multipart/form-data"
|
||||
}
|
||||
requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
|
||||
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
|
||||
|
||||
if not file_content:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Empty file content after processing OCR"
|
||||
)
|
||||
ingested_documents = await common_ingest_logic(
|
||||
request=request,db=db, ocr_file=save_path, current_user=current_user
|
||||
)
|
||||
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
||||
|
@ -3,4 +3,6 @@ from pathlib import Path
|
||||
|
||||
PROJECT_ROOT_PATH: Path = Path(__file__).parents[1]
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
UPLOAD_DIR = os.path.join(script_dir, "static")
|
||||
UPLOAD_DIR = os.path.join(script_dir, "static") # Actual upload path for uploaded file
|
||||
|
||||
OCR_UPLOAD = os.path.join(script_dir, 'uploads') # temporary upload path for scanned pdf file
|
||||
|
@ -223,48 +223,48 @@ def ingest_file(
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
def ingest_pdf_file(
|
||||
request: Request,
|
||||
db: Session = Depends(deps.get_db),
|
||||
file: UploadFile = File(...),
|
||||
) -> IngestResponse:
|
||||
"""Ingests and processes a file, storing its chunks to be used as context."""
|
||||
async def common_ingest_logic(
|
||||
request: Request,
|
||||
db: Session,
|
||||
ocr_file,
|
||||
current_user,
|
||||
):
|
||||
service = request.state.injector.get(IngestService)
|
||||
|
||||
try:
|
||||
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
||||
if file_ingested:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_409_CONFLICT,
|
||||
detail="File already exists. Choose a different file.",
|
||||
)
|
||||
|
||||
if file.filename is None:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="No file name provided",
|
||||
)
|
||||
with open(ocr_file, 'rb') as file:
|
||||
file_name = Path(ocr_file).name
|
||||
upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
|
||||
|
||||
try:
|
||||
docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
|
||||
file_ingested = crud.documents.get_by_filename(
|
||||
db, file_name=file_name)
|
||||
if file_ingested:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail="File already exists. Choose a different file.",
|
||||
)
|
||||
|
||||
if file_name is None:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="No file name provided",
|
||||
)
|
||||
|
||||
docs_in = schemas.DocumentCreate(
|
||||
filename=file_name, uploaded_by=current_user.id)
|
||||
crud.documents.create(db=db, obj_in=docs_in)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Unable to upload file.",
|
||||
)
|
||||
upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
|
||||
|
||||
with open(upload_path, "wb") as f:
|
||||
f.write(file.file.read())
|
||||
with open(upload_path, "wb") as f:
|
||||
f.write(file.read())
|
||||
|
||||
with open(upload_path, "rb") as f:
|
||||
ingested_documents = service.ingest_bin_data(file.filename, f)
|
||||
logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
|
||||
# Ingest binary data
|
||||
file.seek(0) # Move the file pointer back to the beginning
|
||||
ingested_documents = service.ingest_bin_data(file_name, file)
|
||||
|
||||
logger.info(
|
||||
f"{file_name} is uploaded by the {current_user.fullname}.")
|
||||
|
||||
return ingested_documents
|
||||
|
||||
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
||||
except HTTPException:
|
||||
raise
|
||||
|
||||
@ -272,6 +272,6 @@ def ingest_pdf_file(
|
||||
logger.error(f"There was an error uploading the file(s): {str(e)}")
|
||||
print("ERROR: ", e)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
status_code=500,
|
||||
detail="Internal Server Error: Unable to ingest file.",
|
||||
)
|
||||
|
@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
|
||||
from private_gpt.users.utils import send_registration_email, Ldap
|
||||
|
||||
LDAP_SERVER = settings.LDAP_SERVER
|
||||
LDAP_ENABLE = False
|
||||
LDAP_ENABLE = settings.LDAP_ENABLE
|
||||
|
||||
router = APIRouter(prefix="/auth", tags=["auth"])
|
||||
|
||||
|
@ -8,7 +8,7 @@ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://{username}:{password}@{host}:{p
|
||||
port='5432',
|
||||
db_name='QuickGpt',
|
||||
username='postgres',
|
||||
password="admin",
|
||||
password="quick",
|
||||
)
|
||||
|
||||
class Settings(BaseSettings):
|
||||
@ -38,6 +38,8 @@ class Settings(BaseSettings):
|
||||
SMTP_PASSWORD: str
|
||||
|
||||
LDAP_SERVER: str
|
||||
LDAP_ENABLE: str
|
||||
|
||||
@property
|
||||
def SQLALCHEMY_DATABASE_URI(self) -> str:
|
||||
return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"
|
||||
|
Loading…
Reference in New Issue
Block a user