Mirror of https://github.com/imartinez/privateGPT.git, synced 2025-06-30 01:02:02 +00:00

Commit c8b39c898c (parent d849ee76f4): Solved the ingestion issue for ocred pdf

Diffstat (as shown): .env | 5
@ -4,7 +4,7 @@ ENVIRONMENT=dev
 DB_HOST=localhost
 DB_USER=postgres
 DB_PORT=5432
-DB_PASSWORD=admin
+DB_PASSWORD=quick
 DB_NAME=QuickGpt
 
 SUPER_ADMIN_EMAIL=superadmin@email.com
@ -21,4 +21,5 @@ SMTP_SENDER_EMAIL=shresthasaurab030@outlook.com
 SMTP_USERNAME=shresthasaurab030
 SMTP_PASSWORD=huurxwxeorxjorzw
 
 LDAP_SERVER=ldap://192.168.101.111
+LDAP_ENABLE=False
@ -1,64 +1,87 @@
-from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
-from fastapi.responses import FileResponse
-from pydantic import BaseModel
-from docx import Document
 import os
 import fitz
 import requests
+from docx import Document
 
+from fastapi import HTTPException, status, File, UploadFile, APIRouter, Request, Security, Depends
+from sqlalchemy.orm import Session
+
+from private_gpt.users import models
+from private_gpt.users.api import deps
+from private_gpt.users.constants.role import Role
 from private_gpt.components.ocr_components.TextExtraction import ImageToTable
 from private_gpt.components.ocr_components.table_ocr import GetOCRText
-from private_gpt.server.ingest.ingest_router import ingest_file
-upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
+from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
+from private_gpt.constants import OCR_UPLOAD
 
 
-pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
+pdf_router = APIRouter(prefix="/pdf", tags=["ocr"])
 
 
 @pdf_router.post("/pdf_ocr")
-async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
-    UPLOAD_DIR = upload_dir
+async def get_pdf_ocr(
+    request: Request,
+    db: Session = Depends(deps.get_db),
+    file: UploadFile = File(...),
+    current_user: models.User = Security(
+        deps.get_current_user,
+        scopes=[Role.ADMIN["name"], Role.SUPER_ADMIN["name"]],
+    )
+):
+    UPLOAD_DIR = OCR_UPLOAD
     try:
         contents = await file.read()
-    except Exception:
-        return {"message": "There was an error uploading the file"}
-
-    # Save the uploaded file to the dir
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error reading the file: {e}"
+        )
 
     file_path = os.path.join(UPLOAD_DIR, file.filename)
     with open(file_path, "wb") as f:
         f.write(contents)
 
     doc = Document()
     ocr = GetOCRText()
     img_tab = ImageToTable()
     pdf_doc = fitz.open(file_path)
-    for page_index in range(len(pdf_doc)): # iterate over pdf pages
-        page = pdf_doc[page_index] # get the page
+    # try:
+    for page_index in range(len(pdf_doc)):
+        page = pdf_doc[page_index]
         image_list = page.get_images()
 
-        for image_index, img in enumerate(image_list, start=1): # enumerate the image list
+        if not image_list:
+            continue
+
+        for image_index, img in enumerate(image_list, start=1):
             xref = img[0]
             pix = fitz.Pixmap(pdf_doc, xref)
 
             if pix.n - pix.alpha > 3:
-                pix = fitz.Pixmap(fitz.csRGB, pix)("RGB", [pix.width, pix.height], pix.samples)
-            image_path = "page_%s-image_%s.png" % (page_index, image_index)
-            pix.save("page_%s-image_%s.png" % (page_index, image_index)) # save the image as png
-            pixs = None
-            extracted_text = ocr.extract_text(image_file=True, file_path=image_path)
+                pix = fitz.Pixmap(fitz.csRGB, pix)(
+                    "RGB", [pix.width, pix.height], pix.samples)
+
+            image_path = f"page_{page_index}-image_{image_index}.png"
+            pix.save(image_path)
+            extracted_text = ocr.extract_text(
+                image_file=True, file_path=image_path)
             doc.add_paragraph(extracted_text)
             table_data = img_tab.table_to_csv(image_path)
-            print(table_data)
             doc.add_paragraph(table_data)
-            # remove image file
+            os.remove(image_path)
 
-    save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
+    save_path = os.path.join(
+        UPLOAD_DIR, f"{file.filename.replace('.pdf', '_ocr.docx')}")
     doc.save(save_path)
 
-    with open(save_path,'rb') as f:
+    with open(save_path, 'rb') as f:
         file_content = f.read()
-    starfleet_data = {
-        "filename": f.name,
-        "file_content": file_content,
-        "file_type": "multipart/form-data"
-    }
-    requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
-    return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
+        if not file_content:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail="Empty file content after processing OCR"
+            )
+
+    ingested_documents = await common_ingest_logic(
+        request=request, db=db, ocr_file=save_path, current_user=current_user
+    )
+    return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
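For reference, a minimal client-side sketch of how the reworked endpoint could be exercised. The base URL, port, filename, and token value below are placeholders for illustration, not part of this commit; the route itself requires an ADMIN or SUPER_ADMIN bearer token via the Security dependency.

import requests

# Hypothetical smoke test for the new /pdf/pdf_ocr route (all values assumed).
BASE_URL = "http://localhost:8001"      # assumed local server address
TOKEN = "<admin-or-super-admin-jwt>"    # token carrying ADMIN/SUPER_ADMIN scope

with open("scanned.pdf", "rb") as fh:
    response = requests.post(
        f"{BASE_URL}/pdf/pdf_ocr",
        files={"file": ("scanned.pdf", fh, "application/pdf")},
        headers={"Authorization": f"Bearer {TOKEN}"},
    )

response.raise_for_status()
# On success the body should follow the IngestResponse shape:
# {"object": "list", "model": "private-gpt", "data": [...]}
print(response.json())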
@ -3,4 +3,6 @@ from pathlib import Path
 
 PROJECT_ROOT_PATH: Path = Path(__file__).parents[1]
 script_dir = os.path.dirname(os.path.abspath(__file__))
-UPLOAD_DIR = os.path.join(script_dir, "static")
+UPLOAD_DIR = os.path.join(script_dir, "static")  # Actual upload path for uploaded file
+
+OCR_UPLOAD = os.path.join(script_dir, 'uploads')  # temporary upload path for scanned pdf file
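One assumption worth noting: the OCR route writes the uploaded PDF and the generated .docx into OCR_UPLOAD, so that directory has to exist at runtime. A small guard such as the following (not part of the commit) would create it at startup.

import os

from private_gpt.constants import OCR_UPLOAD

# Hypothetical safeguard: create the temporary OCR upload directory if it is missing.
os.makedirs(OCR_UPLOAD, exist_ok=True)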
@ -223,48 +223,48 @@ def ingest_file(
     )
 
 
-def ingest_pdf_file(
-    request: Request,
-    db: Session = Depends(deps.get_db),
-    file: UploadFile = File(...),
-) -> IngestResponse:
-    """Ingests and processes a file, storing its chunks to be used as context."""
+async def common_ingest_logic(
+    request: Request,
+    db: Session,
+    ocr_file,
+    current_user,
+):
     service = request.state.injector.get(IngestService)
 
     try:
-        file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
-        if file_ingested:
-            raise HTTPException(
-                status_code=status.HTTP_409_CONFLICT,
-                detail="File already exists. Choose a different file.",
-            )
-
-        if file.filename is None:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail="No file name provided",
-            )
-
-        try:
-            docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
-            crud.documents.create(db=db, obj_in=docs_in)
-        except Exception as e:
-            raise HTTPException(
-                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail="Unable to upload file.",
-            )
-        upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
-
-        with open(upload_path, "wb") as f:
-            f.write(file.file.read())
-
-        with open(upload_path, "rb") as f:
-            ingested_documents = service.ingest_bin_data(file.filename, f)
-        logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
-
-        return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
+        with open(ocr_file, 'rb') as file:
+            file_name = Path(ocr_file).name
+            upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
+
+            file_ingested = crud.documents.get_by_filename(
+                db, file_name=file_name)
+            if file_ingested:
+                raise HTTPException(
+                    status_code=409,
+                    detail="File already exists. Choose a different file.",
+                )
+
+            if file_name is None:
+                raise HTTPException(
+                    status_code=400,
+                    detail="No file name provided",
+                )
+
+            docs_in = schemas.DocumentCreate(
+                filename=file_name, uploaded_by=current_user.id)
+            crud.documents.create(db=db, obj_in=docs_in)
+
+            with open(upload_path, "wb") as f:
+                f.write(file.read())
+
+            # Ingest binary data
+            file.seek(0)  # Move the file pointer back to the beginning
+            ingested_documents = service.ingest_bin_data(file_name, file)
+
+        logger.info(
+            f"{file_name} is uploaded by the {current_user.fullname}.")
+
+        return ingested_documents
 
     except HTTPException:
         raise
 
@ -272,6 +272,6 @@ def ingest_pdf_file(
         logger.error(f"There was an error uploading the file(s): {str(e)}")
         print("ERROR: ", e)
         raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            status_code=500,
             detail="Internal Server Error: Unable to ingest file.",
         )
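The file.seek(0) before ingest_bin_data is needed because the preceding f.write(file.read()) consumes the handle. A tiny standalone illustration of the rewind behaviour (illustration only, not code from the commit):

# Reading a binary handle moves its pointer to EOF, so it must be rewound
# before a second consumer reads it.
with open("example.bin", "wb") as f:
    f.write(b"scanned page bytes")

with open("example.bin", "rb") as file:
    first_read = file.read()   # pointer now at end of file
    assert file.read() == b""  # a second read returns nothing
    file.seek(0)               # rewind, as common_ingest_logic does
    assert file.read() == first_read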
@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
 from private_gpt.users.utils import send_registration_email, Ldap
 
 LDAP_SERVER = settings.LDAP_SERVER
-LDAP_ENABLE = False
+LDAP_ENABLE = settings.LDAP_ENABLE
 
 router = APIRouter(prefix="/auth", tags=["auth"])
 
@ -8,7 +8,7 @@ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://{username}:{password}@{host}:{p
     port='5432',
     db_name='QuickGpt',
     username='postgres',
-    password="admin",
+    password="quick",
 )
 
 class Settings(BaseSettings):
@ -38,6 +38,8 @@ class Settings(BaseSettings):
     SMTP_PASSWORD: str
 
     LDAP_SERVER: str
+    LDAP_ENABLE: str
+
     @property
     def SQLALCHEMY_DATABASE_URI(self) -> str:
         return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASSWORD}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"
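Because LDAP_ENABLE is declared as str, settings.LDAP_ENABLE carries the literal text from .env (here "False"), and a non-empty string is truthy in Python. A minimal sketch, assuming the pydantic v1-style BaseSettings already used by this Settings class (in pydantic v2 it lives in pydantic_settings), of typing the flag as a real boolean instead; LdapSettings is a hypothetical name for illustration, not the project's class:

from pydantic import BaseSettings

# Hedged sketch, not the project's actual Settings class: typing the flag as
# bool lets pydantic coerce "False"/"false"/"0" from the environment into False.
class LdapSettings(BaseSettings):
    LDAP_SERVER: str = "ldap://192.168.101.111"
    LDAP_ENABLE: bool = False

settings = LdapSettings()
if settings.LDAP_ENABLE:
    print("LDAP authentication path enabled")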