Added routes for PDF OCR

This commit is contained in:
Saurab-Shrestha 2024-02-15 17:39:07 +05:45
parent 91ebce47d4
commit d849ee76f4
9 changed files with 1519 additions and 541 deletions

View File

@ -1,42 +0,0 @@
"""Create document
Revision ID: a5839618494c
Revises:
Create Date: 2024-02-11 12:35:13.347853
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Revision identifiers, used by Alembic.
# `revision` repeats the "Revision ID" given in the module docstring.
revision: str = 'a5839618494c'
# No parent revision is set, matching the empty "Revises:" field in the docstring.
down_revision: Union[str, None] = None
# No branch labels or cross-revision dependencies are declared for this script.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the ``document`` table and a non-unique index on its ``id``."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'document',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('filename', sa.String(length=225), nullable=False),
        sa.Column('uploaded_by', sa.Integer(), nullable=False),
        sa.Column('uploaded_at', sa.DateTime(), nullable=False),
        # ``uploaded_by`` must reference an existing ``users.id``.
        sa.ForeignKeyConstraint(['uploaded_by'], ['users.id']),
        sa.PrimaryKeyConstraint('id'),
        # Filenames are unique across the whole table, not per uploader.
        sa.UniqueConstraint('filename'),
    )
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
    # Disabled draft, kept for reference:
    # op.create_unique_constraint('unique_user_role', 'user_roles', ['user_id', 'role_id', 'company_id'])
    # ### end Alembic commands ###
def downgrade() -> None:
    """Undo ``upgrade()``: drop the ``document`` index, then the table itself."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Disabled counterpart of the disabled constraint creation in upgrade():
    # op.drop_constraint('unique_user_role', 'user_roles', type_='unique')
    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')
    # ### end Alembic commands ###

View File

@ -0,0 +1,100 @@
"""Create models
Revision ID: dcf96cb11a85
Revises:
Create Date: 2024-02-14 16:30:51.094285
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Revision identifiers, used by Alembic.
# `revision` repeats the "Revision ID" given in the module docstring.
revision: str = 'dcf96cb11a85'
# No parent revision is set, matching the empty "Revises:" field in the docstring.
down_revision: Union[str, None] = None
# No branch labels or cross-revision dependencies are declared for this script.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the full initial schema.

    Tables are created so that every foreign key targets a table that
    already exists: ``companies`` and ``roles`` first, then
    ``subscriptions`` and ``users`` (both reference ``companies``), then
    ``document`` (references ``users``), and finally the ``user_roles``
    association table (references ``users``, ``roles`` and ``companies``).
    """
    # ### commands auto generated by Alembic - please adjust! ###

    # --- companies ---------------------------------------------------------
    op.create_table(
        'companies',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('name', sa.String(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
    # Company names are enforced unique through this index.
    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)

    # --- roles -------------------------------------------------------------
    op.create_table(
        'roles',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('name', sa.String(length=100), nullable=True),
        sa.Column('description', sa.Text(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
    # Unlike companies.name, the role name index is NOT unique.
    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)

    # --- subscriptions -----------------------------------------------------
    op.create_table(
        'subscriptions',
        sa.Column('sub_id', sa.Integer(), nullable=False),
        sa.Column('company_id', sa.Integer(), nullable=True),
        sa.Column('start_date', sa.DateTime(), nullable=True),
        sa.Column('end_date', sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(['company_id'], ['companies.id']),
        sa.PrimaryKeyConstraint('sub_id'),
    )
    op.create_index(
        op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False
    )

    # --- users -------------------------------------------------------------
    op.create_table(
        'users',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('email', sa.String(length=225), nullable=False),
        sa.Column('hashed_password', sa.String(), nullable=False),
        sa.Column('fullname', sa.String(length=225), nullable=False),
        sa.Column('is_active', sa.Boolean(), nullable=True),
        sa.Column('last_login', sa.DateTime(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=True),
        sa.Column('updated_at', sa.DateTime(), nullable=True),
        sa.Column('company_id', sa.Integer(), nullable=True),
        sa.ForeignKeyConstraint(['company_id'], ['companies.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('email'),
        # NOTE(review): the next two constraints both cover only `fullname`
        # (one unnamed, one named). They look redundant - confirm against
        # the model before consolidating.
        sa.UniqueConstraint('fullname'),
        sa.UniqueConstraint('fullname', name='unique_username_no_spacing'),
    )

    # --- document ----------------------------------------------------------
    op.create_table(
        'document',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('filename', sa.String(length=225), nullable=False),
        sa.Column('uploaded_by', sa.Integer(), nullable=False),
        sa.Column('uploaded_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['uploaded_by'], ['users.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('filename'),
    )
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)

    # --- user_roles (association table) ------------------------------------
    op.create_table(
        'user_roles',
        sa.Column('user_id', sa.Integer(), nullable=False),
        sa.Column('role_id', sa.Integer(), nullable=False),
        sa.Column('company_id', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(['company_id'], ['companies.id']),
        sa.ForeignKeyConstraint(['role_id'], ['roles.id']),
        sa.ForeignKeyConstraint(['user_id'], ['users.id']),
        sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
        # NOTE(review): this unique constraint spans exactly the same
        # columns as the composite primary key declared just above.
        sa.UniqueConstraint(
            'user_id', 'role_id', 'company_id', name='unique_user_role'
        ),
    )
    # ### end Alembic commands ###
def downgrade() -> None:
    """Drop every table and index created by ``upgrade()``.

    Tables go in the exact reverse of their creation order, so each
    referencing table is removed before the table it points at. Indexes
    are dropped explicitly just before their owning table.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('user_roles')

    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')

    # `users` has no explicitly created index to drop.
    op.drop_table('users')

    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
    op.drop_table('subscriptions')

    op.drop_index(op.f('ix_roles_name'), table_name='roles')
    op.drop_index(op.f('ix_roles_id'), table_name='roles')
    op.drop_table('roles')

    op.drop_index(op.f('ix_companies_name'), table_name='companies')
    op.drop_index(op.f('ix_companies_id'), table_name='companies')
    op.drop_table('companies')
    # ### end Alembic commands ###

1806
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,19 +1,19 @@
from fastapi import FastAPI, File, UploadFile, Response, APIRouter from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from pydantic import BaseModel from pydantic import BaseModel
from docx import Document from docx import Document
import os import os
import fitz import fitz
import requests
from private_gpt.components.ocr_components.TextExtraction import ImageToTable from private_gpt.components.ocr_components.TextExtraction import ImageToTable
from private_gpt.components.ocr_components.table_ocr import GetOCRText from private_gpt.components.ocr_components.table_ocr import GetOCRText
from private_gpt.server.ingest.ingest_router import ingest_file
upload_dir = rf"F:\LLM\privateGPT\private_gpt\uploads" upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
pdf_router = APIRouter(prefix="/pdf", tags=["auth"]) pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
@pdf_router.post("/pdf_ocr") @pdf_router.post("/pdf_ocr")
async def get_pdf_ocr(file: UploadFile = File(...)): async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
UPLOAD_DIR = upload_dir UPLOAD_DIR = upload_dir
try: try:
contents = await file.read() contents = await file.read()
@ -49,7 +49,16 @@ async def get_pdf_ocr(file: UploadFile = File(...)):
doc.add_paragraph(table_data) doc.add_paragraph(table_data)
# remove image file # remove image file
doc.save(os.path.join(UPLOAD_DIR, "ocr_result.docx")) save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
doc.save(save_path)
with open(save_path,'rb') as f:
file_content = f.read()
starfleet_data = {
"filename": f.name,
"file_content": file_content,
"file_type": "multipart/form-data"
}
requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf") return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")

View File

@ -42,8 +42,10 @@ def create_app(root_injector: Injector) -> FastAPI:
app.add_middleware( app.add_middleware(
CORSMiddleware, CORSMiddleware,
allow_credentials=True, allow_credentials=True,
allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80", allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80", "http://127.0.0.1",
"http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88", "http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173"], "http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88",
"http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173", "http://127.0.0.1/", "http://localhost/",
"http://localhost:80", "http://192.168.1.131:80/", "http://192.168.1.131"],
allow_methods=["DELETE", "GET", "POST", "PUT", "OPTIONS", "PATCH"], allow_methods=["DELETE", "GET", "POST", "PUT", "OPTIONS", "PATCH"],
allow_headers=["*"], allow_headers=["*"],
) )

View File

@ -178,6 +178,60 @@ def ingest_file(
)) -> IngestResponse: )) -> IngestResponse:
"""Ingests and processes a file, storing its chunks to be used as context.""" """Ingests and processes a file, storing its chunks to be used as context."""
service = request.state.injector.get(IngestService) service = request.state.injector.get(IngestService)
print("-------------------------------------->",file)
try:
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
if file_ingested:
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail="File already exists. Choose a different file.",
)
if file.filename is None:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="No file name provided",
)
try:
docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
crud.documents.create(db=db, obj_in=docs_in)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Unable to upload file.",
)
upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
with open(upload_path, "wb") as f:
f.write(file.file.read())
with open(upload_path, "rb") as f:
ingested_documents = service.ingest_bin_data(file.filename, f)
logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
except HTTPException:
raise
except Exception as e:
logger.error(f"There was an error uploading the file(s): {str(e)}")
print("ERROR: ", e)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Internal Server Error: Unable to ingest file.",
)
def ingest_pdf_file(
request: Request,
db: Session = Depends(deps.get_db),
file: UploadFile = File(...),
) -> IngestResponse:
"""Ingests and processes a file, storing its chunks to be used as context."""
service = request.state.injector.get(IngestService)
try: try:
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename) file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)

View File

@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
from private_gpt.users.utils import send_registration_email, Ldap from private_gpt.users.utils import send_registration_email, Ldap
LDAP_SERVER = settings.LDAP_SERVER LDAP_SERVER = settings.LDAP_SERVER
LDAP_ENABLE = True LDAP_ENABLE = False
router = APIRouter(prefix="/auth", tags=["auth"]) router = APIRouter(prefix="/auth", tags=["auth"])
@ -104,17 +104,17 @@ def login_access_token(
""" """
OAuth2 compatible token login, get an access token for future requests OAuth2 compatible token login, get an access token for future requests
""" """
if LDAP_ENABLE: # if LDAP_ENABLE:
existing_user = crud.user.get_by_email(db, email=form_data.username) # existing_user = crud.user.get_by_email(db, email=form_data.username)
if existing_user: # if existing_user:
if existing_user.user_role.role.name == "SUPER_ADMIN": # if existing_user.user_role.role.name == "SUPER_ADMIN":
pass # pass
else: # else:
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password) # ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
else: # else:
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password) # ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password) # ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password)
user = crud.user.authenticate( user = crud.user.authenticate(
db, email=form_data.username, password=form_data.password db, email=form_data.username, password=form_data.password
@ -212,7 +212,7 @@ def register(
detail="The user with this email already exists!", detail="The user with this email already exists!",
) )
random_password = security.generate_random_password() random_password = security.generate_random_password()
# random_password = password
try: try:
if company_id: if company_id:
# Registering user with a specific company # Registering user with a specific company

View File

@ -24,7 +24,10 @@ python-jose = "^3.3.0"
psycopg2-binary = "^2.9.9" psycopg2-binary = "^2.9.9"
passlib = "^1.7.4" passlib = "^1.7.4"
docx2txt = "^0.8" docx2txt = "^0.8"
torch = "2.1.2" gradio = "^4.18.0"
ldap3 = "^2.9.1"
uvicorn = "^0.27.1"
python-doctr = {extras = ["torch"], version = "^0.7.0"}
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
black = "^22" black = "^22"

BIN
req.txt Normal file

Binary file not shown.