mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-07-07 12:28:51 +00:00
Added routes for pdf ocr
This commit is contained in:
parent
91ebce47d4
commit
d849ee76f4
@ -1,42 +0,0 @@
|
||||
"""Create document
|
||||
|
||||
Revision ID: a5839618494c
|
||||
Revises:
|
||||
Create Date: 2024-02-11 12:35:13.347853
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = 'a5839618494c'
|
||||
down_revision: Union[str, None] = None
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the ``document`` table and its primary-key index.

    ``document`` stores one row per uploaded file; ``uploaded_by`` references
    ``users.id`` and ``filename`` is globally unique.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('document',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('filename', sa.String(length=225), nullable=False),
    sa.Column('uploaded_by', sa.Integer(), nullable=False),
    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
    sa.PrimaryKeyConstraint('id'),
    # Unique filename means re-uploading the same name must be rejected upstream.
    sa.UniqueConstraint('filename')
    )
    # Non-unique index on the PK column (Alembic's autogenerate default).
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
    # op.create_unique_constraint('unique_user_role', 'user_roles', ['user_id', 'role_id', 'company_id'])
    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Reverse :func:`upgrade`: drop the ``document`` index and table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # op.drop_constraint('unique_user_role', 'user_roles', type_='unique')
    # Drop the index before the table it belongs to.
    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')
    # ### end Alembic commands ###
|
100
alembic/versions/dcf96cb11a85_create_models.py
Normal file
100
alembic/versions/dcf96cb11a85_create_models.py
Normal file
@ -0,0 +1,100 @@
|
||||
"""Create models
|
||||
|
||||
Revision ID: dcf96cb11a85
|
||||
Revises:
|
||||
Create Date: 2024-02-14 16:30:51.094285
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = 'dcf96cb11a85'
|
||||
down_revision: Union[str, None] = None
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the initial schema: companies, roles, subscriptions, users,
    document, and the user_roles association table.

    Tables are created parents-first so foreign keys resolve:
    companies -> (subscriptions, users) -> (document, user_roles).
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('companies',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('name', sa.String(), nullable=True),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
    # Company names are enforced unique via this index rather than a constraint.
    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)
    op.create_table('roles',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('name', sa.String(length=100), nullable=True),
    sa.Column('description', sa.Text(), nullable=True),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
    # Role names are NOT unique (unique=False) — duplicates are representable.
    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)
    op.create_table('subscriptions',
    sa.Column('sub_id', sa.Integer(), nullable=False),
    sa.Column('company_id', sa.Integer(), nullable=True),
    sa.Column('start_date', sa.DateTime(), nullable=True),
    sa.Column('end_date', sa.DateTime(), nullable=True),
    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
    sa.PrimaryKeyConstraint('sub_id')
    )
    op.create_index(op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False)
    op.create_table('users',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('email', sa.String(length=225), nullable=False),
    sa.Column('hashed_password', sa.String(), nullable=False),
    sa.Column('fullname', sa.String(length=225), nullable=False),
    sa.Column('is_active', sa.Boolean(), nullable=True),
    sa.Column('last_login', sa.DateTime(), nullable=True),
    sa.Column('created_at', sa.DateTime(), nullable=True),
    sa.Column('updated_at', sa.DateTime(), nullable=True),
    sa.Column('company_id', sa.Integer(), nullable=True),
    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
    sa.PrimaryKeyConstraint('id'),
    sa.UniqueConstraint('email'),
    # NOTE(review): 'fullname' is constrained unique TWICE (unnamed below and
    # named 'unique_username_no_spacing') — likely redundant; confirm intent
    # before a follow-up migration removes one of them.
    sa.UniqueConstraint('fullname'),
    sa.UniqueConstraint('fullname', name='unique_username_no_spacing')
    )
    op.create_table('document',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('filename', sa.String(length=225), nullable=False),
    sa.Column('uploaded_by', sa.Integer(), nullable=False),
    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
    sa.PrimaryKeyConstraint('id'),
    sa.UniqueConstraint('filename')
    )
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
    # Association table: one row per (user, role, company) assignment; the
    # composite PK already implies uniqueness, the named constraint duplicates it
    # so application code can reference 'unique_user_role' by name.
    op.create_table('user_roles',
    sa.Column('user_id', sa.Integer(), nullable=False),
    sa.Column('role_id', sa.Integer(), nullable=False),
    sa.Column('company_id', sa.Integer(), nullable=False),
    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
    sa.ForeignKeyConstraint(['role_id'], ['roles.id'], ),
    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
    sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
    sa.UniqueConstraint('user_id', 'role_id', 'company_id', name='unique_user_role')
    )
    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Reverse :func:`upgrade`: drop all tables children-first so foreign
    keys never dangle (user_roles/document -> users -> subscriptions ->
    roles -> companies), dropping each table's indexes before the table.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('user_roles')
    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')
    op.drop_table('users')
    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
    op.drop_table('subscriptions')
    op.drop_index(op.f('ix_roles_name'), table_name='roles')
    op.drop_index(op.f('ix_roles_id'), table_name='roles')
    op.drop_table('roles')
    op.drop_index(op.f('ix_companies_name'), table_name='companies')
    op.drop_index(op.f('ix_companies_id'), table_name='companies')
    op.drop_table('companies')
    # ### end Alembic commands ###
|
1806
poetry.lock
generated
1806
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,19 +1,19 @@
|
||||
from fastapi import FastAPI, File, UploadFile, Response, APIRouter
|
||||
from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
from docx import Document
|
||||
import os
|
||||
import fitz
|
||||
|
||||
import requests
|
||||
from private_gpt.components.ocr_components.TextExtraction import ImageToTable
|
||||
from private_gpt.components.ocr_components.table_ocr import GetOCRText
|
||||
|
||||
upload_dir = rf"F:\LLM\privateGPT\private_gpt\uploads"
|
||||
from private_gpt.server.ingest.ingest_router import ingest_file
|
||||
upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
|
||||
|
||||
pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
|
||||
|
||||
@pdf_router.post("/pdf_ocr")
|
||||
async def get_pdf_ocr(file: UploadFile = File(...)):
|
||||
async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
|
||||
UPLOAD_DIR = upload_dir
|
||||
try:
|
||||
contents = await file.read()
|
||||
@ -49,7 +49,16 @@ async def get_pdf_ocr(file: UploadFile = File(...)):
|
||||
doc.add_paragraph(table_data)
|
||||
# remove image file
|
||||
|
||||
doc.save(os.path.join(UPLOAD_DIR, "ocr_result.docx"))
|
||||
|
||||
save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
|
||||
doc.save(save_path)
|
||||
|
||||
with open(save_path,'rb') as f:
|
||||
file_content = f.read()
|
||||
starfleet_data = {
|
||||
"filename": f.name,
|
||||
"file_content": file_content,
|
||||
"file_type": "multipart/form-data"
|
||||
}
|
||||
requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
|
||||
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
|
||||
|
||||
|
@ -42,8 +42,10 @@ def create_app(root_injector: Injector) -> FastAPI:
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_credentials=True,
|
||||
allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80",
|
||||
"http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88", "http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173"],
|
||||
allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80", "http://127.0.0.1",
|
||||
"http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88",
|
||||
"http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173", "http://127.0.0.1/", "http://localhost/",
|
||||
"http://localhost:80", "http://192.168.1.131:80/", "http://192.168.1.131"],
|
||||
allow_methods=["DELETE", "GET", "POST", "PUT", "OPTIONS", "PATCH"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
@ -178,6 +178,60 @@ def ingest_file(
|
||||
)) -> IngestResponse:
|
||||
"""Ingests and processes a file, storing its chunks to be used as context."""
|
||||
service = request.state.injector.get(IngestService)
|
||||
print("-------------------------------------->",file)
|
||||
try:
|
||||
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
||||
if file_ingested:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_409_CONFLICT,
|
||||
detail="File already exists. Choose a different file.",
|
||||
)
|
||||
|
||||
if file.filename is None:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="No file name provided",
|
||||
)
|
||||
|
||||
try:
|
||||
docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
|
||||
crud.documents.create(db=db, obj_in=docs_in)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Unable to upload file.",
|
||||
)
|
||||
upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
|
||||
|
||||
with open(upload_path, "wb") as f:
|
||||
f.write(file.file.read())
|
||||
|
||||
with open(upload_path, "rb") as f:
|
||||
ingested_documents = service.ingest_bin_data(file.filename, f)
|
||||
logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
|
||||
|
||||
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
||||
except HTTPException:
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"There was an error uploading the file(s): {str(e)}")
|
||||
print("ERROR: ", e)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Internal Server Error: Unable to ingest file.",
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
def ingest_pdf_file(
|
||||
request: Request,
|
||||
db: Session = Depends(deps.get_db),
|
||||
file: UploadFile = File(...),
|
||||
) -> IngestResponse:
|
||||
"""Ingests and processes a file, storing its chunks to be used as context."""
|
||||
service = request.state.injector.get(IngestService)
|
||||
|
||||
try:
|
||||
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
||||
|
@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
|
||||
from private_gpt.users.utils import send_registration_email, Ldap
|
||||
|
||||
LDAP_SERVER = settings.LDAP_SERVER
|
||||
LDAP_ENABLE = True
|
||||
LDAP_ENABLE = False
|
||||
|
||||
router = APIRouter(prefix="/auth", tags=["auth"])
|
||||
|
||||
@ -104,17 +104,17 @@ def login_access_token(
|
||||
"""
|
||||
OAuth2 compatible token login, get an access token for future requests
|
||||
"""
|
||||
if LDAP_ENABLE:
|
||||
existing_user = crud.user.get_by_email(db, email=form_data.username)
|
||||
# if LDAP_ENABLE:
|
||||
# existing_user = crud.user.get_by_email(db, email=form_data.username)
|
||||
|
||||
if existing_user:
|
||||
if existing_user.user_role.role.name == "SUPER_ADMIN":
|
||||
pass
|
||||
else:
|
||||
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
else:
|
||||
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password)
|
||||
# if existing_user:
|
||||
# if existing_user.user_role.role.name == "SUPER_ADMIN":
|
||||
# pass
|
||||
# else:
|
||||
# ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
# else:
|
||||
# ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
# ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password)
|
||||
|
||||
user = crud.user.authenticate(
|
||||
db, email=form_data.username, password=form_data.password
|
||||
@ -212,7 +212,7 @@ def register(
|
||||
detail="The user with this email already exists!",
|
||||
)
|
||||
random_password = security.generate_random_password()
|
||||
|
||||
# random_password = password
|
||||
try:
|
||||
if company_id:
|
||||
# Registering user with a specific company
|
||||
|
@ -24,7 +24,10 @@ python-jose = "^3.3.0"
|
||||
psycopg2-binary = "^2.9.9"
|
||||
passlib = "^1.7.4"
|
||||
docx2txt = "^0.8"
|
||||
torch = "2.1.2"
|
||||
gradio = "^4.18.0"
|
||||
ldap3 = "^2.9.1"
|
||||
uvicorn = "^0.27.1"
|
||||
python-doctr = {extras = ["torch"], version = "^0.7.0"}
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^22"
|
||||
|
Loading…
Reference in New Issue
Block a user