Mirror of https://github.com/imartinez/privateGPT.git (synced 2025-06-30 09:12:17 +00:00)

Update with verify function to check the maker request for documents

This commit is contained in:
parent 2bcc7f589f
commit f7de7c3b54
.env (4 changes)
@@ -4,8 +4,8 @@ ENVIRONMENT=dev
 DB_HOST=localhost
 DB_USER=postgres
 DB_PORT=5432
-DB_PASSWORD=admin
-DB_NAME=GPT
+DB_PASSWORD=quick
+DB_NAME=QuickGpt
 
 SUPER_ADMIN_EMAIL=superadmin@email.com
 SUPER_ADMIN_PASSWORD=supersecretpassword
Alembic migration: revision b7b896502e8e (replaces 39c817e4fc4a, "update cascade")
@@ -1,8 +1,8 @@
-"""udpate cascade
+"""update
 
-Revision ID: 39c817e4fc4a
-Revises: cb320e7880fc
-Create Date: 2024-03-17 11:08:22.426368
+Revision ID: b7b896502e8e
+Revises:
+Create Date: 2024-03-17 15:07:10.795935
 
 """
 from typing import Sequence, Union
@@ -12,16 +12,16 @@ import sqlalchemy as sa
 
 
 # revision identifiers, used by Alembic.
-revision: str = '39c817e4fc4a'
-down_revision: Union[str, None] = 'cb320e7880fc'
+revision: str = 'b7b896502e8e'
+down_revision: Union[str, None] = None
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 
 
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.drop_constraint('document_department_association_department_id_fkey', 'document_department_association', type_='foreignkey')
+    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.create_foreign_key(None, 'document_department_association', 'document', ['document_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     op.create_foreign_key(None, 'document_department_association', 'departments', ['department_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     # op.create_unique_constraint('unique_user_role', 'user_roles', ['user_id', 'role_id', 'company_id'])
@@ -33,6 +33,6 @@ def downgrade() -> None:
     # op.drop_constraint('unique_user_role', 'user_roles', type_='unique')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
-    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     op.create_foreign_key('document_department_association_document_id_fkey', 'document_department_association', 'document', ['document_id'], ['id'])
+    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     # ### end Alembic commands ###
Alembic migration: revision cb320e7880fc (file deleted)
@@ -1,157 +0,0 @@
-"""update
-
-Revision ID: cb320e7880fc
-Revises:
-Create Date: 2024-03-17 10:09:40.034197
-
-"""
-from typing import Sequence, Union
-
-from alembic import op
-import sqlalchemy as sa
-from sqlalchemy.dialects import postgresql
-
-# revision identifiers, used by Alembic.
-revision: str = 'cb320e7880fc'
-down_revision: Union[str, None] = None
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-
-
-def upgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.create_table('companies',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
-    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)
-    op.create_table('document_type',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('type', sa.String(length=225), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('type')
-    )
-    op.create_index(op.f('ix_document_type_id'), 'document_type', ['id'], unique=False)
-    op.create_table('roles',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(length=100), nullable=True),
-    sa.Column('description', sa.Text(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
-    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)
-    op.create_table('departments',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('total_users', sa.Integer(), nullable=True),
-    sa.Column('total_documents', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_departments_id'), 'departments', ['id'], unique=False)
-    op.create_index(op.f('ix_departments_name'), 'departments', ['name'], unique=True)
-    op.create_table('subscriptions',
-    sa.Column('sub_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('start_date', sa.DateTime(), nullable=True),
-    sa.Column('end_date', sa.DateTime(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('sub_id')
-    )
-    op.create_index(op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False)
-    op.create_table('users',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('email', sa.String(length=225), nullable=False),
-    sa.Column('hashed_password', sa.String(), nullable=False),
-    sa.Column('username', sa.String(length=225), nullable=False),
-    sa.Column('is_active', sa.Boolean(), nullable=True),
-    sa.Column('last_login', sa.DateTime(), nullable=True),
-    sa.Column('created_at', sa.DateTime(), nullable=True),
-    sa.Column('updated_at', sa.DateTime(), nullable=True),
-    sa.Column('password_created', sa.DateTime(), nullable=True),
-    sa.Column('checker', sa.Boolean(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('department_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('email'),
-    sa.UniqueConstraint('username'),
-    sa.UniqueConstraint('username', name='unique_username_no_spacing')
-    )
-    op.create_table('audit',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('timestamp', sa.DateTime(), nullable=False),
-    sa.Column('user_id', sa.Integer(), nullable=True),
-    sa.Column('model', sa.String(length=100), nullable=False),
-    sa.Column('action', sa.String(length=50), nullable=False),
-    sa.Column('details', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
-    sa.Column('ip_address', sa.String(length=45), nullable=True),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='SET NULL'),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_audit_id'), 'audit', ['id'], unique=False)
-    op.create_table('document',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('filename', sa.String(length=225), nullable=False),
-    sa.Column('uploaded_by', sa.Integer(), nullable=False),
-    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
-    sa.Column('is_enabled', sa.Boolean(), nullable=True),
-    sa.Column('verified', sa.Boolean(), nullable=True),
-    sa.Column('doc_type_id', sa.Integer(), nullable=True),
-    sa.Column('action_type', sa.Enum('INSERT', 'UPDATE', 'DELETE', name='makercheckeractiontype'), nullable=False),
-    sa.Column('status', sa.Enum('PENDING', 'APPROVED', 'REJECTED', name='makercheckerstatus'), nullable=False),
-    sa.Column('verified_at', sa.DateTime(), nullable=True),
-    sa.Column('verified_by', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['doc_type_id'], ['document_type.id'], ),
-    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
-    sa.ForeignKeyConstraint(['verified_by'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('filename')
-    )
-    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
-    op.create_table('user_roles',
-    sa.Column('user_id', sa.Integer(), nullable=False),
-    sa.Column('role_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['role_id'], ['roles.id'], ),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
-    sa.UniqueConstraint('user_id', 'role_id', 'company_id', name='unique_user_role')
-    )
-    op.create_table('document_department_association',
-    sa.Column('department_id', sa.Integer(), nullable=True),
-    sa.Column('document_id', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.ForeignKeyConstraint(['document_id'], ['document.id'], )
-    )
-    # ### end Alembic commands ###
-
-
-def downgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_table('document_department_association')
-    op.drop_table('user_roles')
-    op.drop_index(op.f('ix_document_id'), table_name='document')
-    op.drop_table('document')
-    op.drop_index(op.f('ix_audit_id'), table_name='audit')
-    op.drop_table('audit')
-    op.drop_table('users')
-    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
-    op.drop_table('subscriptions')
-    op.drop_index(op.f('ix_departments_name'), table_name='departments')
-    op.drop_index(op.f('ix_departments_id'), table_name='departments')
-    op.drop_table('departments')
-    op.drop_index(op.f('ix_roles_name'), table_name='roles')
-    op.drop_index(op.f('ix_roles_id'), table_name='roles')
-    op.drop_table('roles')
-    op.drop_index(op.f('ix_document_type_id'), table_name='document_type')
-    op.drop_table('document_type')
-    op.drop_index(op.f('ix_companies_name'), table_name='companies')
-    op.drop_index(op.f('ix_companies_id'), table_name='companies')
-    op.drop_table('companies')
-    # ### end Alembic commands ###
ffmpy-0.3.2-py3-none-any.whl (BIN, new file)
Binary file not shown.

File diff suppressed because one or more lines are too long.
models/.gitignore (vendored, 2 changes)
@@ -1,2 +0,0 @@
-*
-!.gitignore
poetry.lock (generated, 1714 changes)
File diff suppressed because it is too large.
private_gpt/components/ocr_components/table_ocr_api.py
@@ -13,7 +13,7 @@ from private_gpt.users import models, schemas
 from private_gpt.users.constants.role import Role
 from private_gpt.components.ocr_components.table_ocr import GetOCRText
 from private_gpt.components.ocr_components.TextExtraction import ImageToTable
-from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
+from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse, ingest
 
 pdf_router = APIRouter(prefix="/v1", tags=["ocr"])
 
 
@@ -80,9 +80,7 @@ async def process_pdf_ocr(
 ):
     UPLOAD_DIR = OCR_UPLOAD
     try:
-        print("The file name is: ", file.filename)
         pdf_path = await save_uploaded_file(file, UPLOAD_DIR)
-        print("The file path: ", pdf_path)
         ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
         ingested_documents = await common_ingest_logic(
             request=request, db=db, ocr_file=ocr_doc_path, current_user=current_user, original_file=None, log_audit=log_audit, departments=departments
@@ -96,6 +94,43 @@ async def process_pdf_ocr(
             detail=f"There was an error processing OCR: {e}"
         )
 
+
+async def process_ocr(
+    request: Request,
+    pdf_path: str,
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_documents = await ingest(request=request, file_path=ocr_doc_path)
+        return ingested_documents
+    except Exception as e:
+        print(traceback.print_exc())
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
+
+
+async def process_both_ocr(
+    request: Request,
+    pdf_path: str
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_ocr_documents = await ingest(request=request, file_path=ocr_doc_path)  # ingest ocr
+        ingested_documents = await ingest(request=request, file_path=pdf_path)  # ingest pdf
+        return ingested_documents
+
+    except Exception as e:
+        print(traceback.print_exc())
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
+
+
 async def process_both(
     request: Request,
     db: Session,
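Worth noting: process_both_ocr discards ingested_ocr_documents and returns only the result of the second ingest call (the original PDF). If a caller ever needed both result sets, a variant along these lines would keep them (a sketch against the ingest helper and IngestResponse shown in this commit; the function name and the separate ocr_doc_path parameter are hypothetical):

# Hypothetical variant, not part of this commit.
async def process_both_keep_all(request, pdf_path, ocr_doc_path):
    ocr_resp = await ingest(request=request, file_path=ocr_doc_path)  # OCR text
    pdf_resp = await ingest(request=request, file_path=pdf_path)      # original PDF
    # IngestResponse.data holds the list of ingested documents in both responses.
    return ocr_resp.data + pdf_resp.data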
private_gpt/server/ingest/ingest_router.py
@@ -44,13 +44,13 @@ class IngestResponse(BaseModel):
 class DeleteFilename(BaseModel):
     filename: str
 
-@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
-def ingest(request: Request, file: UploadFile) -> IngestResponse:
-    """Ingests and processes a file.
+# @ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
+# def ingest(request: Request, file: UploadFile) -> IngestResponse:
+#     """Ingests and processes a file.
 
-    Deprecated. Use ingest/file instead.
-    """
-    return ingest_file(request, file)
+#     Deprecated. Use ingest/file instead.
+#     """
+#     return ingest_file(request, file)
 
 
 @ingest_router.post("/ingest/file1", tags=["Ingestion"])
@@ -205,7 +205,8 @@ async def create_documents(
         filename=file_name,
         uploaded_by=current_user.id,
         action_type=MakerCheckerActionType.INSERT,
-        status=MakerCheckerStatus.PENDING
+        status=MakerCheckerStatus.PENDING,
+        doc_type_id=departments.doc_type_id,
     )
     print("DOCUMENT CREATE: ", docs_in)
     document = crud.documents.create(db=db, obj_in=docs_in)
@@ -298,35 +299,26 @@ async def common_ingest_logic(
 
 
 async def ingest(request: Request, file_path: str) -> IngestResponse:
-    """Ingests and processes a file, storing its chunks to be used as context.
-
-    The context obtained from files is later used in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-
-    Most common document
-    formats are supported, but you may be prompted to install an extra dependency to
-    manage a specific file type.
-
-    A file can generate different Documents (for example a PDF generates one Document
-    per page). All Documents IDs are returned in the response, together with the
-    extracted Metadata (which is later used to improve context retrieval). Those IDs
-    can be used to filter the context used to create responses in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-    """
+    """Ingests and processes a file, storing its chunks to be used as context."""
     service = request.state.injector.get(IngestService)
 
     try:
        with open(file_path, 'rb') as file:
            file_name = Path(file_path).name
            upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
 
-           with open(upload_path, "wb") as f:
-               f.write(file.file.read())
-           with open(upload_path, "rb") as f:
-               ingested_documents = service.ingest_bin_data(file.filename, f)
+           with upload_path.open('wb') as f:
+               f.write(file.read())
+
+           with upload_path.open('rb') as f:
+               ingested_documents = await service.ingest_bin_data(file_name, f)
 
     except Exception as e:
         return {"message": f"There was an error uploading the file(s)\n {e}"}
 
     finally:
-        file.file.close()
+        upload_path.unlink(missing_ok=True)
 
     return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
ingest service (class IngestService)
@@ -72,7 +72,7 @@ class IngestService:
         logger.debug("Ingesting text data with file_name=%s", file_name)
         return self._ingest_data(file_name, text)
 
-    def ingest_bin_data(
+    async def ingest_bin_data(
         self, file_name: str, raw_file_data: BinaryIO
     ) -> list[IngestedDoc]:
         logger.debug("Ingesting binary data with file_name=%s", file_name)
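Because ingest_bin_data is now a coroutine, every call site has to await it, as the rewritten ingest above does; calling it without await would only produce a coroutine object and ingest nothing. A minimal sketch of the changed call shape (the service argument stands in for the injected IngestService, and the code runs inside the app's event loop):

import io

async def example(service):
    payload = io.BytesIO(b"raw bytes of an uploaded file")
    # Before this commit: docs = service.ingest_bin_data("example.pdf", payload)
    docs = await service.ingest_bin_data("example.pdf", payload)
    return docs  # list[IngestedDoc]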
documents router (APIRouter prefix='/documents')
@@ -15,6 +15,7 @@ from private_gpt.users.constants.role import Role
 from private_gpt.users import crud, models, schemas
 from private_gpt.server.ingest.ingest_router import create_documents, ingest
 from private_gpt.users.models.document import MakerCheckerActionType, MakerCheckerStatus
+from private_gpt.components.ocr_components.table_ocr_api import process_both_ocr, process_ocr
 
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix='/documents', tags=['Documents'])
@@ -262,7 +263,7 @@ async def upload_documents(
     )
 
 
-@router.post('/verify', response_model=schemas.Document)
+@router.post('/verify')
 async def verify_documents(
     request: Request,
     checker_in: schemas.DocumentUpdate = Depends(),
@@ -284,8 +285,7 @@ async def verify_documents(
                 detail="Document not found!",
             )
         unchecked_path = Path(f"{UNCHECKED_DIR}/{document.filename}")
-        print(checker_in.status)
-        print(MakerCheckerStatus.APPROVED.value)
+
         if checker_in.status == MakerCheckerStatus.APPROVED.value:
             checker = schemas.DocumentCheckerUpdate(
                 status=MakerCheckerStatus.APPROVED,
@@ -296,9 +296,9 @@ async def verify_documents(
             crud.documents.update(db=db, db_obj= document, obj_in=checker)
 
             if document.doc_type_id == 2:
-                return await ingest(request, unchecked_path)
+                return await process_ocr(request, unchecked_path)
             elif document.doc_type_id == 3:
-                return await ingest(request, unchecked_path)
+                return await process_both_ocr(request, unchecked_path)
             else:
                 return await ingest(request, unchecked_path)
 
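This is the commit's headline change: when a checker approves a maker's document request, verify_documents now fans out by doc_type_id, where type 2 runs the OCR pipeline only, type 3 ingests both the OCR output and the original PDF, and anything else falls through to plain ingest. The same routing could be written as a dispatch table (an illustrative sketch, not the commit's code; the constant names are made up and the id meanings are inferred from the branches above):

OCR_ONLY = 2      # assumed meaning of the magic ids in the diff
OCR_AND_PDF = 3

async def route_approved(request, doc_type_id, unchecked_path):
    handler = {OCR_ONLY: process_ocr, OCR_AND_PDF: process_both_ocr}.get(doc_type_id)
    if handler is not None:
        return await handler(request, unchecked_path)
    return await ingest(request=request, file_path=unchecked_path)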
document schemas (DocumentMakerChecker)
@@ -45,7 +45,7 @@ class Document(BaseModel):
 class DocumentMakerChecker(DocumentCreate):
     action_type: str
     status: str
-
+    doc_type_id: int
 
 class DocumentMakerCreate(DocumentMakerChecker):
     pass
pyproject.toml
@@ -50,6 +50,8 @@ boto3 = {version ="^1.34.51", optional = true}
 
 # Optional UI
 gradio = {version ="^4.19.2", optional = true}
+aiofiles = "^23.2.1"
+timm = "^0.9.16"
 
 [tool.poetry.extras]
 ui = ["gradio"]
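aiofiles and timm enter as direct dependencies; presumably aiofiles is meant for non-blocking file writes in the async ingest paths and timm supplies vision backbones for the OCR components, though the commit pins them without comment. For reference, the aiofiles equivalent of the blocking file writes in ingest above would look like this (sketch only; the path and payload are placeholders):

import asyncio
import aiofiles

async def save_bytes(path: str, payload: bytes) -> None:
    # aiofiles.open is the async counterpart of the built-in open()
    async with aiofiles.open(path, "wb") as f:
        await f.write(payload)

asyncio.run(save_bytes("/tmp/example.docx", b"demo bytes"))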