Add verify function to check maker requests for documents

Saurab-Shrestha 2024-03-17 17:58:42 +05:45
parent 2bcc7f589f
commit f7de7c3b54
15 changed files with 954 additions and 1048 deletions

.env (4 changes)

@@ -4,8 +4,8 @@ ENVIRONMENT=dev
 DB_HOST=localhost
 DB_USER=postgres
 DB_PORT=5432
-DB_PASSWORD=admin
-DB_NAME=GPT
+DB_PASSWORD=quick
+DB_NAME=QuickGpt
 SUPER_ADMIN_EMAIL=superadmin@email.com
 SUPER_ADMIN_PASSWORD=supersecretpassword
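For reference, these variables feed the application's database connection. A minimal sketch of how the renamed database might be assembled into a SQLAlchemy URL, assuming the app reads plain environment variables (the helper below is illustrative, not part of this commit):

import os

def database_url() -> str:
    # Hypothetical helper: builds a PostgreSQL URL from the .env values above.
    user = os.environ.get("DB_USER", "postgres")
    password = os.environ.get("DB_PASSWORD", "quick")
    host = os.environ.get("DB_HOST", "localhost")
    port = os.environ.get("DB_PORT", "5432")
    name = os.environ.get("DB_NAME", "QuickGpt")
    return f"postgresql://{user}:{password}@{host}:{port}/{name}"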


@@ -1,8 +1,8 @@
-"""udpate cascade
-Revision ID: 39c817e4fc4a
-Revises: cb320e7880fc
-Create Date: 2024-03-17 11:08:22.426368
+"""update
+Revision ID: b7b896502e8e
+Revises:
+Create Date: 2024-03-17 15:07:10.795935
 """
 from typing import Sequence, Union
@@ -12,16 +12,16 @@ import sqlalchemy as sa
 # revision identifiers, used by Alembic.
-revision: str = '39c817e4fc4a'
-down_revision: Union[str, None] = 'cb320e7880fc'
+revision: str = 'b7b896502e8e'
+down_revision: Union[str, None] = None
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.drop_constraint('document_department_association_department_id_fkey', 'document_department_association', type_='foreignkey')
+    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.create_foreign_key(None, 'document_department_association', 'document', ['document_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     op.create_foreign_key(None, 'document_department_association', 'departments', ['department_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     # op.create_unique_constraint('unique_user_role', 'user_roles', ['user_id', 'role_id', 'company_id'])
@@ -33,6 +33,6 @@ def downgrade() -> None:
     # op.drop_constraint('unique_user_role', 'user_roles', type_='unique')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
-    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     op.create_foreign_key('document_department_association_document_id_fkey', 'document_department_association', 'document', ['document_id'], ['id'])
+    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     # ### end Alembic commands ###
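The point of the rebuilt foreign keys is the ON UPDATE/ON DELETE CASCADE behavior: deleting a document (or department) now removes its rows in document_department_association automatically instead of raising a foreign-key violation. A minimal sketch of the effect, assuming a plain SQLAlchemy engine against this schema (the connection URL and the id 42 are illustrative):

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://postgres:quick@localhost:5432/QuickGpt")  # assumed URL

with engine.begin() as conn:
    # With ondelete='CASCADE', the association rows disappear with the parent.
    conn.execute(text("DELETE FROM document WHERE id = :doc_id"), {"doc_id": 42})
    leftover = conn.execute(
        text("SELECT count(*) FROM document_department_association WHERE document_id = :doc_id"),
        {"doc_id": 42},
    ).scalar_one()
    assert leftover == 0  # no orphaned association rows remain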


@@ -1,157 +0,0 @@
-"""update
-Revision ID: cb320e7880fc
-Revises:
-Create Date: 2024-03-17 10:09:40.034197
-"""
-from typing import Sequence, Union
-from alembic import op
-import sqlalchemy as sa
-from sqlalchemy.dialects import postgresql
-# revision identifiers, used by Alembic.
-revision: str = 'cb320e7880fc'
-down_revision: Union[str, None] = None
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-def upgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.create_table('companies',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
-    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)
-    op.create_table('document_type',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('type', sa.String(length=225), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('type')
-    )
-    op.create_index(op.f('ix_document_type_id'), 'document_type', ['id'], unique=False)
-    op.create_table('roles',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(length=100), nullable=True),
-    sa.Column('description', sa.Text(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
-    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)
-    op.create_table('departments',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('total_users', sa.Integer(), nullable=True),
-    sa.Column('total_documents', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_departments_id'), 'departments', ['id'], unique=False)
-    op.create_index(op.f('ix_departments_name'), 'departments', ['name'], unique=True)
-    op.create_table('subscriptions',
-    sa.Column('sub_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('start_date', sa.DateTime(), nullable=True),
-    sa.Column('end_date', sa.DateTime(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('sub_id')
-    )
-    op.create_index(op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False)
-    op.create_table('users',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('email', sa.String(length=225), nullable=False),
-    sa.Column('hashed_password', sa.String(), nullable=False),
-    sa.Column('username', sa.String(length=225), nullable=False),
-    sa.Column('is_active', sa.Boolean(), nullable=True),
-    sa.Column('last_login', sa.DateTime(), nullable=True),
-    sa.Column('created_at', sa.DateTime(), nullable=True),
-    sa.Column('updated_at', sa.DateTime(), nullable=True),
-    sa.Column('password_created', sa.DateTime(), nullable=True),
-    sa.Column('checker', sa.Boolean(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('department_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('email'),
-    sa.UniqueConstraint('username'),
-    sa.UniqueConstraint('username', name='unique_username_no_spacing')
-    )
-    op.create_table('audit',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('timestamp', sa.DateTime(), nullable=False),
-    sa.Column('user_id', sa.Integer(), nullable=True),
-    sa.Column('model', sa.String(length=100), nullable=False),
-    sa.Column('action', sa.String(length=50), nullable=False),
-    sa.Column('details', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
-    sa.Column('ip_address', sa.String(length=45), nullable=True),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='SET NULL'),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_audit_id'), 'audit', ['id'], unique=False)
-    op.create_table('document',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('filename', sa.String(length=225), nullable=False),
-    sa.Column('uploaded_by', sa.Integer(), nullable=False),
-    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
-    sa.Column('is_enabled', sa.Boolean(), nullable=True),
-    sa.Column('verified', sa.Boolean(), nullable=True),
-    sa.Column('doc_type_id', sa.Integer(), nullable=True),
-    sa.Column('action_type', sa.Enum('INSERT', 'UPDATE', 'DELETE', name='makercheckeractiontype'), nullable=False),
-    sa.Column('status', sa.Enum('PENDING', 'APPROVED', 'REJECTED', name='makercheckerstatus'), nullable=False),
-    sa.Column('verified_at', sa.DateTime(), nullable=True),
-    sa.Column('verified_by', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['doc_type_id'], ['document_type.id'], ),
-    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
-    sa.ForeignKeyConstraint(['verified_by'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('filename')
-    )
-    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
-    op.create_table('user_roles',
-    sa.Column('user_id', sa.Integer(), nullable=False),
-    sa.Column('role_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['role_id'], ['roles.id'], ),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
-    sa.UniqueConstraint('user_id', 'role_id', 'company_id', name='unique_user_role')
-    )
-    op.create_table('document_department_association',
-    sa.Column('department_id', sa.Integer(), nullable=True),
-    sa.Column('document_id', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.ForeignKeyConstraint(['document_id'], ['document.id'], )
-    )
-    # ### end Alembic commands ###
-def downgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_table('document_department_association')
-    op.drop_table('user_roles')
-    op.drop_index(op.f('ix_document_id'), table_name='document')
-    op.drop_table('document')
-    op.drop_index(op.f('ix_audit_id'), table_name='audit')
-    op.drop_table('audit')
-    op.drop_table('users')
-    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
-    op.drop_table('subscriptions')
-    op.drop_index(op.f('ix_departments_name'), table_name='departments')
-    op.drop_index(op.f('ix_departments_id'), table_name='departments')
-    op.drop_table('departments')
-    op.drop_index(op.f('ix_roles_name'), table_name='roles')
-    op.drop_index(op.f('ix_roles_id'), table_name='roles')
-    op.drop_table('roles')
-    op.drop_index(op.f('ix_document_type_id'), table_name='document_type')
-    op.drop_table('document_type')
-    op.drop_index(op.f('ix_companies_name'), table_name='companies')
-    op.drop_index(op.f('ix_companies_id'), table_name='companies')
-    op.drop_table('companies')
-    # ### end Alembic commands ###

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

models/.gitignore (vendored, 2 changes)

@@ -1,2 +0,0 @@
-*
-!.gitignore

poetry.lock (generated, 1714 changes)

File diff suppressed because it is too large

private_gpt/components/ocr_components/table_ocr_api.py

@@ -13,7 +13,7 @@ from private_gpt.users import models, schemas
 from private_gpt.users.constants.role import Role
 from private_gpt.components.ocr_components.table_ocr import GetOCRText
 from private_gpt.components.ocr_components.TextExtraction import ImageToTable
-from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
+from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse, ingest
 pdf_router = APIRouter(prefix="/v1", tags=["ocr"])
@@ -80,9 +80,7 @@ async def process_pdf_ocr(
 ):
     UPLOAD_DIR = OCR_UPLOAD
     try:
-        print("The file name is: ", file.filename)
         pdf_path = await save_uploaded_file(file, UPLOAD_DIR)
-        print("The file path: ", pdf_path)
         ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
         ingested_documents = await common_ingest_logic(
             request=request, db=db, ocr_file=ocr_doc_path, current_user=current_user, original_file=None, log_audit=log_audit, departments=departments
@@ -96,6 +94,43 @@ async def process_pdf_ocr(
             detail=f"There was an error processing OCR: {e}"
         )
+async def process_ocr(
+    request: Request,
+    pdf_path: str,
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_documents = await ingest(request=request, file_path=ocr_doc_path)
+        return ingested_documents
+    except Exception as e:
+        print(traceback.print_exc())
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
+async def process_both_ocr(
+    request: Request,
+    pdf_path: str
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_ocr_documents = await ingest(request=request, file_path=ocr_doc_path)  # ingest ocr
+        ingested_documents = await ingest(request=request, file_path=pdf_path)  # ingest pdf
+        return ingested_documents
+    except Exception as e:
+        print(traceback.print_exc())
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
 async def process_both(
     request: Request,
     db: Session,
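The two new helpers wrap the router-level ingest for the verify flow below: process_ocr indexes only the OCR-extracted document, while process_both_ocr ingests the OCR output and then the original PDF. A sketch of how a caller chooses between them (paths are illustrative; both must be awaited inside an async handler):

# Hypothetical call sites inside an async route handler:
ocr_only_docs = await process_ocr(request, "uploads/unchecked/scan.pdf")
pdf_plus_ocr_docs = await process_both_ocr(request, "uploads/unchecked/contract.pdf")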

private_gpt/server/ingest/ingest_router.py

@@ -44,13 +44,13 @@ class IngestResponse(BaseModel):
 class DeleteFilename(BaseModel):
     filename: str
-@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
-def ingest(request: Request, file: UploadFile) -> IngestResponse:
-    """Ingests and processes a file.
-    Deprecated. Use ingest/file instead.
-    """
-    return ingest_file(request, file)
+# @ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
+# def ingest(request: Request, file: UploadFile) -> IngestResponse:
+#     """Ingests and processes a file.
+#     Deprecated. Use ingest/file instead.
+#     """
+#     return ingest_file(request, file)
 @ingest_router.post("/ingest/file1", tags=["Ingestion"])
@@ -205,7 +205,8 @@ async def create_documents(
         filename=file_name,
         uploaded_by=current_user.id,
         action_type=MakerCheckerActionType.INSERT,
-        status=MakerCheckerStatus.PENDING
+        status=MakerCheckerStatus.PENDING,
+        doc_type_id=departments.doc_type_id,
     )
     print("DOCUMENT CREATE: ", docs_in)
     document = crud.documents.create(db=db, obj_in=docs_in)
@@ -298,35 +299,26 @@ async def common_ingest_logic(
 async def ingest(request: Request, file_path: str) -> IngestResponse:
-    """Ingests and processes a file, storing its chunks to be used as context.
-    The context obtained from files is later used in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-    Most common document
-    formats are supported, but you may be prompted to install an extra dependency to
-    manage a specific file type.
-    A file can generate different Documents (for example a PDF generates one Document
-    per page). All Documents IDs are returned in the response, together with the
-    extracted Metadata (which is later used to improve context retrieval). Those IDs
-    can be used to filter the context used to create responses in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-    """
+    """Ingests and processes a file, storing its chunks to be used as context."""
     service = request.state.injector.get(IngestService)
     try:
         with open(file_path, 'rb') as file:
             file_name = Path(file_path).name
             upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
-            with open(upload_path, "wb") as f:
-                f.write(file.file.read())
-            with open(upload_path, "rb") as f:
-                ingested_documents = service.ingest_bin_data(file.filename, f)
+            with upload_path.open('wb') as f:
+                f.write(file.read())
+            with upload_path.open('rb') as f:
+                ingested_documents = await service.ingest_bin_data(file_name, f)
     except Exception as e:
         return {"message": f"There was an error uploading the file(s)\n {e}"}
     finally:
-        file.file.close()
+        upload_path.unlink(missing_ok=True)
     return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
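After this rewrite, ingest copies the source file into UPLOAD_DIR, streams the copy into IngestService, and always deletes the copy in the finally block; note that Path.unlink(missing_ok=True) requires Python 3.8+. A sketch of calling it from another route, assuming a FastAPI Request is in scope (the path is illustrative):

# Hypothetical call site: hand an approved on-disk file straight to ingestion.
response = await ingest(request, file_path="uploads/unchecked/report.pdf")
print(response.model, len(response.data))  # "private-gpt", count of ingested docs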


@@ -72,7 +72,7 @@ class IngestService:
         logger.debug("Ingesting text data with file_name=%s", file_name)
         return self._ingest_data(file_name, text)
-    def ingest_bin_data(
+    async def ingest_bin_data(
         self, file_name: str, raw_file_data: BinaryIO
     ) -> list[IngestedDoc]:
         logger.debug("Ingesting binary data with file_name=%s", file_name)
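Because ingest_bin_data is now a coroutine, every caller must await it; calling it without await merely creates a coroutine object and ingests nothing. A sketch of the changed call pattern, mirroring the router code above (the file name is illustrative):

service = request.state.injector.get(IngestService)
with open("example.pdf", "rb") as f:
    docs = await service.ingest_bin_data("example.pdf", f)  # await is now required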


@@ -15,6 +15,7 @@ from private_gpt.users.constants.role import Role
 from private_gpt.users import crud, models, schemas
 from private_gpt.server.ingest.ingest_router import create_documents, ingest
 from private_gpt.users.models.document import MakerCheckerActionType, MakerCheckerStatus
+from private_gpt.components.ocr_components.table_ocr_api import process_both_ocr, process_ocr
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix='/documents', tags=['Documents'])
@@ -262,7 +263,7 @@ async def upload_documents(
 )
-@router.post('/verify', response_model=schemas.Document)
+@router.post('/verify')
 async def verify_documents(
     request: Request,
     checker_in: schemas.DocumentUpdate = Depends(),
@@ -284,8 +285,7 @@ async def verify_documents(
             detail="Document not found!",
         )
     unchecked_path = Path(f"{UNCHECKED_DIR}/{document.filename}")
-    print(checker_in.status)
-    print(MakerCheckerStatus.APPROVED.value)
    if checker_in.status == MakerCheckerStatus.APPROVED.value:
        checker = schemas.DocumentCheckerUpdate(
            status=MakerCheckerStatus.APPROVED,
@@ -296,9 +296,9 @@ async def verify_documents(
        crud.documents.update(db=db, db_obj= document, obj_in=checker)
        if document.doc_type_id == 2:
-           return await ingest(request, unchecked_path)
+           return await process_ocr(request, unchecked_path)
        elif document.doc_type_id == 3:
-           return await ingest(request, unchecked_path)
+           return await process_both_ocr(request, unchecked_path)
        else:
            return await ingest(request, unchecked_path)
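Approval now dispatches on the document's type: doc_type_id 2 runs OCR before ingesting, 3 ingests both the OCR output and the original PDF, and anything else ingests the stored file as-is. A sketch of exercising the endpoint, assuming the API is served at localhost:8000 with this router mounted under /v1 and that DocumentUpdate exposes id and status as query parameters (all deployment details are assumptions, not shown in this diff):

import httpx

# Hypothetical checker approval request; adjust base URL, auth, and parameter names.
resp = httpx.post(
    "http://localhost:8000/v1/documents/verify",
    params={"id": 42, "status": "APPROVED"},
    headers={"Authorization": "Bearer <token>"},
)
print(resp.status_code, resp.json())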


@@ -45,7 +45,8 @@ class Document(BaseModel):
 class DocumentMakerChecker(DocumentCreate):
     action_type: str
     status: str
+    doc_type_id: int
 class DocumentMakerCreate(DocumentMakerChecker):
     pass
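With doc_type_id on the maker-checker schema, a maker request must now state how the file should be ingested on approval. A sketch of building the payload, assuming create_documents constructs a schemas.DocumentMakerCreate as in the ingest_router change above (field values are illustrative):

docs_in = schemas.DocumentMakerCreate(
    filename="contract.pdf",
    uploaded_by=1,                            # illustrative user id
    action_type=MakerCheckerActionType.INSERT,
    status=MakerCheckerStatus.PENDING,
    doc_type_id=2,                            # 2 = OCR-only ingestion per the verify branch
)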

pyproject.toml

@@ -50,6 +50,8 @@ boto3 = {version ="^1.34.51", optional = true}
 # Optional UI
 gradio = {version ="^4.19.2", optional = true}
+aiofiles = "^23.2.1"
+timm = "^0.9.16"
 [tool.poetry.extras]
 ui = ["gradio"]