Update: add a verify function to check maker requests for documents

Saurab-Shrestha 2024-03-17 17:58:42 +05:45
parent 2bcc7f589f
commit f7de7c3b54
15 changed files with 954 additions and 1048 deletions

.env

@@ -4,8 +4,8 @@ ENVIRONMENT=dev
 DB_HOST=localhost
 DB_USER=postgres
 DB_PORT=5432
-DB_PASSWORD=admin
-DB_NAME=GPT
+DB_PASSWORD=quick
+DB_NAME=QuickGpt
 SUPER_ADMIN_EMAIL=superadmin@email.com
 SUPER_ADMIN_PASSWORD=supersecretpassword
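For reference, a minimal sketch of how these variables might be combined into a connection URL — assuming plain os.environ lookups, not necessarily the settings loader this project actually uses:

import os

# Hypothetical helper: builds a Postgres URL from the .env values above.
# Defaults mirror the new values introduced in this commit.
def database_url() -> str:
    user = os.environ.get("DB_USER", "postgres")
    password = os.environ.get("DB_PASSWORD", "quick")
    host = os.environ.get("DB_HOST", "localhost")
    port = os.environ.get("DB_PORT", "5432")
    name = os.environ.get("DB_NAME", "QuickGpt")
    return f"postgresql://{user}:{password}@{host}:{port}/{name}"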


@@ -1,8 +1,8 @@
-"""udpate cascade
+"""update

-Revision ID: 39c817e4fc4a
-Revises: cb320e7880fc
-Create Date: 2024-03-17 11:08:22.426368
+Revision ID: b7b896502e8e
+Revises:
+Create Date: 2024-03-17 15:07:10.795935

 """
 from typing import Sequence, Union
@@ -12,16 +12,16 @@ import sqlalchemy as sa

 # revision identifiers, used by Alembic.
-revision: str = '39c817e4fc4a'
-down_revision: Union[str, None] = 'cb320e7880fc'
+revision: str = 'b7b896502e8e'
+down_revision: Union[str, None] = None
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None


 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.drop_constraint('document_department_association_department_id_fkey', 'document_department_association', type_='foreignkey')
+    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.create_foreign_key(None, 'document_department_association', 'document', ['document_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     op.create_foreign_key(None, 'document_department_association', 'departments', ['department_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     # op.create_unique_constraint('unique_user_role', 'user_roles', ['user_id', 'role_id', 'company_id'])
@@ -33,6 +33,6 @@ def downgrade() -> None:
     # op.drop_constraint('unique_user_role', 'user_roles', type_='unique')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
-    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     op.create_foreign_key('document_department_association_document_id_fkey', 'document_department_association', 'document', ['document_id'], ['id'])
+    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     # ### end Alembic commands ###
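The new revision recreates both foreign keys of the association table with ON UPDATE/ON DELETE CASCADE, so association rows are cleaned up automatically when a document or department is renumbered or deleted. A sketch of the matching table definition on the model side — the variable name and metadata wiring are assumptions; only the constraints come from the migration:

import sqlalchemy as sa

metadata = sa.MetaData()

# Hypothetical mirror of the constraints created by the upgrade above.
document_department_association = sa.Table(
    'document_department_association', metadata,
    sa.Column('department_id', sa.Integer,
              sa.ForeignKey('departments.id', onupdate='CASCADE', ondelete='CASCADE')),
    sa.Column('document_id', sa.Integer,
              sa.ForeignKey('document.id', onupdate='CASCADE', ondelete='CASCADE')),
)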


@@ -1,157 +0,0 @@
-"""update
-
-Revision ID: cb320e7880fc
-Revises:
-Create Date: 2024-03-17 10:09:40.034197
-
-"""
-from typing import Sequence, Union
-
-from alembic import op
-import sqlalchemy as sa
-from sqlalchemy.dialects import postgresql
-
-# revision identifiers, used by Alembic.
-revision: str = 'cb320e7880fc'
-down_revision: Union[str, None] = None
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-
-
-def upgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.create_table('companies',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
-    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)
-    op.create_table('document_type',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('type', sa.String(length=225), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('type')
-    )
-    op.create_index(op.f('ix_document_type_id'), 'document_type', ['id'], unique=False)
-    op.create_table('roles',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(length=100), nullable=True),
-    sa.Column('description', sa.Text(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
-    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)
-    op.create_table('departments',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('total_users', sa.Integer(), nullable=True),
-    sa.Column('total_documents', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_departments_id'), 'departments', ['id'], unique=False)
-    op.create_index(op.f('ix_departments_name'), 'departments', ['name'], unique=True)
-    op.create_table('subscriptions',
-    sa.Column('sub_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('start_date', sa.DateTime(), nullable=True),
-    sa.Column('end_date', sa.DateTime(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('sub_id')
-    )
-    op.create_index(op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False)
-    op.create_table('users',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('email', sa.String(length=225), nullable=False),
-    sa.Column('hashed_password', sa.String(), nullable=False),
-    sa.Column('username', sa.String(length=225), nullable=False),
-    sa.Column('is_active', sa.Boolean(), nullable=True),
-    sa.Column('last_login', sa.DateTime(), nullable=True),
-    sa.Column('created_at', sa.DateTime(), nullable=True),
-    sa.Column('updated_at', sa.DateTime(), nullable=True),
-    sa.Column('password_created', sa.DateTime(), nullable=True),
-    sa.Column('checker', sa.Boolean(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('department_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('email'),
-    sa.UniqueConstraint('username'),
-    sa.UniqueConstraint('username', name='unique_username_no_spacing')
-    )
-    op.create_table('audit',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('timestamp', sa.DateTime(), nullable=False),
-    sa.Column('user_id', sa.Integer(), nullable=True),
-    sa.Column('model', sa.String(length=100), nullable=False),
-    sa.Column('action', sa.String(length=50), nullable=False),
-    sa.Column('details', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
-    sa.Column('ip_address', sa.String(length=45), nullable=True),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='SET NULL'),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_audit_id'), 'audit', ['id'], unique=False)
-    op.create_table('document',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('filename', sa.String(length=225), nullable=False),
-    sa.Column('uploaded_by', sa.Integer(), nullable=False),
-    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
-    sa.Column('is_enabled', sa.Boolean(), nullable=True),
-    sa.Column('verified', sa.Boolean(), nullable=True),
-    sa.Column('doc_type_id', sa.Integer(), nullable=True),
-    sa.Column('action_type', sa.Enum('INSERT', 'UPDATE', 'DELETE', name='makercheckeractiontype'), nullable=False),
-    sa.Column('status', sa.Enum('PENDING', 'APPROVED', 'REJECTED', name='makercheckerstatus'), nullable=False),
-    sa.Column('verified_at', sa.DateTime(), nullable=True),
-    sa.Column('verified_by', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['doc_type_id'], ['document_type.id'], ),
-    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
-    sa.ForeignKeyConstraint(['verified_by'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('filename')
-    )
-    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
-    op.create_table('user_roles',
-    sa.Column('user_id', sa.Integer(), nullable=False),
-    sa.Column('role_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['role_id'], ['roles.id'], ),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
-    sa.UniqueConstraint('user_id', 'role_id', 'company_id', name='unique_user_role')
-    )
-    op.create_table('document_department_association',
-    sa.Column('department_id', sa.Integer(), nullable=True),
-    sa.Column('document_id', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.ForeignKeyConstraint(['document_id'], ['document.id'], )
-    )
-    # ### end Alembic commands ###
-
-
-def downgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_table('document_department_association')
-    op.drop_table('user_roles')
-    op.drop_index(op.f('ix_document_id'), table_name='document')
-    op.drop_table('document')
-    op.drop_index(op.f('ix_audit_id'), table_name='audit')
-    op.drop_table('audit')
-    op.drop_table('users')
-    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
-    op.drop_table('subscriptions')
-    op.drop_index(op.f('ix_departments_name'), table_name='departments')
-    op.drop_index(op.f('ix_departments_id'), table_name='departments')
-    op.drop_table('departments')
-    op.drop_index(op.f('ix_roles_name'), table_name='roles')
-    op.drop_index(op.f('ix_roles_id'), table_name='roles')
-    op.drop_table('roles')
-    op.drop_index(op.f('ix_document_type_id'), table_name='document_type')
-    op.drop_table('document_type')
-    op.drop_index(op.f('ix_companies_name'), table_name='companies')
-    op.drop_index(op.f('ix_companies_id'), table_name='companies')
-    op.drop_table('companies')
-    # ### end Alembic commands ###
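The document table above carries the maker-checker state: action_type and status enums plus a verification trail (verified_by, verified_at). A sketch of the two enums as Python classes — the names match the import from private_gpt.users.models.document seen later in this commit, but the exact definitions are an assumption:

import enum

class MakerCheckerActionType(enum.Enum):
    INSERT = 'INSERT'
    UPDATE = 'UPDATE'
    DELETE = 'DELETE'

class MakerCheckerStatus(enum.Enum):
    PENDING = 'PENDING'
    APPROVED = 'APPROVED'
    REJECTED = 'REJECTED'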

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

models/.gitignore vendored

@@ -1,2 +0,0 @@
-*
-!.gitignore

poetry.lock generated

File diff suppressed because it is too large


@@ -13,7 +13,7 @@ from private_gpt.users import models, schemas
 from private_gpt.users.constants.role import Role
 from private_gpt.components.ocr_components.table_ocr import GetOCRText
 from private_gpt.components.ocr_components.TextExtraction import ImageToTable
-from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
+from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse, ingest

 pdf_router = APIRouter(prefix="/v1", tags=["ocr"])
@@ -80,9 +80,7 @@
 ):
     UPLOAD_DIR = OCR_UPLOAD
     try:
-        print("The file name is: ", file.filename)
         pdf_path = await save_uploaded_file(file, UPLOAD_DIR)
-        print("The file path: ", pdf_path)
         ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
         ingested_documents = await common_ingest_logic(
             request=request, db=db, ocr_file=ocr_doc_path, current_user=current_user, original_file=None, log_audit=log_audit, departments=departments
@@ -96,6 +94,43 @@ async def process_pdf_ocr(
             detail=f"There was an error processing OCR: {e}"
         )

+async def process_ocr(
+    request: Request,
+    pdf_path: str,
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_documents = await ingest(request=request, file_path=ocr_doc_path)
+        return ingested_documents
+    except Exception as e:
+        traceback.print_exc()
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
+
+
+async def process_both_ocr(
+    request: Request,
+    pdf_path: str
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_ocr_documents = await ingest(request=request, file_path=ocr_doc_path)  # ingest ocr
+        ingested_documents = await ingest(request=request, file_path=pdf_path)  # ingest pdf
+        return ingested_documents
+    except Exception as e:
+        traceback.print_exc()
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
+
 async def process_both(
     request: Request,
     db: Session,
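The two new helpers differ only in whether the original PDF is ingested alongside its OCR output; note that process_both_ocr ingests both files but returns only the result of the PDF ingestion. A hypothetical call site (the path is illustrative):

# Inside some async handler with a Request in scope:
docs = await process_ocr(request, '/tmp/unchecked/scan.pdf')       # OCR output only
docs = await process_both_ocr(request, '/tmp/unchecked/scan.pdf')  # OCR output + original PDF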


@@ -44,13 +44,13 @@ class IngestResponse(BaseModel):
 class DeleteFilename(BaseModel):
     filename: str

-@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
-def ingest(request: Request, file: UploadFile) -> IngestResponse:
-    """Ingests and processes a file.
+# @ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
+# def ingest(request: Request, file: UploadFile) -> IngestResponse:
+#     """Ingests and processes a file.

-    Deprecated. Use ingest/file instead.
-    """
-    return ingest_file(request, file)
+#     Deprecated. Use ingest/file instead.
+#     """
+#     return ingest_file(request, file)


 @ingest_router.post("/ingest/file1", tags=["Ingestion"])
@@ -205,7 +205,8 @@ async def create_documents(
         filename=file_name,
         uploaded_by=current_user.id,
         action_type=MakerCheckerActionType.INSERT,
-        status=MakerCheckerStatus.PENDING
+        status=MakerCheckerStatus.PENDING,
+        doc_type_id=departments.doc_type_id,
     )
     print("DOCUMENT CREATE: ", docs_in)
     document = crud.documents.create(db=db, obj_in=docs_in)
@@ -298,35 +299,26 @@ async def common_ingest_logic(

 async def ingest(request: Request, file_path: str) -> IngestResponse:
-    """Ingests and processes a file, storing its chunks to be used as context.
-
-    The context obtained from files is later used in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-
-    Most common document
-    formats are supported, but you may be prompted to install an extra dependency to
-    manage a specific file type.
-
-    A file can generate different Documents (for example a PDF generates one Document
-    per page). All Documents IDs are returned in the response, together with the
-    extracted Metadata (which is later used to improve context retrieval). Those IDs
-    can be used to filter the context used to create responses in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-    """
+    """Ingests and processes a file, storing its chunks to be used as context."""
     service = request.state.injector.get(IngestService)
     try:
         with open(file_path, 'rb') as file:
             file_name = Path(file_path).name
             upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
-            with open(upload_path, "wb") as f:
-                f.write(file.file.read())
-            with open(upload_path, "rb") as f:
-                ingested_documents = service.ingest_bin_data(file.filename, f)
+            with upload_path.open('wb') as f:
+                f.write(file.read())
+            with upload_path.open('rb') as f:
+                ingested_documents = await service.ingest_bin_data(file_name, f)
     except Exception as e:
         return {"message": f"There was an error uploading the file(s)\n {e}"}
     finally:
-        file.file.close()
+        upload_path.unlink(missing_ok=True)
     return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
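The reworked ingest takes a filesystem path instead of an UploadFile: it copies the file into UPLOAD_DIR, streams the copy into IngestService.ingest_bin_data, and unlinks the copy in the finally block (the except branch still returns a plain dict rather than an IngestResponse). A minimal sketch of a call, with an illustrative path:

# Hypothetical call from another async route handler:
response = await ingest(request, '/tmp/unchecked/report.pdf')
assert response.object == 'list' and response.model == 'private-gpt'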


@@ -72,7 +72,7 @@ class IngestService:
         logger.debug("Ingesting text data with file_name=%s", file_name)
         return self._ingest_data(file_name, text)

-    def ingest_bin_data(
+    async def ingest_bin_data(
         self, file_name: str, raw_file_data: BinaryIO
     ) -> list[IngestedDoc]:
         logger.debug("Ingesting binary data with file_name=%s", file_name)


@@ -15,6 +15,7 @@ from private_gpt.users.constants.role import Role
 from private_gpt.users import crud, models, schemas
 from private_gpt.server.ingest.ingest_router import create_documents, ingest
 from private_gpt.users.models.document import MakerCheckerActionType, MakerCheckerStatus
+from private_gpt.components.ocr_components.table_ocr_api import process_both_ocr, process_ocr

 logger = logging.getLogger(__name__)
 router = APIRouter(prefix='/documents', tags=['Documents'])
@@ -262,7 +263,7 @@
     )

-@router.post('/verify', response_model=schemas.Document)
+@router.post('/verify')
 async def verify_documents(
     request: Request,
     checker_in: schemas.DocumentUpdate = Depends(),
@@ -284,8 +285,7 @@ async def verify_documents(
             detail="Document not found!",
         )
     unchecked_path = Path(f"{UNCHECKED_DIR}/{document.filename}")
-    print(checker_in.status)
-    print(MakerCheckerStatus.APPROVED.value)
+
     if checker_in.status == MakerCheckerStatus.APPROVED.value:
         checker = schemas.DocumentCheckerUpdate(
             status=MakerCheckerStatus.APPROVED,
@@ -296,9 +296,9 @@
         crud.documents.update(db=db, db_obj=document, obj_in=checker)

         if document.doc_type_id == 2:
-            return await ingest(request, unchecked_path)
+            return await process_ocr(request, unchecked_path)
         elif document.doc_type_id == 3:
-            return await ingest(request, unchecked_path)
+            return await process_both_ocr(request, unchecked_path)
         else:
             return await ingest(request, unchecked_path)
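On approval, the handler dispatches on doc_type_id: 2 goes to OCR-only processing, 3 to OCR plus the original PDF, and anything else to plain ingestion (the meaning of the IDs is inferred from the handlers they select). The same branch could be restated as a lookup table — a sketch, not what the commit ships:

# Hypothetical restatement of the if/elif/else above; all three handlers
# share the (request, path) calling convention.
handlers = {2: process_ocr, 3: process_both_ocr}
return await handlers.get(document.doc_type_id, ingest)(request, unchecked_path)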


@@ -45,7 +45,8 @@ class Document(BaseModel):

 class DocumentMakerChecker(DocumentCreate):
     action_type: str
     status: str
+    doc_type_id: int

 class DocumentMakerCreate(DocumentMakerChecker):
     pass


@@ -50,6 +50,8 @@ boto3 = {version ="^1.34.51", optional = true}
 # Optional UI
 gradio = {version ="^4.19.2", optional = true}
 aiofiles = "^23.2.1"
+timm = "^0.9.16"
+

 [tool.poetry.extras]
 ui = ["gradio"]