Mirror of https://github.com/imartinez/privateGPT.git (synced 2025-06-30 09:12:17 +00:00)

Update with verify function to check the maker request for documents

This commit is contained in:
parent 2bcc7f589f
commit f7de7c3b54
.env (4 changes)
@@ -4,8 +4,8 @@ ENVIRONMENT=dev
 DB_HOST=localhost
 DB_USER=postgres
 DB_PORT=5432
-DB_PASSWORD=admin
-DB_NAME=GPT
+DB_PASSWORD=quick
+DB_NAME=QuickGpt
 
 SUPER_ADMIN_EMAIL=superadmin@email.com
 SUPER_ADMIN_PASSWORD=supersecretpassword
Alembic migration: revision b7b896502e8e (replaces 39c817e4fc4a, "update cascade")
@@ -1,8 +1,8 @@
-"""udpate cascade
+"""update
 
-Revision ID: 39c817e4fc4a
-Revises: cb320e7880fc
-Create Date: 2024-03-17 11:08:22.426368
+Revision ID: b7b896502e8e
+Revises:
+Create Date: 2024-03-17 15:07:10.795935
 
 """
 from typing import Sequence, Union
@@ -12,16 +12,16 @@ import sqlalchemy as sa
 
 
 # revision identifiers, used by Alembic.
-revision: str = '39c817e4fc4a'
-down_revision: Union[str, None] = 'cb320e7880fc'
+revision: str = 'b7b896502e8e'
+down_revision: Union[str, None] = None
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 
 
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.drop_constraint('document_department_association_department_id_fkey', 'document_department_association', type_='foreignkey')
+    op.drop_constraint('document_department_association_document_id_fkey', 'document_department_association', type_='foreignkey')
     op.create_foreign_key(None, 'document_department_association', 'document', ['document_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     op.create_foreign_key(None, 'document_department_association', 'departments', ['department_id'], ['id'], onupdate='CASCADE', ondelete='CASCADE')
     # op.create_unique_constraint('unique_user_role', 'user_roles', ['user_id', 'role_id', 'company_id'])
@@ -33,6 +33,6 @@ def downgrade() -> None:
     # op.drop_constraint('unique_user_role', 'user_roles', type_='unique')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
     op.drop_constraint(None, 'document_department_association', type_='foreignkey')
-    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     op.create_foreign_key('document_department_association_document_id_fkey', 'document_department_association', 'document', ['document_id'], ['id'])
+    op.create_foreign_key('document_department_association_department_id_fkey', 'document_department_association', 'departments', ['department_id'], ['id'])
     # ### end Alembic commands ###
Alembic migration: revision cb320e7880fc (file deleted)
@@ -1,157 +0,0 @@
-"""update
-
-Revision ID: cb320e7880fc
-Revises:
-Create Date: 2024-03-17 10:09:40.034197
-
-"""
-from typing import Sequence, Union
-
-from alembic import op
-import sqlalchemy as sa
-from sqlalchemy.dialects import postgresql
-
-# revision identifiers, used by Alembic.
-revision: str = 'cb320e7880fc'
-down_revision: Union[str, None] = None
-branch_labels: Union[str, Sequence[str], None] = None
-depends_on: Union[str, Sequence[str], None] = None
-
-
-def upgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.create_table('companies',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
-    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)
-    op.create_table('document_type',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('type', sa.String(length=225), nullable=False),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('type')
-    )
-    op.create_index(op.f('ix_document_type_id'), 'document_type', ['id'], unique=False)
-    op.create_table('roles',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(length=100), nullable=True),
-    sa.Column('description', sa.Text(), nullable=True),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
-    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)
-    op.create_table('departments',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('name', sa.String(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('total_users', sa.Integer(), nullable=True),
-    sa.Column('total_documents', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_departments_id'), 'departments', ['id'], unique=False)
-    op.create_index(op.f('ix_departments_name'), 'departments', ['name'], unique=True)
-    op.create_table('subscriptions',
-    sa.Column('sub_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('start_date', sa.DateTime(), nullable=True),
-    sa.Column('end_date', sa.DateTime(), nullable=True),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.PrimaryKeyConstraint('sub_id')
-    )
-    op.create_index(op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False)
-    op.create_table('users',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('email', sa.String(length=225), nullable=False),
-    sa.Column('hashed_password', sa.String(), nullable=False),
-    sa.Column('username', sa.String(length=225), nullable=False),
-    sa.Column('is_active', sa.Boolean(), nullable=True),
-    sa.Column('last_login', sa.DateTime(), nullable=True),
-    sa.Column('created_at', sa.DateTime(), nullable=True),
-    sa.Column('updated_at', sa.DateTime(), nullable=True),
-    sa.Column('password_created', sa.DateTime(), nullable=True),
-    sa.Column('checker', sa.Boolean(), nullable=True),
-    sa.Column('company_id', sa.Integer(), nullable=True),
-    sa.Column('department_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('email'),
-    sa.UniqueConstraint('username'),
-    sa.UniqueConstraint('username', name='unique_username_no_spacing')
-    )
-    op.create_table('audit',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('timestamp', sa.DateTime(), nullable=False),
-    sa.Column('user_id', sa.Integer(), nullable=True),
-    sa.Column('model', sa.String(length=100), nullable=False),
-    sa.Column('action', sa.String(length=50), nullable=False),
-    sa.Column('details', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
-    sa.Column('ip_address', sa.String(length=45), nullable=True),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='SET NULL'),
-    sa.PrimaryKeyConstraint('id')
-    )
-    op.create_index(op.f('ix_audit_id'), 'audit', ['id'], unique=False)
-    op.create_table('document',
-    sa.Column('id', sa.Integer(), nullable=False),
-    sa.Column('filename', sa.String(length=225), nullable=False),
-    sa.Column('uploaded_by', sa.Integer(), nullable=False),
-    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
-    sa.Column('is_enabled', sa.Boolean(), nullable=True),
-    sa.Column('verified', sa.Boolean(), nullable=True),
-    sa.Column('doc_type_id', sa.Integer(), nullable=True),
-    sa.Column('action_type', sa.Enum('INSERT', 'UPDATE', 'DELETE', name='makercheckeractiontype'), nullable=False),
-    sa.Column('status', sa.Enum('PENDING', 'APPROVED', 'REJECTED', name='makercheckerstatus'), nullable=False),
-    sa.Column('verified_at', sa.DateTime(), nullable=True),
-    sa.Column('verified_by', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['doc_type_id'], ['document_type.id'], ),
-    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
-    sa.ForeignKeyConstraint(['verified_by'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('id'),
-    sa.UniqueConstraint('filename')
-    )
-    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
-    op.create_table('user_roles',
-    sa.Column('user_id', sa.Integer(), nullable=False),
-    sa.Column('role_id', sa.Integer(), nullable=False),
-    sa.Column('company_id', sa.Integer(), nullable=False),
-    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
-    sa.ForeignKeyConstraint(['role_id'], ['roles.id'], ),
-    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
-    sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
-    sa.UniqueConstraint('user_id', 'role_id', 'company_id', name='unique_user_role')
-    )
-    op.create_table('document_department_association',
-    sa.Column('department_id', sa.Integer(), nullable=True),
-    sa.Column('document_id', sa.Integer(), nullable=True),
-    sa.ForeignKeyConstraint(['department_id'], ['departments.id'], ),
-    sa.ForeignKeyConstraint(['document_id'], ['document.id'], )
-    )
-    # ### end Alembic commands ###
-
-
-def downgrade() -> None:
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_table('document_department_association')
-    op.drop_table('user_roles')
-    op.drop_index(op.f('ix_document_id'), table_name='document')
-    op.drop_table('document')
-    op.drop_index(op.f('ix_audit_id'), table_name='audit')
-    op.drop_table('audit')
-    op.drop_table('users')
-    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
-    op.drop_table('subscriptions')
-    op.drop_index(op.f('ix_departments_name'), table_name='departments')
-    op.drop_index(op.f('ix_departments_id'), table_name='departments')
-    op.drop_table('departments')
-    op.drop_index(op.f('ix_roles_name'), table_name='roles')
-    op.drop_index(op.f('ix_roles_id'), table_name='roles')
-    op.drop_table('roles')
-    op.drop_index(op.f('ix_document_type_id'), table_name='document_type')
-    op.drop_table('document_type')
-    op.drop_index(op.f('ix_companies_name'), table_name='companies')
-    op.drop_index(op.f('ix_companies_id'), table_name='companies')
-    op.drop_table('companies')
-    # ### end Alembic commands ###
ffmpy-0.3.2-py3-none-any.whl (BIN, new file)
Binary file not shown.

File diff suppressed because one or more lines are too long.
models/.gitignore (vendored, 2 changes)
@@ -1,2 +0,0 @@
-*
-!.gitignore
poetry.lock (generated, 1714 changes)
File diff suppressed because it is too large.
private_gpt/components/ocr_components/table_ocr_api.py
@@ -13,7 +13,7 @@ from private_gpt.users import models, schemas
 from private_gpt.users.constants.role import Role
 from private_gpt.components.ocr_components.table_ocr import GetOCRText
 from private_gpt.components.ocr_components.TextExtraction import ImageToTable
-from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse
+from private_gpt.server.ingest.ingest_router import common_ingest_logic, IngestResponse, ingest
 
 pdf_router = APIRouter(prefix="/v1", tags=["ocr"])
 
 
@@ -80,9 +80,7 @@ async def process_pdf_ocr(
 ):
     UPLOAD_DIR = OCR_UPLOAD
     try:
-        print("The file name is: ", file.filename)
         pdf_path = await save_uploaded_file(file, UPLOAD_DIR)
-        print("The file path: ", pdf_path)
         ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
         ingested_documents = await common_ingest_logic(
             request=request, db=db, ocr_file=ocr_doc_path, current_user=current_user, original_file=None, log_audit=log_audit, departments=departments
@@ -96,6 +94,43 @@ async def process_pdf_ocr(
             detail=f"There was an error processing OCR: {e}"
         )
 
+
+async def process_ocr(
+    request: Request,
+    pdf_path: str,
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_documents = await ingest(request=request, file_path=ocr_doc_path)
+        return ingested_documents
+    except Exception as e:
+        print(traceback.print_exc())
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
+
+
+async def process_both_ocr(
+    request: Request,
+    pdf_path: str
+):
+    UPLOAD_DIR = OCR_UPLOAD
+    try:
+        ocr_doc_path = await process_images_and_generate_doc(request, pdf_path, UPLOAD_DIR)
+        ingested_ocr_documents = await ingest(request=request, file_path=ocr_doc_path)  # ingest ocr
+        ingested_documents = await ingest(request=request, file_path=pdf_path)  # ingest pdf
+        return ingested_documents
+
+    except Exception as e:
+        print(traceback.print_exc())
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"There was an error processing OCR: {e}"
+        )
+
+
 async def process_both(
     request: Request,
     db: Session,
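Worth noting: process_both_ocr discards ingested_ocr_documents and returns only the result of the second ingest call (the original PDF). If a caller ever needed both result sets, a variant along these lines would keep them (a sketch against the ingest helper and IngestResponse shown in this commit; the function name and the separate ocr_doc_path parameter are hypothetical):

# Hypothetical variant, not part of this commit.
async def process_both_keep_all(request, pdf_path, ocr_doc_path):
    ocr_resp = await ingest(request=request, file_path=ocr_doc_path)  # OCR text
    pdf_resp = await ingest(request=request, file_path=pdf_path)      # original PDF
    # IngestResponse.data holds the list of ingested documents in both responses.
    return ocr_resp.data + pdf_resp.data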
private_gpt/server/ingest/ingest_router.py
@@ -44,13 +44,13 @@ class IngestResponse(BaseModel):
 class DeleteFilename(BaseModel):
     filename: str
 
-@ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
-def ingest(request: Request, file: UploadFile) -> IngestResponse:
-    """Ingests and processes a file.
+# @ingest_router.post("/ingest", tags=["Ingestion"], deprecated=True)
+# def ingest(request: Request, file: UploadFile) -> IngestResponse:
+#     """Ingests and processes a file.
 
-    Deprecated. Use ingest/file instead.
-    """
-    return ingest_file(request, file)
+#     Deprecated. Use ingest/file instead.
+#     """
+#     return ingest_file(request, file)
 
 
 @ingest_router.post("/ingest/file1", tags=["Ingestion"])
@@ -205,7 +205,8 @@ async def create_documents(
         filename=file_name,
         uploaded_by=current_user.id,
         action_type=MakerCheckerActionType.INSERT,
-        status=MakerCheckerStatus.PENDING
+        status=MakerCheckerStatus.PENDING,
+        doc_type_id=departments.doc_type_id,
     )
     print("DOCUMENT CREATE: ", docs_in)
     document = crud.documents.create(db=db, obj_in=docs_in)
@@ -298,35 +299,26 @@ async def common_ingest_logic(
 
 
 async def ingest(request: Request, file_path: str) -> IngestResponse:
-    """Ingests and processes a file, storing its chunks to be used as context.
-
-    The context obtained from files is later used in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-
-    Most common document
-    formats are supported, but you may be prompted to install an extra dependency to
-    manage a specific file type.
-
-    A file can generate different Documents (for example a PDF generates one Document
-    per page). All Documents IDs are returned in the response, together with the
-    extracted Metadata (which is later used to improve context retrieval). Those IDs
-    can be used to filter the context used to create responses in
-    `/chat/completions`, `/completions`, and `/chunks` APIs.
-    """
+    """Ingests and processes a file, storing its chunks to be used as context."""
     service = request.state.injector.get(IngestService)
 
     try:
        with open(file_path, 'rb') as file:
            file_name = Path(file_path).name
            upload_path = Path(f"{UPLOAD_DIR}/{file_name}")
 
-           with open(upload_path, "wb") as f:
-               f.write(file.file.read())
-           with open(upload_path, "rb") as f:
-               ingested_documents = service.ingest_bin_data(file.filename, f)
+           with upload_path.open('wb') as f:
+               f.write(file.read())
+
+           with upload_path.open('rb') as f:
+               ingested_documents = await service.ingest_bin_data(file_name, f)
 
     except Exception as e:
         return {"message": f"There was an error uploading the file(s)\n {e}"}
 
     finally:
-        file.file.close()
+        upload_path.unlink(missing_ok=True)
 
     return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
ingest service (class IngestService)
@@ -72,7 +72,7 @@ class IngestService:
         logger.debug("Ingesting text data with file_name=%s", file_name)
         return self._ingest_data(file_name, text)
 
-    def ingest_bin_data(
+    async def ingest_bin_data(
         self, file_name: str, raw_file_data: BinaryIO
     ) -> list[IngestedDoc]:
         logger.debug("Ingesting binary data with file_name=%s", file_name)
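Because ingest_bin_data is now a coroutine, every call site has to await it, as the rewritten ingest above does; calling it without await would only produce a coroutine object and ingest nothing. A minimal sketch of the changed call shape (the service argument stands in for the injected IngestService, and the code runs inside the app's event loop):

import io

async def example(service):
    payload = io.BytesIO(b"raw bytes of an uploaded file")
    # Before this commit: docs = service.ingest_bin_data("example.pdf", payload)
    docs = await service.ingest_bin_data("example.pdf", payload)
    return docs  # list[IngestedDoc]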
documents router (APIRouter prefix='/documents')
@@ -15,6 +15,7 @@ from private_gpt.users.constants.role import Role
 from private_gpt.users import crud, models, schemas
 from private_gpt.server.ingest.ingest_router import create_documents, ingest
 from private_gpt.users.models.document import MakerCheckerActionType, MakerCheckerStatus
+from private_gpt.components.ocr_components.table_ocr_api import process_both_ocr, process_ocr
 
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix='/documents', tags=['Documents'])
@@ -262,7 +263,7 @@ async def upload_documents(
     )
 
 
-@router.post('/verify', response_model=schemas.Document)
+@router.post('/verify')
 async def verify_documents(
     request: Request,
     checker_in: schemas.DocumentUpdate = Depends(),
@@ -284,8 +285,7 @@ async def verify_documents(
                 detail="Document not found!",
             )
         unchecked_path = Path(f"{UNCHECKED_DIR}/{document.filename}")
-        print(checker_in.status)
-        print(MakerCheckerStatus.APPROVED.value)
+
         if checker_in.status == MakerCheckerStatus.APPROVED.value:
             checker = schemas.DocumentCheckerUpdate(
                 status=MakerCheckerStatus.APPROVED,
@@ -296,9 +296,9 @@ async def verify_documents(
             crud.documents.update(db=db, db_obj= document, obj_in=checker)
 
             if document.doc_type_id == 2:
-                return await ingest(request, unchecked_path)
+                return await process_ocr(request, unchecked_path)
             elif document.doc_type_id == 3:
-                return await ingest(request, unchecked_path)
+                return await process_both_ocr(request, unchecked_path)
             else:
                 return await ingest(request, unchecked_path)
 
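This is the commit's headline change: when a checker approves a maker's document request, verify_documents now fans out by doc_type_id, where type 2 runs the OCR pipeline only, type 3 ingests both the OCR output and the original PDF, and anything else falls through to plain ingest. The same routing could be written as a dispatch table (an illustrative sketch, not the commit's code; the constant names are made up and the id meanings are inferred from the branches above):

OCR_ONLY = 2      # assumed meaning of the magic ids in the diff
OCR_AND_PDF = 3

async def route_approved(request, doc_type_id, unchecked_path):
    handler = {OCR_ONLY: process_ocr, OCR_AND_PDF: process_both_ocr}.get(doc_type_id)
    if handler is not None:
        return await handler(request, unchecked_path)
    return await ingest(request=request, file_path=unchecked_path)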
document schemas (DocumentMakerChecker)
@@ -45,7 +45,7 @@ class Document(BaseModel):
 class DocumentMakerChecker(DocumentCreate):
     action_type: str
     status: str
-
+    doc_type_id: int
 
 class DocumentMakerCreate(DocumentMakerChecker):
     pass
pyproject.toml
@@ -50,6 +50,8 @@ boto3 = {version ="^1.34.51", optional = true}
 
 # Optional UI
 gradio = {version ="^4.19.2", optional = true}
+aiofiles = "^23.2.1"
+timm = "^0.9.16"
 
 [tool.poetry.extras]
 ui = ["gradio"]
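aiofiles and timm enter as direct dependencies; presumably aiofiles is meant for non-blocking file writes in the async ingest paths and timm supplies vision backbones for the OCR components, though the commit pins them without comment. For reference, the aiofiles equivalent of the blocking file writes in ingest above would look like this (sketch only; the path and payload are placeholders):

import asyncio
import aiofiles

async def save_bytes(path: str, payload: bytes) -> None:
    # aiofiles.open is the async counterpart of the built-in open()
    async with aiofiles.open(path, "wb") as f:
        await f.write(payload)

asyncio.run(save_bytes("/tmp/example.docx", b"demo bytes"))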