mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-07-07 12:28:51 +00:00
Added routes for pdf ocr
This commit is contained in:
parent
91ebce47d4
commit
d849ee76f4
@ -1,42 +0,0 @@
|
||||
"""Create document
|
||||
|
||||
Revision ID: a5839618494c
|
||||
Revises:
|
||||
Create Date: 2024-02-11 12:35:13.347853
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = 'a5839618494c'
|
||||
down_revision: Union[str, None] = None
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the ``document`` table and its primary-key index.

    ``document`` stores one row per uploaded file; ``uploaded_by`` references
    ``users.id`` and ``filename`` is globally unique.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('document',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('filename', sa.String(length=225), nullable=False),
    sa.Column('uploaded_by', sa.Integer(), nullable=False),
    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
    sa.PrimaryKeyConstraint('id'),
    # Unique filename means re-uploading the same name must be rejected upstream.
    sa.UniqueConstraint('filename')
    )
    # Non-unique index on the PK column (Alembic's autogenerate default).
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
    # op.create_unique_constraint('unique_user_role', 'user_roles', ['user_id', 'role_id', 'company_id'])
    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Reverse :func:`upgrade`: drop the ``document`` index and table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # op.drop_constraint('unique_user_role', 'user_roles', type_='unique')
    # Drop the index before the table it belongs to.
    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')
    # ### end Alembic commands ###
|
100
alembic/versions/dcf96cb11a85_create_models.py
Normal file
100
alembic/versions/dcf96cb11a85_create_models.py
Normal file
@ -0,0 +1,100 @@
|
||||
"""Create models
|
||||
|
||||
Revision ID: dcf96cb11a85
|
||||
Revises:
|
||||
Create Date: 2024-02-14 16:30:51.094285
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = 'dcf96cb11a85'
|
||||
down_revision: Union[str, None] = None
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the initial schema: companies, roles, subscriptions, users,
    document, and the user_roles association table.

    Tables are created parents-first so foreign keys resolve:
    companies -> (subscriptions, users) -> (document, user_roles).
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('companies',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('name', sa.String(), nullable=True),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
    # Company names are enforced unique via this index rather than a constraint.
    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)
    op.create_table('roles',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('name', sa.String(length=100), nullable=True),
    sa.Column('description', sa.Text(), nullable=True),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
    # Role names are NOT unique (unique=False) — duplicates are representable.
    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)
    op.create_table('subscriptions',
    sa.Column('sub_id', sa.Integer(), nullable=False),
    sa.Column('company_id', sa.Integer(), nullable=True),
    sa.Column('start_date', sa.DateTime(), nullable=True),
    sa.Column('end_date', sa.DateTime(), nullable=True),
    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
    sa.PrimaryKeyConstraint('sub_id')
    )
    op.create_index(op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False)
    op.create_table('users',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('email', sa.String(length=225), nullable=False),
    sa.Column('hashed_password', sa.String(), nullable=False),
    sa.Column('fullname', sa.String(length=225), nullable=False),
    sa.Column('is_active', sa.Boolean(), nullable=True),
    sa.Column('last_login', sa.DateTime(), nullable=True),
    sa.Column('created_at', sa.DateTime(), nullable=True),
    sa.Column('updated_at', sa.DateTime(), nullable=True),
    sa.Column('company_id', sa.Integer(), nullable=True),
    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
    sa.PrimaryKeyConstraint('id'),
    sa.UniqueConstraint('email'),
    # NOTE(review): 'fullname' is constrained unique TWICE (unnamed below and
    # named 'unique_username_no_spacing') — likely redundant; confirm intent
    # before a follow-up migration removes one of them.
    sa.UniqueConstraint('fullname'),
    sa.UniqueConstraint('fullname', name='unique_username_no_spacing')
    )
    op.create_table('document',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('filename', sa.String(length=225), nullable=False),
    sa.Column('uploaded_by', sa.Integer(), nullable=False),
    sa.Column('uploaded_at', sa.DateTime(), nullable=False),
    sa.ForeignKeyConstraint(['uploaded_by'], ['users.id'], ),
    sa.PrimaryKeyConstraint('id'),
    sa.UniqueConstraint('filename')
    )
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
    # Association table: one row per (user, role, company) assignment; the
    # composite PK already implies uniqueness, the named constraint duplicates it
    # so application code can reference 'unique_user_role' by name.
    op.create_table('user_roles',
    sa.Column('user_id', sa.Integer(), nullable=False),
    sa.Column('role_id', sa.Integer(), nullable=False),
    sa.Column('company_id', sa.Integer(), nullable=False),
    sa.ForeignKeyConstraint(['company_id'], ['companies.id'], ),
    sa.ForeignKeyConstraint(['role_id'], ['roles.id'], ),
    sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
    sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
    sa.UniqueConstraint('user_id', 'role_id', 'company_id', name='unique_user_role')
    )
    # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Reverse :func:`upgrade`: drop all tables children-first so foreign
    keys never dangle (user_roles/document -> users -> subscriptions ->
    roles -> companies), dropping each table's indexes before the table.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('user_roles')
    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')
    op.drop_table('users')
    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
    op.drop_table('subscriptions')
    op.drop_index(op.f('ix_roles_name'), table_name='roles')
    op.drop_index(op.f('ix_roles_id'), table_name='roles')
    op.drop_table('roles')
    op.drop_index(op.f('ix_companies_name'), table_name='companies')
    op.drop_index(op.f('ix_companies_id'), table_name='companies')
    op.drop_table('companies')
    # ### end Alembic commands ###
|
1806
poetry.lock
generated
1806
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,19 +1,19 @@
|
||||
from fastapi import FastAPI, File, UploadFile, Response, APIRouter
|
||||
from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel
|
||||
from docx import Document
|
||||
import os
|
||||
import fitz
|
||||
|
||||
import requests
|
||||
from private_gpt.components.ocr_components.TextExtraction import ImageToTable
|
||||
from private_gpt.components.ocr_components.table_ocr import GetOCRText
|
||||
|
||||
upload_dir = rf"F:\LLM\privateGPT\private_gpt\uploads"
|
||||
from private_gpt.server.ingest.ingest_router import ingest_file
|
||||
upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
|
||||
|
||||
pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
|
||||
|
||||
@pdf_router.post("/pdf_ocr")
|
||||
async def get_pdf_ocr(file: UploadFile = File(...)):
|
||||
async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
|
||||
UPLOAD_DIR = upload_dir
|
||||
try:
|
||||
contents = await file.read()
|
||||
@ -49,7 +49,16 @@ async def get_pdf_ocr(file: UploadFile = File(...)):
|
||||
doc.add_paragraph(table_data)
|
||||
# remove image file
|
||||
|
||||
doc.save(os.path.join(UPLOAD_DIR, "ocr_result.docx"))
|
||||
|
||||
save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
|
||||
doc.save(save_path)
|
||||
|
||||
with open(save_path,'rb') as f:
|
||||
file_content = f.read()
|
||||
starfleet_data = {
|
||||
"filename": f.name,
|
||||
"file_content": file_content,
|
||||
"file_type": "multipart/form-data"
|
||||
}
|
||||
requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
|
||||
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
|
||||
|
||||
|
@ -42,8 +42,10 @@ def create_app(root_injector: Injector) -> FastAPI:
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_credentials=True,
|
||||
allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80",
|
||||
"http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88", "http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173"],
|
||||
allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80", "http://127.0.0.1",
|
||||
"http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88",
|
||||
"http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173", "http://127.0.0.1/", "http://localhost/",
|
||||
"http://localhost:80", "http://192.168.1.131:80/", "http://192.168.1.131"],
|
||||
allow_methods=["DELETE", "GET", "POST", "PUT", "OPTIONS", "PATCH"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
@ -178,6 +178,60 @@ def ingest_file(
|
||||
)) -> IngestResponse:
|
||||
"""Ingests and processes a file, storing its chunks to be used as context."""
|
||||
service = request.state.injector.get(IngestService)
|
||||
print("-------------------------------------->",file)
|
||||
try:
|
||||
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
||||
if file_ingested:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_409_CONFLICT,
|
||||
detail="File already exists. Choose a different file.",
|
||||
)
|
||||
|
||||
if file.filename is None:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail="No file name provided",
|
||||
)
|
||||
|
||||
try:
|
||||
docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
|
||||
crud.documents.create(db=db, obj_in=docs_in)
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Unable to upload file.",
|
||||
)
|
||||
upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
|
||||
|
||||
with open(upload_path, "wb") as f:
|
||||
f.write(file.file.read())
|
||||
|
||||
with open(upload_path, "rb") as f:
|
||||
ingested_documents = service.ingest_bin_data(file.filename, f)
|
||||
logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
|
||||
|
||||
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
||||
except HTTPException:
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"There was an error uploading the file(s): {str(e)}")
|
||||
print("ERROR: ", e)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail="Internal Server Error: Unable to ingest file.",
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
def ingest_pdf_file(
|
||||
request: Request,
|
||||
db: Session = Depends(deps.get_db),
|
||||
file: UploadFile = File(...),
|
||||
) -> IngestResponse:
|
||||
"""Ingests and processes a file, storing its chunks to be used as context."""
|
||||
service = request.state.injector.get(IngestService)
|
||||
|
||||
try:
|
||||
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
||||
|
@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
|
||||
from private_gpt.users.utils import send_registration_email, Ldap
|
||||
|
||||
LDAP_SERVER = settings.LDAP_SERVER
|
||||
LDAP_ENABLE = True
|
||||
LDAP_ENABLE = False
|
||||
|
||||
router = APIRouter(prefix="/auth", tags=["auth"])
|
||||
|
||||
@ -104,17 +104,17 @@ def login_access_token(
|
||||
"""
|
||||
OAuth2 compatible token login, get an access token for future requests
|
||||
"""
|
||||
if LDAP_ENABLE:
|
||||
existing_user = crud.user.get_by_email(db, email=form_data.username)
|
||||
# if LDAP_ENABLE:
|
||||
# existing_user = crud.user.get_by_email(db, email=form_data.username)
|
||||
|
||||
if existing_user:
|
||||
if existing_user.user_role.role.name == "SUPER_ADMIN":
|
||||
pass
|
||||
else:
|
||||
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
else:
|
||||
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password)
|
||||
# if existing_user:
|
||||
# if existing_user.user_role.role.name == "SUPER_ADMIN":
|
||||
# pass
|
||||
# else:
|
||||
# ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
# else:
|
||||
# ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||
# ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password)
|
||||
|
||||
user = crud.user.authenticate(
|
||||
db, email=form_data.username, password=form_data.password
|
||||
@ -212,7 +212,7 @@ def register(
|
||||
detail="The user with this email already exists!",
|
||||
)
|
||||
random_password = security.generate_random_password()
|
||||
|
||||
# random_password = password
|
||||
try:
|
||||
if company_id:
|
||||
# Registering user with a specific company
|
||||
|
@ -24,7 +24,10 @@ python-jose = "^3.3.0"
|
||||
psycopg2-binary = "^2.9.9"
|
||||
passlib = "^1.7.4"
|
||||
docx2txt = "^0.8"
|
||||
torch = "2.1.2"
|
||||
gradio = "^4.18.0"
|
||||
ldap3 = "^2.9.1"
|
||||
uvicorn = "^0.27.1"
|
||||
python-doctr = {extras = ["torch"], version = "^0.7.0"}
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^22"
|
||||
|
Loading…
Reference in New Issue
Block a user