mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-07-07 20:39:13 +00:00
Added routes for pdf ocr
This commit is contained in:
parent
91ebce47d4
commit
d849ee76f4
@ -1,42 +0,0 @@
|
|||||||
"""Create document
|
|
||||||
|
|
||||||
Revision ID: a5839618494c
|
|
||||||
Revises:
|
|
||||||
Create Date: 2024-02-11 12:35:13.347853
|
|
||||||
|
|
||||||
"""
|
|
||||||
from typing import Sequence, Union
|
|
||||||
|
|
||||||
from alembic import op
|
|
||||||
import sqlalchemy as sa
|
|
||||||
|
|
||||||
|
|
||||||
# Revision identifiers, used by Alembic.
revision: str = 'a5839618494c'
# No parent revision is declared for this migration.
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
|
||||||
|
|
||||||
|
|
||||||
def upgrade() -> None:
    """Create the ``document`` table and its ``ix_document_id`` index.

    The table has an integer primary key ``id``, a unique ``filename``,
    an ``uploaded_by`` foreign key to ``users.id`` and an ``uploaded_at``
    timestamp; every column is NOT NULL.
    """
    # NOTE(review): this revision declares no parent, yet the foreign key
    # below targets ``users.id`` -- presumably the ``users`` table is
    # expected to exist already; confirm before running on an empty database.
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'document',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('filename', sa.String(length=225), nullable=False),
        sa.Column('uploaded_by', sa.Integer(), nullable=False),
        sa.Column('uploaded_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['uploaded_by'], ['users.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('filename'),
    )
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)
    # A 'unique_user_role' unique constraint on user_roles
    # (user_id, role_id, company_id) was left disabled in this revision.
    # ### end Alembic commands ###
|
|
||||||
|
|
||||||
|
|
||||||
def downgrade() -> None:
    """Revert ``upgrade``: drop ``ix_document_id``, then the ``document`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Dropping the 'unique_user_role' constraint on user_roles stays disabled,
    # mirroring the disabled creation in upgrade().
    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')
    # ### end Alembic commands ###
|
|
100
alembic/versions/dcf96cb11a85_create_models.py
Normal file
100
alembic/versions/dcf96cb11a85_create_models.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
"""Create models
|
||||||
|
|
||||||
|
Revision ID: dcf96cb11a85
|
||||||
|
Revises:
|
||||||
|
Create Date: 2024-02-14 16:30:51.094285
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
|
||||||
|
# Revision identifiers, used by Alembic.
revision: str = 'dcf96cb11a85'
# No parent revision is declared for this migration.
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the initial schema.

    Tables are created so that every foreign-key target exists before the
    table that references it: ``companies``, ``roles``, ``subscriptions``,
    ``users``, ``document`` and finally ``user_roles``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'companies',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('name', sa.String(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index(op.f('ix_companies_id'), 'companies', ['id'], unique=False)
    op.create_index(op.f('ix_companies_name'), 'companies', ['name'], unique=True)

    op.create_table(
        'roles',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('name', sa.String(length=100), nullable=True),
        sa.Column('description', sa.Text(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
    )
    op.create_index(op.f('ix_roles_id'), 'roles', ['id'], unique=False)
    op.create_index(op.f('ix_roles_name'), 'roles', ['name'], unique=False)

    op.create_table(
        'subscriptions',
        sa.Column('sub_id', sa.Integer(), nullable=False),
        sa.Column('company_id', sa.Integer(), nullable=True),
        sa.Column('start_date', sa.DateTime(), nullable=True),
        sa.Column('end_date', sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(['company_id'], ['companies.id']),
        sa.PrimaryKeyConstraint('sub_id'),
    )
    op.create_index(op.f('ix_subscriptions_sub_id'), 'subscriptions', ['sub_id'], unique=False)

    op.create_table(
        'users',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('email', sa.String(length=225), nullable=False),
        sa.Column('hashed_password', sa.String(), nullable=False),
        sa.Column('fullname', sa.String(length=225), nullable=False),
        sa.Column('is_active', sa.Boolean(), nullable=True),
        sa.Column('last_login', sa.DateTime(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=True),
        sa.Column('updated_at', sa.DateTime(), nullable=True),
        sa.Column('company_id', sa.Integer(), nullable=True),
        sa.ForeignKeyConstraint(['company_id'], ['companies.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('email'),
        # NOTE(review): 'fullname' gets two unique constraints on the same
        # column (one unnamed, one named). Both are kept as generated;
        # confirm against the model before removing either.
        sa.UniqueConstraint('fullname'),
        sa.UniqueConstraint('fullname', name='unique_username_no_spacing'),
    )

    op.create_table(
        'document',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('filename', sa.String(length=225), nullable=False),
        sa.Column('uploaded_by', sa.Integer(), nullable=False),
        sa.Column('uploaded_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['uploaded_by'], ['users.id']),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('filename'),
    )
    op.create_index(op.f('ix_document_id'), 'document', ['id'], unique=False)

    op.create_table(
        'user_roles',
        sa.Column('user_id', sa.Integer(), nullable=False),
        sa.Column('role_id', sa.Integer(), nullable=False),
        sa.Column('company_id', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(['company_id'], ['companies.id']),
        sa.ForeignKeyConstraint(['role_id'], ['roles.id']),
        sa.ForeignKeyConstraint(['user_id'], ['users.id']),
        sa.PrimaryKeyConstraint('user_id', 'role_id', 'company_id'),
        sa.UniqueConstraint('user_id', 'role_id', 'company_id', name='unique_user_role'),
    )
    # ### end Alembic commands ###
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop every table and index created by ``upgrade``.

    Objects are removed in the reverse of their creation order, so each
    referencing table is dropped before the table it points to.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('user_roles')

    op.drop_index(op.f('ix_document_id'), table_name='document')
    op.drop_table('document')

    op.drop_table('users')

    op.drop_index(op.f('ix_subscriptions_sub_id'), table_name='subscriptions')
    op.drop_table('subscriptions')

    op.drop_index(op.f('ix_roles_name'), table_name='roles')
    op.drop_index(op.f('ix_roles_id'), table_name='roles')
    op.drop_table('roles')

    op.drop_index(op.f('ix_companies_name'), table_name='companies')
    op.drop_index(op.f('ix_companies_id'), table_name='companies')
    op.drop_table('companies')
    # ### end Alembic commands ###
|
1806
poetry.lock
generated
1806
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,19 +1,19 @@
|
|||||||
from fastapi import FastAPI, File, UploadFile, Response, APIRouter
|
from fastapi import FastAPI, File, UploadFile, Response, APIRouter, Request
|
||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from docx import Document
|
from docx import Document
|
||||||
import os
|
import os
|
||||||
import fitz
|
import fitz
|
||||||
|
import requests
|
||||||
from private_gpt.components.ocr_components.TextExtraction import ImageToTable
|
from private_gpt.components.ocr_components.TextExtraction import ImageToTable
|
||||||
from private_gpt.components.ocr_components.table_ocr import GetOCRText
|
from private_gpt.components.ocr_components.table_ocr import GetOCRText
|
||||||
|
from private_gpt.server.ingest.ingest_router import ingest_file
|
||||||
upload_dir = rf"F:\LLM\privateGPT\private_gpt\uploads"
|
upload_dir = rf"C:\Users\ASUS\Desktop\QuickGPT\backend\privateGPT\private_gpt\uploads"
|
||||||
|
|
||||||
pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
|
pdf_router = APIRouter(prefix="/pdf", tags=["auth"])
|
||||||
|
|
||||||
@pdf_router.post("/pdf_ocr")
|
@pdf_router.post("/pdf_ocr")
|
||||||
async def get_pdf_ocr(file: UploadFile = File(...)):
|
async def get_pdf_ocr(request: Request, file: UploadFile = File(...)):
|
||||||
UPLOAD_DIR = upload_dir
|
UPLOAD_DIR = upload_dir
|
||||||
try:
|
try:
|
||||||
contents = await file.read()
|
contents = await file.read()
|
||||||
@ -49,7 +49,16 @@ async def get_pdf_ocr(file: UploadFile = File(...)):
|
|||||||
doc.add_paragraph(table_data)
|
doc.add_paragraph(table_data)
|
||||||
# remove image file
|
# remove image file
|
||||||
|
|
||||||
doc.save(os.path.join(UPLOAD_DIR, "ocr_result.docx"))
|
save_path = os.path.join(UPLOAD_DIR, "ocr_result.docx")
|
||||||
|
doc.save(save_path)
|
||||||
|
|
||||||
|
with open(save_path,'rb') as f:
|
||||||
|
file_content = f.read()
|
||||||
|
starfleet_data = {
|
||||||
|
"filename": f.name,
|
||||||
|
"file_content": file_content,
|
||||||
|
"file_type": "multipart/form-data"
|
||||||
|
}
|
||||||
|
requests.post('http://127.0.0.1:88/pdf/pdf_ocr', json=starfleet_data,headers={"Content-Type":"multipart/form-data"})
|
||||||
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
|
return FileResponse(path=os.path.join(UPLOAD_DIR, "ocr_result.docx"), filename="ocr_result.docx", media_type="application/pdf")
|
||||||
|
|
||||||
|
@ -42,8 +42,10 @@ def create_app(root_injector: Injector) -> FastAPI:
|
|||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
allow_credentials=True,
|
allow_credentials=True,
|
||||||
allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80",
|
allow_origins=["http://localhost:80/", "http://10.1.101.125:80", "http://quickgpt.gibl.com.np:80", "http://127.0.0.1",
|
||||||
"http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88", "http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173"],
|
"http://10.1.101.125", "http://quickgpt.gibl.com.np", "http://localhost:8001", "http://192.168.1.93", "http://192.168.1.93:88",
|
||||||
|
"http://192.168.1.98", "http://192.168.1.98:5173", "http://localhost:5173", "http://127.0.0.1/", "http://localhost/",
|
||||||
|
"http://localhost:80", "http://192.168.1.131:80/", "http://192.168.1.131"],
|
||||||
allow_methods=["DELETE", "GET", "POST", "PUT", "OPTIONS", "PATCH"],
|
allow_methods=["DELETE", "GET", "POST", "PUT", "OPTIONS", "PATCH"],
|
||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
@ -178,6 +178,60 @@ def ingest_file(
|
|||||||
)) -> IngestResponse:
|
)) -> IngestResponse:
|
||||||
"""Ingests and processes a file, storing its chunks to be used as context."""
|
"""Ingests and processes a file, storing its chunks to be used as context."""
|
||||||
service = request.state.injector.get(IngestService)
|
service = request.state.injector.get(IngestService)
|
||||||
|
print("-------------------------------------->",file)
|
||||||
|
try:
|
||||||
|
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
||||||
|
if file_ingested:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_409_CONFLICT,
|
||||||
|
detail="File already exists. Choose a different file.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if file.filename is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_400_BAD_REQUEST,
|
||||||
|
detail="No file name provided",
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
docs_in = schemas.DocumentCreate(filename=file.filename, uploaded_by=current_user.id)
|
||||||
|
crud.documents.create(db=db, obj_in=docs_in)
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
|
detail="Unable to upload file.",
|
||||||
|
)
|
||||||
|
upload_path = Path(f"{UPLOAD_DIR}/{file.filename}")
|
||||||
|
|
||||||
|
with open(upload_path, "wb") as f:
|
||||||
|
f.write(file.file.read())
|
||||||
|
|
||||||
|
with open(upload_path, "rb") as f:
|
||||||
|
ingested_documents = service.ingest_bin_data(file.filename, f)
|
||||||
|
logger.info(f"{file.filename} is uploaded by the {current_user.fullname}.")
|
||||||
|
|
||||||
|
return IngestResponse(object="list", model="private-gpt", data=ingested_documents)
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"There was an error uploading the file(s): {str(e)}")
|
||||||
|
print("ERROR: ", e)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
|
detail="Internal Server Error: Unable to ingest file.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_pdf_file(
|
||||||
|
request: Request,
|
||||||
|
db: Session = Depends(deps.get_db),
|
||||||
|
file: UploadFile = File(...),
|
||||||
|
) -> IngestResponse:
|
||||||
|
"""Ingests and processes a file, storing its chunks to be used as context."""
|
||||||
|
service = request.state.injector.get(IngestService)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
file_ingested = crud.documents.get_by_filename(db, file_name=file.filename)
|
||||||
|
@ -14,7 +14,7 @@ from private_gpt.users import crud, models, schemas
|
|||||||
from private_gpt.users.utils import send_registration_email, Ldap
|
from private_gpt.users.utils import send_registration_email, Ldap
|
||||||
|
|
||||||
LDAP_SERVER = settings.LDAP_SERVER
|
LDAP_SERVER = settings.LDAP_SERVER
|
||||||
LDAP_ENABLE = True
|
LDAP_ENABLE = False
|
||||||
|
|
||||||
router = APIRouter(prefix="/auth", tags=["auth"])
|
router = APIRouter(prefix="/auth", tags=["auth"])
|
||||||
|
|
||||||
@ -104,17 +104,17 @@ def login_access_token(
|
|||||||
"""
|
"""
|
||||||
OAuth2 compatible token login, get an access token for future requests
|
OAuth2 compatible token login, get an access token for future requests
|
||||||
"""
|
"""
|
||||||
if LDAP_ENABLE:
|
# if LDAP_ENABLE:
|
||||||
existing_user = crud.user.get_by_email(db, email=form_data.username)
|
# existing_user = crud.user.get_by_email(db, email=form_data.username)
|
||||||
|
|
||||||
if existing_user:
|
# if existing_user:
|
||||||
if existing_user.user_role.role.name == "SUPER_ADMIN":
|
# if existing_user.user_role.role.name == "SUPER_ADMIN":
|
||||||
pass
|
# pass
|
||||||
else:
|
# else:
|
||||||
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
# ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||||
else:
|
# else:
|
||||||
ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
# ldap = ldap_login(db=db, username=form_data.username, password=form_data.password)
|
||||||
ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password)
|
# ad_user_register(db=db, email=form_data.username,fullname=ldap, password=form_data.password)
|
||||||
|
|
||||||
user = crud.user.authenticate(
|
user = crud.user.authenticate(
|
||||||
db, email=form_data.username, password=form_data.password
|
db, email=form_data.username, password=form_data.password
|
||||||
@ -212,7 +212,7 @@ def register(
|
|||||||
detail="The user with this email already exists!",
|
detail="The user with this email already exists!",
|
||||||
)
|
)
|
||||||
random_password = security.generate_random_password()
|
random_password = security.generate_random_password()
|
||||||
|
# random_password = password
|
||||||
try:
|
try:
|
||||||
if company_id:
|
if company_id:
|
||||||
# Registering user with a specific company
|
# Registering user with a specific company
|
||||||
|
@ -24,7 +24,10 @@ python-jose = "^3.3.0"
|
|||||||
psycopg2-binary = "^2.9.9"
|
psycopg2-binary = "^2.9.9"
|
||||||
passlib = "^1.7.4"
|
passlib = "^1.7.4"
|
||||||
docx2txt = "^0.8"
|
docx2txt = "^0.8"
|
||||||
torch = "2.1.2"
|
gradio = "^4.18.0"
|
||||||
|
ldap3 = "^2.9.1"
|
||||||
|
uvicorn = "^0.27.1"
|
||||||
|
python-doctr = {extras = ["torch"], version = "^0.7.0"}
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
black = "^22"
|
black = "^22"
|
||||||
|
Loading…
Reference in New Issue
Block a user