mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-09-03 16:25:55 +00:00
[dev] add support for all image pages inside pdf
This commit is contained in:
@@ -2,15 +2,246 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "initial_id",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": true,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-05T14:03:18.404518Z",
|
||||
"start_time": "2024-09-05T14:03:18.401535Z"
|
||||
}
|
||||
},
|
||||
"source": "# Read PDF",
|
||||
"outputs": [],
|
||||
"execution_count": 33
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-05T14:39:16.282184Z",
|
||||
"start_time": "2024-09-05T14:39:16.279186Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
"import pdf2image\n",
|
||||
"import pytesseract\n",
|
||||
"from pytesseract import Output, TesseractError"
|
||||
],
|
||||
"id": "ad1ffea2dcb7dcaf",
|
||||
"outputs": [],
|
||||
"execution_count": 74
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-05T14:39:16.648563Z",
|
||||
"start_time": "2024-09-05T14:39:16.646494Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "file = \"../local_data/input_raw/test/26223.pdf\"",
|
||||
"id": "f38556f4ef09d669",
|
||||
"outputs": [],
|
||||
"execution_count": 75
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-05T14:39:23.492339Z",
|
||||
"start_time": "2024-09-05T14:39:18.280277Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "images = pdf2image.convert_from_path(file)",
|
||||
"id": "67286a6f741debb0",
|
||||
"outputs": [],
|
||||
"execution_count": 76
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-05T14:39:38.005863Z",
|
||||
"start_time": "2024-09-05T14:39:36.119195Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"pil_im = images[5] # OCR a single sample page: index 5 is the sixth page of the PDF, not the first\n",
|
||||
"\n",
|
||||
"ocr_dict = pytesseract.image_to_string(pil_im, lang=\"rus\")"
|
||||
],
|
||||
"id": "ec339f7da13fc37f",
|
||||
"outputs": [],
|
||||
"execution_count": 79
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-09-05T14:39:49.348914Z",
|
||||
"start_time": "2024-09-05T14:39:49.344588Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "print(ocr_dict)",
|
||||
"id": "df86f1ed6f5f1b6e",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"096 отдельный батальон материального обеспечения\n",
|
||||
"\n",
|
||||
"1099 мотострелковый полк\n",
|
||||
"\n",
|
||||
"| береговая ракетно-артиллерийская бригада\n",
|
||||
"\n",
|
||||
"военная автомобильная инспекция\n",
|
||||
"\n",
|
||||
"гвардейская отдельная десантно-штурмовая бригада\n",
|
||||
"гвардейская отдельная инженерная бригада\n",
|
||||
"отдельная вертолетная эскадрилья\n",
|
||||
"отдельная танковая бригада\n",
|
||||
"отдельный медицинский батальон\n",
|
||||
"отдельный танковый полк\n",
|
||||
"\n",
|
||||
"полк радиационной, химической и биологической защиты\n",
|
||||
"\n",
|
||||
"смешанный авиационный полк\n",
|
||||
"средняя общеобразовательная школа\n",
|
||||
"\n",
|
||||
"центральный узел контроля безопасности связи\n",
|
||||
"\n",
|
||||
"11 экипаж большой подводной лодки\n",
|
||||
"\n",
|
||||
"110 военная автомобильная инспекция\n",
|
||||
"\n",
|
||||
"110 военное представительство Министерства обороны Российской Федерации\n",
|
||||
"\n",
|
||||
"110 отдельная мотострелковая бригада\n",
|
||||
"110 отдельный стрелковый полк\n",
|
||||
"\n",
|
||||
"1101 отдел государственного технического надзора\n",
|
||||
"\n",
|
||||
"1102 мотострелковый полк\n",
|
||||
"1104 мотострелковый полк\n",
|
||||
"\n",
|
||||
"144\n",
|
||||
"\n",
|
||||
"1105 мотострелковый полк\n",
|
||||
"\n",
|
||||
"109 отдельный оптико-электронный узел\n",
|
||||
"\n",
|
||||
"11 военная автомобильная инспекция\n",
|
||||
"\n",
|
||||
"П главный государственный центр судеоно-медицинских и криминалистических экспертиз Министерства\n",
|
||||
"\n",
|
||||
"|| отдельный стрелковый полк\n",
|
||||
"\n",
|
||||
"11 центральная база резерва танков\n",
|
||||
"\n",
|
||||
"110 объединенное управление эксплуатации специальных объектов\n",
|
||||
"\n",
|
||||
"117 зенитный ракетный полк\n",
|
||||
"\n",
|
||||
"152|1118 военное представительство Министерства обороны Российской Федерации\n",
|
||||
"153|1118 отдельный радиолокационный узел\n",
|
||||
"\n",
|
||||
"154|112 авиационный полигон\n",
|
||||
"\n",
|
||||
"155112 гвардейская ракетная бригада\n",
|
||||
"\n",
|
||||
"156| 112 отдельный вертолетный полк\n",
|
||||
"\n",
|
||||
"157|112 отдельный стрелковый полк\n",
|
||||
"\n",
|
||||
"158|1122 отдельный батальон материального обеспечения\n",
|
||||
"159|1124 отдельный батальон материального обеспечения\n",
|
||||
"160|1127 ремонтный завод ракетно-артиллерийского вооружения\n",
|
||||
"161|113 военная автомобильная инспекция\n",
|
||||
"\n",
|
||||
"162|1139 отдельный батальон материального обеспечения\n",
|
||||
"163|1139 отдельный измерительный пункт\n",
|
||||
"\n",
|
||||
"164| 114 бригада\n",
|
||||
"\n",
|
||||
"165| 114 военная автомобильная инспекция\n",
|
||||
"\n",
|
||||
"166|114 гвардейская отдельная мотострелковая бригада\n",
|
||||
"\n",
|
||||
"114 гвардейский мотострелковый полк\n",
|
||||
"\n",
|
||||
"168\n",
|
||||
"\n",
|
||||
"114 отделение территориальное\n",
|
||||
"\n",
|
||||
"169\n",
|
||||
"\n",
|
||||
"40 гвардейский артиллерийский полк\n",
|
||||
"\n",
|
||||
"170\n",
|
||||
"\n",
|
||||
"41 гвардейский артиллерийский полк\n",
|
||||
"\n",
|
||||
"171\n",
|
||||
"\n",
|
||||
"1142 военное представительство Министерства обороны Российской Федерации\n",
|
||||
"\n",
|
||||
"172\n",
|
||||
"\n",
|
||||
"1143 отдельный зенитный ракетный дивизион\n",
|
||||
"\n",
|
||||
"173\n",
|
||||
"\n",
|
||||
"115 военная автомобильная инспекция\n",
|
||||
"\n",
|
||||
"174\n",
|
||||
"\n",
|
||||
"5 государственный специальный химический арсенал\n",
|
||||
"\n",
|
||||
"175\n",
|
||||
"\n",
|
||||
"150 радиоэлектронный центр\n",
|
||||
"\n",
|
||||
"176\n",
|
||||
"\n",
|
||||
"177\n",
|
||||
"178\n",
|
||||
"\n",
|
||||
"1152 мотострелковый полк\n",
|
||||
"1153 мотострелковый полк\n",
|
||||
"54 мотострелковый полк\n",
|
||||
"\n",
|
||||
"179\n",
|
||||
"\n",
|
||||
"1155 центр\n",
|
||||
"\n",
|
||||
"180\n",
|
||||
"\n",
|
||||
"157 пожарная команда\n",
|
||||
"\n",
|
||||
"ТТТ.\n",
|
||||
"\n",
|
||||
"181\n",
|
||||
"\n",
|
||||
"1158 пожарная команда.\n",
|
||||
"\n",
|
||||
"182\n",
|
||||
"\n",
|
||||
"1159 военное представительство Министерства обороны Российской Федерации\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 81
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"source": "",
|
||||
"id": "7e909e7dc99c7f4a"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
1510
poetry.lock
generated
1510
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -136,7 +136,10 @@ class SimpleIngestComponent(BaseIngestComponentWithIndex):
|
||||
return saved_documents
|
||||
|
||||
def _save_docs(self, documents: list[Document]) -> list[Document]:
|
||||
logger.debug("Transforming count=%s documents into nodes", len(documents))
|
||||
logger.debug(
|
||||
"Transforming count=%s documents into nodes",
|
||||
len(documents) if isinstance(documents, list) else 1,
|
||||
)
|
||||
with self._index_thread_lock:
|
||||
for document in documents:
|
||||
self._index.insert(document, show_progress=True)
|
||||
|
@@ -1,6 +1,8 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import tqdm
|
||||
import tempfile
|
||||
from llama_index.core.readers import StringIterableReader
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
from llama_index.core.readers.json import JSONReader
|
||||
@@ -44,8 +46,8 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
|
||||
|
||||
default_file_reader_cls: dict[str, type[BaseReader]] = {
|
||||
".hwp": HWPReader,
|
||||
# ".pdf": simple_pdf_extractor if simple_pdf_extractor else PDFReader,
|
||||
".pdf": PDFReader,
|
||||
# ".pdf": PDFReader,
|
||||
".pdf": SmartPDFLoader if LLMSHERPA_API_URL else PDFReader,
|
||||
".docx": SmartPDFLoader if LLMSHERPA_API_URL else DocxReader,
|
||||
".pptx": PptxReader,
|
||||
".ppt": PptxReader,
|
||||
@@ -70,7 +72,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
|
||||
FILE_READER_CLS = _try_loading_included_file_formats()
|
||||
FILE_READER_CLS.update(
|
||||
{
|
||||
".json": JSONReader(),
|
||||
".json": JSONReader,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -88,7 +90,7 @@ class IngestionHelper:
|
||||
) -> list[Document]:
|
||||
documents = IngestionHelper._load_file_to_documents(file_name, file_data)
|
||||
for document in documents:
|
||||
document.metadata["file_name"] = file_name
|
||||
document.metadata["file_name"] = file_data.as_posix()
|
||||
IngestionHelper._exclude_metadata(documents)
|
||||
return documents
|
||||
|
||||
@@ -108,16 +110,44 @@ class IngestionHelper:
|
||||
return string_reader.load_data([file_data.read_text()])
|
||||
except Exception as e:
|
||||
logger.error(f"Error reading file as plain text: {e}")
|
||||
raise ValueError(
|
||||
f"No reader found for extension={extension}, file_name={file_name}"
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
f"Specific reader found for extension=%s, {reader_cls=}", extension
|
||||
)
|
||||
if reader_cls.__name__ == "SmartPDFLoader":
|
||||
return reader_cls(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(
|
||||
documents = reader_cls(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(
|
||||
file_data.as_posix()
|
||||
)
|
||||
else:
|
||||
return reader_cls().load_data(file_data)
|
||||
documents = reader_cls().load_data(file_data)
|
||||
|
||||
if len(documents) == 0 and extension == ".pdf":
|
||||
logger.debug(
|
||||
"No text extracted from PDF, trying to extract images from PDF"
|
||||
)
|
||||
try:
|
||||
import pdf2image
|
||||
import pytesseract
|
||||
|
||||
images = pdf2image.convert_from_path(file_data)
|
||||
documents = []
|
||||
|
||||
for i, image in tqdm.tqdm(enumerate(images)):
|
||||
text = pytesseract.image_to_string(image, lang="rus")
|
||||
doc = StringIterableReader().load_data(
|
||||
[text],
|
||||
)[0]
|
||||
doc.metadata["page_label"] = i
|
||||
|
||||
documents.extend(doc)
|
||||
except Exception as e:
|
||||
logger.error(f"Error extracting images from PDF: {e}")
|
||||
raise ValueError(f"No text extracted from PDF={file_name}")
|
||||
|
||||
return documents
|
||||
|
||||
@staticmethod
|
||||
def _exclude_metadata(documents: list[Document]) -> None:
|
||||
@@ -127,4 +157,10 @@ class IngestionHelper:
|
||||
# We don't want the Embeddings search to receive this metadata
|
||||
document.excluded_embed_metadata_keys = ["doc_id"]
|
||||
# We don't want the LLM to receive these metadata in the context
|
||||
document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
|
||||
# TODO: "file_name" is currently left out of the exclusion list below, so the LLM receives it; revisit.
|
||||
document.excluded_llm_metadata_keys = [
|
||||
# "file_name",
|
||||
"doc_id",
|
||||
"page_label",
|
||||
]
|
||||
document.excluded_llm_metadata_keys = ["doc_id", "page_label"]
|
||||
|
@@ -70,6 +70,10 @@ einops = {version = "^0.8.0", optional = true}
|
||||
llama-index-readers-smart-pdf-loader = "^0.1.4"
|
||||
python-pptx = "^1.0.2"
|
||||
pillow = "^10.4.0"
|
||||
jupyter = "^1.1.1"
|
||||
pypdf2 = "^3.0.1"
|
||||
pytesseract = "^0.3.13"
|
||||
pdf2image = "^1.17.0"
|
||||
|
||||
[tool.poetry.extras]
|
||||
ui = ["gradio", "ffmpy"]
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
Reference in New Issue
Block a user