mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-09-06 09:41:31 +00:00
[dev] add support for all image pages inside pdf
This commit is contained in:
@@ -2,15 +2,246 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
|
||||||
"id": "initial_id",
|
"id": "initial_id",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": true,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2024-09-05T14:03:18.404518Z",
|
||||||
|
"start_time": "2024-09-05T14:03:18.401535Z"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|
"source": "# Read PDF",
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
"execution_count": 33
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2024-09-05T14:39:16.282184Z",
|
||||||
|
"start_time": "2024-09-05T14:39:16.279186Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
"source": [
|
"source": [
|
||||||
""
|
"import pdf2image\n",
|
||||||
]
|
"import pytesseract\n",
|
||||||
|
"from pytesseract import Output, TesseractError"
|
||||||
|
],
|
||||||
|
"id": "ad1ffea2dcb7dcaf",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 74
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2024-09-05T14:39:16.648563Z",
|
||||||
|
"start_time": "2024-09-05T14:39:16.646494Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": "file = \"../local_data/input_raw/test/26223.pdf\"",
|
||||||
|
"id": "f38556f4ef09d669",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2024-09-05T14:39:23.492339Z",
|
||||||
|
"start_time": "2024-09-05T14:39:18.280277Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": "images = pdf2image.convert_from_path(file)",
|
||||||
|
"id": "67286a6f741debb0",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 76
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2024-09-05T14:39:38.005863Z",
|
||||||
|
"start_time": "2024-09-05T14:39:36.119195Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"pil_im = images[5] # assuming that we're interested in the first page only\n",
|
||||||
|
"\n",
|
||||||
|
"ocr_dict = pytesseract.image_to_string(pil_im, lang=\"rus\")"
|
||||||
|
],
|
||||||
|
"id": "ec339f7da13fc37f",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 79
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2024-09-05T14:39:49.348914Z",
|
||||||
|
"start_time": "2024-09-05T14:39:49.344588Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": "print(ocr_dict)",
|
||||||
|
"id": "df86f1ed6f5f1b6e",
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"096 отдельный батальон материального обеспечения\n",
|
||||||
|
"\n",
|
||||||
|
"1099 мотострелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"| береговая ракетно-артиллерийская бригада\n",
|
||||||
|
"\n",
|
||||||
|
"военная автомобильная инспекция\n",
|
||||||
|
"\n",
|
||||||
|
"гвардейская отдельная десантно-штурмовая бригада\n",
|
||||||
|
"гвардейская отдельная инженерная бригада\n",
|
||||||
|
"отдельная вертолетная эскадрилья\n",
|
||||||
|
"отдельная танковая бригада\n",
|
||||||
|
"отдельный медицинский батальон\n",
|
||||||
|
"отдельный танковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"полк радиационной, химической и биологической защиты\n",
|
||||||
|
"\n",
|
||||||
|
"смешанный авиационный полк\n",
|
||||||
|
"средняя общеобразовательная школа\n",
|
||||||
|
"\n",
|
||||||
|
"центральный узел контроля безопасности связи\n",
|
||||||
|
"\n",
|
||||||
|
"11 экипаж большой подводной лодки\n",
|
||||||
|
"\n",
|
||||||
|
"110 военная автомобильная инспекция\n",
|
||||||
|
"\n",
|
||||||
|
"110 военное представительство Министерства обороны Российской Федерации\n",
|
||||||
|
"\n",
|
||||||
|
"110 отдельная мотострелковая бригада\n",
|
||||||
|
"110 отдельный стрелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"1101 отдел государственного технического надзора\n",
|
||||||
|
"\n",
|
||||||
|
"1102 мотострелковый полк\n",
|
||||||
|
"1104 мотострелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"144\n",
|
||||||
|
"\n",
|
||||||
|
"1105 мотострелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"109 отдельный оптико-электронный узел\n",
|
||||||
|
"\n",
|
||||||
|
"11 военная автомобильная инспекция\n",
|
||||||
|
"\n",
|
||||||
|
"П главный государственный центр судеоно-медицинских и криминалистических экспертиз Министерства\n",
|
||||||
|
"\n",
|
||||||
|
"|| отдельный стрелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"11 центральная база резерва танков\n",
|
||||||
|
"\n",
|
||||||
|
"110 объединенное управление эксплуатации специальных объектов\n",
|
||||||
|
"\n",
|
||||||
|
"117 зенитный ракетный полк\n",
|
||||||
|
"\n",
|
||||||
|
"152|1118 военное представительство Министерства обороны Российской Федерации\n",
|
||||||
|
"153|1118 отдельный радиолокационный узел\n",
|
||||||
|
"\n",
|
||||||
|
"154|112 авиационный полигон\n",
|
||||||
|
"\n",
|
||||||
|
"155112 гвардейская ракетная бригада\n",
|
||||||
|
"\n",
|
||||||
|
"156| 112 отдельный вертолетный полк\n",
|
||||||
|
"\n",
|
||||||
|
"157|112 отдельный стрелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"158|1122 отдельный батальон материального обеспечения\n",
|
||||||
|
"159|1124 отдельный батальон материального обеспечения\n",
|
||||||
|
"160|1127 ремонтный завод ракетно-артиллерийского вооружения\n",
|
||||||
|
"161|113 военная автомобильная инспекция\n",
|
||||||
|
"\n",
|
||||||
|
"162|1139 отдельный батальон материального обеспечения\n",
|
||||||
|
"163|1139 отдельный измерительный пункт\n",
|
||||||
|
"\n",
|
||||||
|
"164| 114 бригада\n",
|
||||||
|
"\n",
|
||||||
|
"165| 114 военная автомобильная инспекция\n",
|
||||||
|
"\n",
|
||||||
|
"166|114 гвардейская отдельная мотострелковая бригада\n",
|
||||||
|
"\n",
|
||||||
|
"114 гвардейский мотострелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"168\n",
|
||||||
|
"\n",
|
||||||
|
"114 отделение территориальное\n",
|
||||||
|
"\n",
|
||||||
|
"169\n",
|
||||||
|
"\n",
|
||||||
|
"40 гвардейский артиллерийский полк\n",
|
||||||
|
"\n",
|
||||||
|
"170\n",
|
||||||
|
"\n",
|
||||||
|
"41 гвардейский артиллерийский полк\n",
|
||||||
|
"\n",
|
||||||
|
"171\n",
|
||||||
|
"\n",
|
||||||
|
"1142 военное представительство Министерства обороны Российской Федерации\n",
|
||||||
|
"\n",
|
||||||
|
"172\n",
|
||||||
|
"\n",
|
||||||
|
"1143 отдельный зенитный ракетный дивизион\n",
|
||||||
|
"\n",
|
||||||
|
"173\n",
|
||||||
|
"\n",
|
||||||
|
"115 военная автомобильная инспекция\n",
|
||||||
|
"\n",
|
||||||
|
"174\n",
|
||||||
|
"\n",
|
||||||
|
"5 государственный специальный химический арсенал\n",
|
||||||
|
"\n",
|
||||||
|
"175\n",
|
||||||
|
"\n",
|
||||||
|
"150 радиоэлектронный центр\n",
|
||||||
|
"\n",
|
||||||
|
"176\n",
|
||||||
|
"\n",
|
||||||
|
"177\n",
|
||||||
|
"178\n",
|
||||||
|
"\n",
|
||||||
|
"1152 мотострелковый полк\n",
|
||||||
|
"1153 мотострелковый полк\n",
|
||||||
|
"54 мотострелковый полк\n",
|
||||||
|
"\n",
|
||||||
|
"179\n",
|
||||||
|
"\n",
|
||||||
|
"1155 центр\n",
|
||||||
|
"\n",
|
||||||
|
"180\n",
|
||||||
|
"\n",
|
||||||
|
"157 пожарная команда\n",
|
||||||
|
"\n",
|
||||||
|
"ТТТ.\n",
|
||||||
|
"\n",
|
||||||
|
"181\n",
|
||||||
|
"\n",
|
||||||
|
"1158 пожарная команда.\n",
|
||||||
|
"\n",
|
||||||
|
"182\n",
|
||||||
|
"\n",
|
||||||
|
"1159 военное представительство Министерства обороны Российской Федерации\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"execution_count": 81
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "code",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": null,
|
||||||
|
"source": "",
|
||||||
|
"id": "7e909e7dc99c7f4a"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
1510
poetry.lock
generated
1510
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -136,7 +136,10 @@ class SimpleIngestComponent(BaseIngestComponentWithIndex):
|
|||||||
return saved_documents
|
return saved_documents
|
||||||
|
|
||||||
def _save_docs(self, documents: list[Document]) -> list[Document]:
|
def _save_docs(self, documents: list[Document]) -> list[Document]:
|
||||||
logger.debug("Transforming count=%s documents into nodes", len(documents))
|
logger.debug(
|
||||||
|
"Transforming count=%s documents into nodes",
|
||||||
|
len(documents) if isinstance(documents, list) else 1,
|
||||||
|
)
|
||||||
with self._index_thread_lock:
|
with self._index_thread_lock:
|
||||||
for document in documents:
|
for document in documents:
|
||||||
self._index.insert(document, show_progress=True)
|
self._index.insert(document, show_progress=True)
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import tqdm
|
||||||
|
import tempfile
|
||||||
from llama_index.core.readers import StringIterableReader
|
from llama_index.core.readers import StringIterableReader
|
||||||
from llama_index.core.readers.base import BaseReader
|
from llama_index.core.readers.base import BaseReader
|
||||||
from llama_index.core.readers.json import JSONReader
|
from llama_index.core.readers.json import JSONReader
|
||||||
@@ -44,8 +46,8 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
|
|||||||
|
|
||||||
default_file_reader_cls: dict[str, type[BaseReader]] = {
|
default_file_reader_cls: dict[str, type[BaseReader]] = {
|
||||||
".hwp": HWPReader,
|
".hwp": HWPReader,
|
||||||
# ".pdf": simple_pdf_extractor if simple_pdf_extractor else PDFReader,
|
# ".pdf": PDFReader,
|
||||||
".pdf": PDFReader,
|
".pdf": SmartPDFLoader if LLMSHERPA_API_URL else PDFReader,
|
||||||
".docx": SmartPDFLoader if LLMSHERPA_API_URL else DocxReader,
|
".docx": SmartPDFLoader if LLMSHERPA_API_URL else DocxReader,
|
||||||
".pptx": PptxReader,
|
".pptx": PptxReader,
|
||||||
".ppt": PptxReader,
|
".ppt": PptxReader,
|
||||||
@@ -70,7 +72,7 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
|
|||||||
FILE_READER_CLS = _try_loading_included_file_formats()
|
FILE_READER_CLS = _try_loading_included_file_formats()
|
||||||
FILE_READER_CLS.update(
|
FILE_READER_CLS.update(
|
||||||
{
|
{
|
||||||
".json": JSONReader(),
|
".json": JSONReader,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -88,7 +90,7 @@ class IngestionHelper:
|
|||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
documents = IngestionHelper._load_file_to_documents(file_name, file_data)
|
documents = IngestionHelper._load_file_to_documents(file_name, file_data)
|
||||||
for document in documents:
|
for document in documents:
|
||||||
document.metadata["file_name"] = file_name
|
document.metadata["file_name"] = file_data.as_posix()
|
||||||
IngestionHelper._exclude_metadata(documents)
|
IngestionHelper._exclude_metadata(documents)
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
@@ -108,16 +110,44 @@ class IngestionHelper:
|
|||||||
return string_reader.load_data([file_data.read_text()])
|
return string_reader.load_data([file_data.read_text()])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error reading file as plain text: {e}")
|
logger.error(f"Error reading file as plain text: {e}")
|
||||||
|
raise ValueError(
|
||||||
|
f"No reader found for extension={extension}, file_name={file_name}"
|
||||||
|
)
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Specific reader found for extension=%s, {reader_cls=}", extension
|
f"Specific reader found for extension=%s, {reader_cls=}", extension
|
||||||
)
|
)
|
||||||
if reader_cls.__name__ == "SmartPDFLoader":
|
if reader_cls.__name__ == "SmartPDFLoader":
|
||||||
return reader_cls(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(
|
documents = reader_cls(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(
|
||||||
file_data.as_posix()
|
file_data.as_posix()
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return reader_cls().load_data(file_data)
|
documents = reader_cls().load_data(file_data)
|
||||||
|
|
||||||
|
if len(documents) == 0 and extension == ".pdf":
|
||||||
|
logger.debug(
|
||||||
|
"No text extracted from PDF, trying to extract images from PDF"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
import pdf2image
|
||||||
|
import pytesseract
|
||||||
|
|
||||||
|
images = pdf2image.convert_from_path(file_data)
|
||||||
|
documents = []
|
||||||
|
|
||||||
|
for i, image in tqdm.tqdm(enumerate(images)):
|
||||||
|
text = pytesseract.image_to_string(image, lang="rus")
|
||||||
|
doc = StringIterableReader().load_data(
|
||||||
|
[text],
|
||||||
|
)[0]
|
||||||
|
doc.metadata["page_label"] = i
|
||||||
|
|
||||||
|
documents.extend(doc)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error extracting images from PDF: {e}")
|
||||||
|
raise ValueError(f"No text extracted from PDF={file_name}")
|
||||||
|
|
||||||
|
return documents
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _exclude_metadata(documents: list[Document]) -> None:
|
def _exclude_metadata(documents: list[Document]) -> None:
|
||||||
@@ -127,4 +157,10 @@ class IngestionHelper:
|
|||||||
# We don't want the Embeddings search to receive this metadata
|
# We don't want the Embeddings search to receive this metadata
|
||||||
document.excluded_embed_metadata_keys = ["doc_id"]
|
document.excluded_embed_metadata_keys = ["doc_id"]
|
||||||
# We don't want the LLM to receive these metadata in the context
|
# We don't want the LLM to receive these metadata in the context
|
||||||
document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]
|
# ToDo currently remove file_name
|
||||||
|
document.excluded_llm_metadata_keys = [
|
||||||
|
# "file_name",
|
||||||
|
"doc_id",
|
||||||
|
"page_label",
|
||||||
|
]
|
||||||
|
document.excluded_llm_metadata_keys = ["doc_id", "page_label"]
|
||||||
|
@@ -70,6 +70,10 @@ einops = {version = "^0.8.0", optional = true}
|
|||||||
llama-index-readers-smart-pdf-loader = "^0.1.4"
|
llama-index-readers-smart-pdf-loader = "^0.1.4"
|
||||||
python-pptx = "^1.0.2"
|
python-pptx = "^1.0.2"
|
||||||
pillow = "^10.4.0"
|
pillow = "^10.4.0"
|
||||||
|
jupyter = "^1.1.1"
|
||||||
|
pypdf2 = "^3.0.1"
|
||||||
|
pytesseract = "^0.3.13"
|
||||||
|
pdf2image = "^1.17.0"
|
||||||
|
|
||||||
[tool.poetry.extras]
|
[tool.poetry.extras]
|
||||||
ui = ["gradio", "ffmpy"]
|
ui = ["gradio", "ffmpy"]
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env pythfon3
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
Reference in New Issue
Block a user