mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-09-03 08:15:14 +00:00
[dev] refactor, create CustomImagePagePdfReader
This commit is contained in:
@@ -8,6 +8,8 @@ from llama_index.core.readers.base import BaseReader
|
|||||||
from llama_index.core.readers.json import JSONReader
|
from llama_index.core.readers.json import JSONReader
|
||||||
from llama_index.core.schema import Document
|
from llama_index.core.schema import Document
|
||||||
|
|
||||||
|
from scripts.readers import CustomImagePagePdfReader
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
LLMSHERPA_API_URL = (
|
LLMSHERPA_API_URL = (
|
||||||
@@ -129,24 +131,14 @@ class IngestionHelper:
|
|||||||
"No text extracted from PDF, trying to extract images from PDF"
|
"No text extracted from PDF, trying to extract images from PDF"
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
import pdf2image
|
pdf_reader = CustomImagePagePdfReader(lang="rus")
|
||||||
import pytesseract
|
documents = pdf_reader.load_data(file_data.as_posix())
|
||||||
|
|
||||||
images = pdf2image.convert_from_path(file_data)
|
|
||||||
documents = []
|
|
||||||
|
|
||||||
for i, image in tqdm.tqdm(enumerate(images)):
|
|
||||||
text = pytesseract.image_to_string(image, lang="rus")
|
|
||||||
doc = StringIterableReader().load_data(
|
|
||||||
[text],
|
|
||||||
)
|
|
||||||
# )[0]
|
|
||||||
# doc.metadata["page_label"] = str(i + 1)
|
|
||||||
|
|
||||||
documents.extend(doc)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error extracting images from PDF: {e}")
|
logger.error(f"Error extracting images from PDF: {e}")
|
||||||
raise ValueError(f"No text extracted from PDF={file_name}")
|
raise ValueError(f"No text extracted from PDF: {file_name}")
|
||||||
|
|
||||||
|
if len(documents) == 0:
|
||||||
|
logger.warning(f"No documents extracted from file: {file_name}")
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
@@ -158,7 +150,6 @@ class IngestionHelper:
|
|||||||
# We don't want the Embeddings search to receive this metadata
|
# We don't want the Embeddings search to receive this metadata
|
||||||
document.excluded_embed_metadata_keys = ["doc_id"]
|
document.excluded_embed_metadata_keys = ["doc_id"]
|
||||||
# We don't want the LLM to receive these metadata in the context
|
# We don't want the LLM to receive these metadata in the context
|
||||||
# ToDo currently remove file_name
|
|
||||||
document.excluded_llm_metadata_keys = [
|
document.excluded_llm_metadata_keys = [
|
||||||
# "file_name",
|
# "file_name",
|
||||||
"doc_id",
|
"doc_id",
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
"""This file should be imported if and only if you want to run the UI locally."""
|
"""This file should be imported if and only if you want to run the UI locally."""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
@@ -69,7 +70,8 @@ class Source(BaseModel):
|
|||||||
file_name = doc_metadata.get("file_name", "-") if doc_metadata else "-"
|
file_name = doc_metadata.get("file_name", "-") if doc_metadata else "-"
|
||||||
page_label = doc_metadata.get("page_label", "-") if doc_metadata else "-"
|
page_label = doc_metadata.get("page_label", "-") if doc_metadata else "-"
|
||||||
|
|
||||||
source = Source(file=file_name, page=page_label, text=chunk.text)
|
logger.debug("Source: %s %s", file_name, page_label)
|
||||||
|
source = Source(file=file_name, page=str(page_label), text=chunk.text)
|
||||||
curated_sources.append(source)
|
curated_sources.append(source)
|
||||||
curated_sources = list(
|
curated_sources = list(
|
||||||
dict.fromkeys(curated_sources).keys()
|
dict.fromkeys(curated_sources).keys()
|
||||||
|
43
scripts/readers.py
Normal file
43
scripts/readers.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import logging
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
from llama_index.core.readers.base import BaseReader
|
||||||
|
from llama_index.core.schema import Document
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CustomImagePagePdfReader(BaseReader):
    """Read an image-only (scanned) PDF by running OCR on every page.

    Each page is rendered to an image with ``pdf2image`` and passed to
    ``pytesseract``; one ``Document`` is produced per page.
    """

    def __init__(self, *args: Any, lang: str = "rus", **kwargs: Any) -> None:
        """Create the reader.

        Args:
            *args: Positional arguments forwarded to ``BaseReader``.
            lang: Language code handed to ``pytesseract.image_to_string``.
            **kwargs: Keyword arguments forwarded to ``BaseReader``.
        """
        super().__init__(*args, **kwargs)

        self.lang = lang

    def load_data(
        self, pdf_path: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """OCR every page of the PDF at ``pdf_path``.

        Args:
            pdf_path: Path of the PDF file to convert and recognise.
            extra_info: Optional extra metadata merged into every produced
                document. Keys supplied here override the reader's own
                ``chunk_type`` / ``page_label`` entries.

        Returns:
            One ``Document`` per page, in page order. Each carries
            ``chunk_type="image"`` and a 1-based integer ``page_label``.

        Raises:
            ImportError: If ``pdf2image`` or ``pytesseract`` is not installed.
        """
        # Both OCR dependencies are optional, so import them lazily and keep
        # the original failure attached as the cause.
        try:
            import pdf2image
        except ImportError as err:
            raise ImportError(
                "You need to install `pdf2image` to use this reader"
            ) from err

        try:
            import pytesseract
        except ImportError as err:
            raise ImportError(
                "You need to install `pytesseract` to use this reader"
            ) from err

        images = pdf2image.convert_from_path(pdf_path)
        documents = []

        # Wrap the list itself (not the enumerate object) so tqdm knows the
        # page count and can display a total / ETA.
        for page_number, image in enumerate(tqdm.tqdm(images), start=1):
            text = pytesseract.image_to_string(image, lang=self.lang)
            metadata = {"chunk_type": "image", "page_label": page_number}
            if extra_info:
                # Previously the caller's extra_info was silently ignored.
                metadata.update(extra_info)
            documents.append(Document(text=text, extra_info=metadata))

        return documents
|
Reference in New Issue
Block a user