[dev] refactor, create CustomImagePagePdfReader

This commit is contained in:
zoazhyga
2024-09-06 10:24:20 +02:00
parent be204cf8bd
commit 800127f3ba
3 changed files with 54 additions and 18 deletions

View File

@@ -8,6 +8,8 @@ from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.json import JSONReader from llama_index.core.readers.json import JSONReader
from llama_index.core.schema import Document from llama_index.core.schema import Document
from scripts.readers import CustomImagePagePdfReader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
LLMSHERPA_API_URL = ( LLMSHERPA_API_URL = (
@@ -129,24 +131,14 @@ class IngestionHelper:
"No text extracted from PDF, trying to extract images from PDF" "No text extracted from PDF, trying to extract images from PDF"
) )
try: try:
import pdf2image pdf_reader = CustomImagePagePdfReader(lang="rus")
import pytesseract documents = pdf_reader.load_data(file_data.as_posix())
images = pdf2image.convert_from_path(file_data)
documents = []
for i, image in tqdm.tqdm(enumerate(images)):
text = pytesseract.image_to_string(image, lang="rus")
doc = StringIterableReader().load_data(
[text],
)
# )[0]
# doc.metadata["page_label"] = str(i + 1)
documents.extend(doc)
except Exception as e: except Exception as e:
logger.error(f"Error extracting images from PDF: {e}") logger.error(f"Error extracting images from PDF: {e}")
raise ValueError(f"No text extracted from PDF={file_name}") raise ValueError(f"No text extracted from PDF: {file_name}")
if len(documents) == 0:
logger.warning(f"No documents extracted from file: {file_name}")
return documents return documents
@@ -158,7 +150,6 @@ class IngestionHelper:
# We don't want the Embeddings search to receive this metadata # We don't want the Embeddings search to receive this metadata
document.excluded_embed_metadata_keys = ["doc_id"] document.excluded_embed_metadata_keys = ["doc_id"]
# We don't want the LLM to receive these metadata in the context # We don't want the LLM to receive these metadata in the context
# ToDo currently remove file_name
document.excluded_llm_metadata_keys = [ document.excluded_llm_metadata_keys = [
# "file_name", # "file_name",
"doc_id", "doc_id",

View File

@@ -1,4 +1,5 @@
"""This file should be imported if and only if you want to run the UI locally.""" """This file should be imported if and only if you want to run the UI locally."""
import base64 import base64
import logging import logging
import time import time
@@ -69,7 +70,8 @@ class Source(BaseModel):
file_name = doc_metadata.get("file_name", "-") if doc_metadata else "-" file_name = doc_metadata.get("file_name", "-") if doc_metadata else "-"
page_label = doc_metadata.get("page_label", "-") if doc_metadata else "-" page_label = doc_metadata.get("page_label", "-") if doc_metadata else "-"
source = Source(file=file_name, page=page_label, text=chunk.text) logger.debug("Source: %s %s", file_name, page_label)
source = Source(file=file_name, page=str(page_label), text=chunk.text)
curated_sources.append(source) curated_sources.append(source)
curated_sources = list( curated_sources = list(
dict.fromkeys(curated_sources).keys() dict.fromkeys(curated_sources).keys()

43
scripts/readers.py Normal file
View File

@@ -0,0 +1,43 @@
import logging
from typing import Any, Dict, List, Optional
import tqdm
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
logger = logging.getLogger(__name__)
class CustomImagePagePdfReader(BaseReader):
    """Reader that OCRs every page of a PDF and emits one Document per page.

    Each page image obtained from ``pdf2image.convert_from_path`` is passed to
    ``pytesseract.image_to_string``; the recognised text becomes the document
    text and the 1-based page number is stored under ``page_label``.
    """

    def __init__(self, *args: Any, lang: str = "rus", **kwargs: Any) -> None:
        """Create the reader.

        Args:
            *args: Positional arguments forwarded to ``BaseReader``.
            lang: Language value passed to ``pytesseract.image_to_string``
                as its ``lang`` argument.
            **kwargs: Keyword arguments forwarded to ``BaseReader``.
        """
        super().__init__(*args, **kwargs)
        self.lang = lang

    def load_data(
        self, pdf_path: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """OCR the PDF at ``pdf_path`` and return one Document per page.

        Args:
            pdf_path: Path of the PDF file to process.
            extra_info: Optional metadata merged into every produced
                document. Keys given here take precedence over the
                reader's own ``chunk_type`` / ``page_label`` entries.

        Returns:
            The documents in page order; empty when the PDF yields no images.

        Raises:
            ImportError: If ``pdf2image`` or ``pytesseract`` is not installed.
        """
        # Optional heavy dependencies are imported lazily so that merely
        # importing this module does not require them.
        try:
            import pdf2image
        except ImportError as err:
            raise ImportError(
                "You need to install `pdf2image` to use this reader"
            ) from err
        try:
            import pytesseract
        except ImportError as err:
            raise ImportError(
                "You need to install `pytesseract` to use this reader"
            ) from err

        images = pdf2image.convert_from_path(pdf_path)

        documents = []
        # Wrap the list itself (not the enumerate generator) so tqdm can read
        # its length and display a total.
        for page_number, image in enumerate(tqdm.tqdm(images), start=1):
            text = pytesseract.image_to_string(image, lang=self.lang)
            metadata = {"chunk_type": "image", "page_label": page_number}
            if extra_info:
                # Previously the caller-supplied metadata was silently dropped.
                metadata.update(extra_info)
            documents.append(Document(text=text, extra_info=metadata))
        return documents