[fix] fix calling readers during runtime

This commit is contained in:
zoazhyga
2024-08-12 17:07:53 +02:00
parent b4149c8d87
commit 00b64b9a4a
2 changed files with 37 additions and 35 deletions

View File

@@ -8,20 +8,16 @@ from llama_index.core.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
LLMSHERPA_API_URL = (
"http://localhost:5010/api/parseDocument?renderFormat=all&useNewIndentParser=yes"
)
# Inspired by the `llama_index.core.readers.file.base` module # Inspired by the `llama_index.core.readers.file.base` module
def _try_loading_included_file_formats( def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
llmsherpa_api_url: str = None, if LLMSHERPA_API_URL is not None:
) -> dict[str, type[BaseReader]]:
simple_pdf_extractor = None
if llmsherpa_api_url is not None:
try: try:
from llama_index.readers.smart_pdf_loader import SmartPDFLoader from llama_index.readers.smart_pdf_loader import SmartPDFLoader
# llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all&useNewIndentParser=yes"
simple_pdf_extractor = SmartPDFLoader(
llmsherpa_api_url=llmsherpa_api_url,
)
except ImportError as e: except ImportError as e:
raise ImportError( raise ImportError(
"`llama-index-readers-smart-pdf-loader` package not found" "`llama-index-readers-smart-pdf-loader` package not found"
@@ -47,33 +43,31 @@ def _try_loading_included_file_formats(
raise ImportError("`llama-index-readers-file` package not found") from e raise ImportError("`llama-index-readers-file` package not found") from e
default_file_reader_cls: dict[str, type[BaseReader]] = { default_file_reader_cls: dict[str, type[BaseReader]] = {
".hwp": HWPReader(), ".hwp": HWPReader,
# ".pdf": simple_pdf_extractor if simple_pdf_extractor else PDFReader, # ".pdf": simple_pdf_extractor if simple_pdf_extractor else PDFReader,
".pdf": PDFReader(), ".pdf": PDFReader,
".docx": simple_pdf_extractor if simple_pdf_extractor else DocxReader(), ".docx": SmartPDFLoader if LLMSHERPA_API_URL else DocxReader,
".pptx": PptxReader(), ".pptx": PptxReader,
".ppt": PptxReader(), ".ppt": PptxReader,
".pptm": PptxReader(), ".pptm": PptxReader,
".jpg": ImageReader(), ".jpg": ImageReader,
".png": ImageReader(), ".png": ImageReader,
".jpeg": ImageReader(), ".jpeg": ImageReader,
# ".mp3": VideoAudioReader(), # ".mp3": VideoAudioReader,
# ".mp4": VideoAudioReader(), # ".mp4": VideoAudioReader,
".csv": simple_pdf_extractor if simple_pdf_extractor else PandasCSVReader(), ".csv": SmartPDFLoader if LLMSHERPA_API_URL else PandasCSVReader,
".xls": simple_pdf_extractor if simple_pdf_extractor else None, ".xls": SmartPDFLoader if LLMSHERPA_API_URL else None,
".xlsx": simple_pdf_extractor if simple_pdf_extractor else None, ".xlsx": SmartPDFLoader if LLMSHERPA_API_URL else None,
".epub": EpubReader(), ".epub": EpubReader,
".md": MarkdownReader(), ".md": MarkdownReader,
".mbox": MboxReader(), ".mbox": MboxReader,
".ipynb": IPYNBReader(), ".ipynb": IPYNBReader,
} }
return default_file_reader_cls return default_file_reader_cls
# Patching the default file reader to support other file types # Patching the default file reader to support other file types
FILE_READER_CLS = _try_loading_included_file_formats( FILE_READER_CLS = _try_loading_included_file_formats()
"http://localhost:5010/api/parseDocument?renderFormat=all&useNewIndentParser=yes"
)
FILE_READER_CLS.update( FILE_READER_CLS.update(
{ {
".json": JSONReader(), ".json": JSONReader(),
@@ -109,13 +103,21 @@ class IngestionHelper:
extension, extension,
) )
# Read as a plain text # Read as a plain text
string_reader = StringIterableReader() try:
return string_reader.load_data([file_data.read_text()]) string_reader = StringIterableReader()
return string_reader.load_data([file_data.read_text()])
except Exception as e:
logger.error(f"Error reading file as plain text: {e}")
logger.debug( logger.debug(
f"Specific reader found for extension=%s, {reader_cls=}", extension f"Specific reader found for extension=%s, {reader_cls=}", extension
) )
return reader_cls.load_data(file_data.as_posix()) if reader_cls.__name__ == "SmartPDFLoader":
return reader_cls(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(
file_data.as_posix()
)
else:
return reader_cls().load_data(file_data)
@staticmethod @staticmethod
def _exclude_metadata(documents: list[Document]) -> None: def _exclude_metadata(documents: list[Document]) -> None:

View File

@@ -2,7 +2,7 @@ server:
env_name: ${APP_ENV:huglama} env_name: ${APP_ENV:huglama}
data: data:
local_data_folder: local_data/private_gpt # local_data_folder: local_data/private_gpt
local_ingestion: local_ingestion:
enabled: ${LOCAL_INGESTION_ENABLED:true} enabled: ${LOCAL_INGESTION_ENABLED:true}
allow_ingest_from: ["local_data/input_raw"] allow_ingest_from: ["local_data/input_raw"]