Ingestion Speedup Multiple strategy (#1309)

2025-09-08 10:39:33 +00:00 · 2023-11-25 20:12:09 +01:00
parent 546ba33e6f
commit bafdd3baf1
13 changed files with 515 additions and 195 deletions
--- a/private_gpt/components/ingest/ingest_helper.py
+++ b/private_gpt/components/ingest/ingest_helper.py
@@ -0,0 +1,61 @@
+import logging
+from pathlib import Path
+
+from llama_index import Document
+from llama_index.readers import JSONReader, StringIterableReader
+from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
+
+logger = logging.getLogger(__name__)
+
+# Patching the default file reader to support other file types
+FILE_READER_CLS = DEFAULT_FILE_READER_CLS.copy()
+FILE_READER_CLS.update(
+    {
+        ".json": JSONReader,
+    }
+)
+
+
+class IngestionHelper:
+    """Helper class to transform a file into a list of documents.
+
+    This class should be used to transform a file into a list of documents.
+    These methods are thread-safe (and multiprocessing-safe).
+    """
+
+    @staticmethod
+    def transform_file_into_documents(
+        file_name: str, file_data: Path
+    ) -> list[Document]:
+        documents = IngestionHelper._load_file_to_documents(file_name, file_data)
+        for document in documents:
+            document.metadata["file_name"] = file_name
+        IngestionHelper._exclude_metadata(documents)
+        return documents
+
+    @staticmethod
+    def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
+        logger.debug("Transforming file_name=%s into documents", file_name)
+        extension = Path(file_name).suffix
+        reader_cls = FILE_READER_CLS.get(extension)
+        if reader_cls is None:
+            logger.debug(
+                "No reader found for extension=%s, using default string reader",
+                extension,
+            )
+            # Read as a plain text
+            string_reader = StringIterableReader()
+            return string_reader.load_data([file_data.read_text()])
+
+        logger.debug("Specific reader found for extension=%s", extension)
+        return reader_cls().load_data(file_data)
+
+    @staticmethod
+    def _exclude_metadata(documents: list[Document]) -> None:
+        logger.debug("Excluding metadata from count=%s documents", len(documents))
+        for document in documents:
+            document.metadata["doc_id"] = document.doc_id
+            # We don't want the Embeddings search to receive this metadata
+            document.excluded_embed_metadata_keys = ["doc_id"]
+            # We don't want the LLM to receive these metadata in the context
+            document.excluded_llm_metadata_keys = ["file_name", "doc_id", "page_label"]