From ee8aa54f53d1c5dedb52547d32b85e1a26670735 Mon Sep 17 00:00:00 2001
From: Rajendra Kadam
Date: Sat, 6 Jul 2024 00:54:17 +0530
Subject: [PATCH] community[patch]: Fix source path mismatch in PebbloSafeLoader (#23857)

**Description:** Fix for source path mismatch in PebbloSafeLoader. The fix involves storing the full path in the doc metadata in VectorDB
**Issue:** NA, caught in internal testing
**Dependencies:** NA
**Add tests**: Updated tests
---
 .../langchain_community/document_loaders/pebblo.py | 14 ++++++++++++++
 .../unit_tests/document_loaders/test_pebblo.py     |  5 +++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index f9bde4cc44e..a695582e2fa 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -89,6 +89,8 @@ class PebbloSafeLoader(BaseLoader):
             list: Documents fetched from load method of the wrapped `loader`.
         """
         self.docs = self.loader.load()
+        # Add pebblo-specific metadata to docs
+        self._add_pebblo_specific_metadata()
         if not self.load_semantic:
             self._classify_doc(self.docs, loading_end=True)
             return self.docs
@@ -123,6 +125,8 @@ class PebbloSafeLoader(BaseLoader):
                 self.docs = []
                 break
             self.docs = list((doc,))
+            # Add pebblo-specific metadata to docs
+            self._add_pebblo_specific_metadata()
             if not self.load_semantic:
                 self._classify_doc(self.docs, loading_end=True)
                 yield self.docs[0]
@@ -517,3 +521,13 @@ class PebbloSafeLoader(BaseLoader):
             classified_doc.get("topics", {}).keys()
         )
         return doc
+
+    def _add_pebblo_specific_metadata(self) -> None:
+        """Add Pebblo specific metadata to documents."""
+        for doc in self.docs:
+            doc_metadata = doc.metadata
+            doc_metadata["full_path"] = get_full_path(
+                doc_metadata.get(
+                    "full_path", doc_metadata.get("source", self.source_path)
+                )
+            )
diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
index a98f4712287..1cee8a849d1 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
@@ -62,14 +62,15 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
         post=MockResponse(json_data={"data": ""}, status_code=200),
     )
     file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv")
+    full_file_path = os.path.abspath(file_path)
     expected_docs = [
         Document(
             page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
-            metadata={"source": file_path, "row": 0},
+            metadata={"source": file_path, "row": 0, "full_path": full_file_path},
         ),
         Document(
             page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
-            metadata={"source": file_path, "row": 1},
+            metadata={"source": file_path, "row": 1, "full_path": full_file_path},
         ),
     ]
 