community[patch]: Fix source path mismatch in PebbloSafeLoader (#23857)

**Description:** Fix for a source path mismatch in PebbloSafeLoader. The
fix involves storing the full path in the doc metadata in VectorDB.
**Issue:** NA, caught in internal testing
**Dependencies:** NA
**Add tests**:  Updated tests
This commit is contained in:
Rajendra Kadam 2024-07-06 00:54:17 +05:30 committed by GitHub
parent 5b7d5f7729
commit ee8aa54f53
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 17 additions and 2 deletions

View File

@ -89,6 +89,8 @@ class PebbloSafeLoader(BaseLoader):
list: Documents fetched from load method of the wrapped `loader`. list: Documents fetched from load method of the wrapped `loader`.
""" """
self.docs = self.loader.load() self.docs = self.loader.load()
# Add pebblo-specific metadata to docs
self._add_pebblo_specific_metadata()
if not self.load_semantic: if not self.load_semantic:
self._classify_doc(self.docs, loading_end=True) self._classify_doc(self.docs, loading_end=True)
return self.docs return self.docs
@ -123,6 +125,8 @@ class PebbloSafeLoader(BaseLoader):
self.docs = [] self.docs = []
break break
self.docs = list((doc,)) self.docs = list((doc,))
# Add pebblo-specific metadata to docs
self._add_pebblo_specific_metadata()
if not self.load_semantic: if not self.load_semantic:
self._classify_doc(self.docs, loading_end=True) self._classify_doc(self.docs, loading_end=True)
yield self.docs[0] yield self.docs[0]
@ -517,3 +521,13 @@ class PebbloSafeLoader(BaseLoader):
classified_doc.get("topics", {}).keys() classified_doc.get("topics", {}).keys()
) )
return doc return doc
def _add_pebblo_specific_metadata(self) -> None:
    """Populate the ``full_path`` metadata entry on every loaded document.

    For each document in ``self.docs`` the path is chosen in this order of
    preference: an existing ``full_path`` metadata entry, then the
    ``source`` metadata entry, then the loader-level ``self.source_path``.
    The chosen value is passed through ``get_full_path`` and the result is
    written back to ``metadata["full_path"]`` in place.
    """
    for document in self.docs:
        metadata = document.metadata
        # Fall back from the per-document source to the loader's source path.
        fallback_path = metadata.get("source", self.source_path)
        # A "full_path" entry that is already present takes precedence.
        chosen_path = metadata.get("full_path", fallback_path)
        metadata["full_path"] = get_full_path(chosen_path)

View File

@ -62,14 +62,15 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
post=MockResponse(json_data={"data": ""}, status_code=200), post=MockResponse(json_data={"data": ""}, status_code=200),
) )
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv") file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv")
full_file_path = os.path.abspath(file_path)
expected_docs = [ expected_docs = [
Document( Document(
page_content="column1: value1\ncolumn2: value2\ncolumn3: value3", page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
metadata={"source": file_path, "row": 0}, metadata={"source": file_path, "row": 0, "full_path": full_file_path},
), ),
Document( Document(
page_content="column1: value4\ncolumn2: value5\ncolumn3: value6", page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
metadata={"source": file_path, "row": 1}, metadata={"source": file_path, "row": 1, "full_path": full_file_path},
), ),
] ]