mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 00:23:25 +00:00
community[patch]: Fix source path mismatch in PebbloSafeLoader (#23857)
**Description:** Fix for a source path mismatch in PebbloSafeLoader. The fix stores the full path in the document metadata that is saved to the VectorDB.
**Issue:** NA (caught in internal testing).
**Dependencies:** NA.
**Add tests:** Updated tests.
This commit is contained in:
parent
5b7d5f7729
commit
ee8aa54f53
@ -89,6 +89,8 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
list: Documents fetched from load method of the wrapped `loader`.
|
list: Documents fetched from load method of the wrapped `loader`.
|
||||||
"""
|
"""
|
||||||
self.docs = self.loader.load()
|
self.docs = self.loader.load()
|
||||||
|
# Add pebblo-specific metadata to docs
|
||||||
|
self._add_pebblo_specific_metadata()
|
||||||
if not self.load_semantic:
|
if not self.load_semantic:
|
||||||
self._classify_doc(self.docs, loading_end=True)
|
self._classify_doc(self.docs, loading_end=True)
|
||||||
return self.docs
|
return self.docs
|
||||||
@ -123,6 +125,8 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
self.docs = []
|
self.docs = []
|
||||||
break
|
break
|
||||||
self.docs = list((doc,))
|
self.docs = list((doc,))
|
||||||
|
# Add pebblo-specific metadata to docs
|
||||||
|
self._add_pebblo_specific_metadata()
|
||||||
if not self.load_semantic:
|
if not self.load_semantic:
|
||||||
self._classify_doc(self.docs, loading_end=True)
|
self._classify_doc(self.docs, loading_end=True)
|
||||||
yield self.docs[0]
|
yield self.docs[0]
|
||||||
@ -517,3 +521,13 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
classified_doc.get("topics", {}).keys()
|
classified_doc.get("topics", {}).keys()
|
||||||
)
|
)
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
def _add_pebblo_specific_metadata(self) -> None:
    """Add Pebblo specific metadata to documents.

    Every document in ``self.docs`` gets a ``full_path`` metadata entry.
    The value handed to ``get_full_path`` is chosen in this order:

    1. an existing ``full_path`` entry in the document metadata,
    2. otherwise the document's ``source`` entry,
    3. otherwise the loader-level ``self.source_path``.

    The metadata dictionaries are updated in place; nothing is returned.
    """
    for document in self.docs:
        meta = document.metadata
        # Fallback used only when the document has no "full_path" yet.
        fallback_path = meta.get("source", self.source_path)
        candidate_path = meta.get("full_path", fallback_path)
        # NOTE(review): get_full_path is defined outside this block; it
        # presumably resolves the candidate to an absolute path - confirm.
        meta["full_path"] = get_full_path(candidate_path)
|
||||||
|
@ -62,14 +62,15 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
|
|||||||
post=MockResponse(json_data={"data": ""}, status_code=200),
|
post=MockResponse(json_data={"data": ""}, status_code=200),
|
||||||
)
|
)
|
||||||
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv")
|
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_nominal.csv")
|
||||||
|
full_file_path = os.path.abspath(file_path)
|
||||||
expected_docs = [
|
expected_docs = [
|
||||||
Document(
|
Document(
|
||||||
page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
|
page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
|
||||||
metadata={"source": file_path, "row": 0},
|
metadata={"source": file_path, "row": 0, "full_path": full_file_path},
|
||||||
),
|
),
|
||||||
Document(
|
Document(
|
||||||
page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
|
page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
|
||||||
metadata={"source": file_path, "row": 1},
|
metadata={"source": file_path, "row": 1, "full_path": full_file_path},
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user