mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-08 12:31:49 +00:00
community[patch]: Ingest source, owner and full_path if present in Document's metadata. (#20949)
Description: The PebbloSafeLoader should first check the Document's metadata for owner, full_path and size, and fall back to its own logic only when they are not present. Dependencies: None. Documentation: NA. Signed-off-by: Rahul Tripathi <rauhl.psit.ec@gmail.com> Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
This commit is contained in:
parent
790ea75cf7
commit
955cf186d2
@ -157,16 +157,19 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
doc_content = [doc.dict() for doc in loaded_docs]
|
doc_content = [doc.dict() for doc in loaded_docs]
|
||||||
docs = []
|
docs = []
|
||||||
for doc in doc_content:
|
for doc in doc_content:
|
||||||
doc_authorized_identities = doc.get("metadata", {}).get(
|
doc_metadata = doc.get("metadata", {})
|
||||||
"authorized_identities", []
|
doc_authorized_identities = doc_metadata.get("authorized_identities", [])
|
||||||
)
|
|
||||||
doc_source_path = get_full_path(
|
doc_source_path = get_full_path(
|
||||||
doc.get("metadata", {}).get("source", self.source_path)
|
doc_metadata.get(
|
||||||
|
"full_path", doc_metadata.get("source", self.source_path)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
|
doc_source_owner = doc_metadata.get(
|
||||||
doc_source_path
|
"owner", PebbloSafeLoader.get_file_owner_from_path(doc_source_path)
|
||||||
|
)
|
||||||
|
doc_source_size = doc_metadata.get(
|
||||||
|
"size", self.get_source_size(doc_source_path)
|
||||||
)
|
)
|
||||||
doc_source_size = self.get_source_size(doc_source_path)
|
|
||||||
page_content = str(doc.get("page_content"))
|
page_content = str(doc.get("page_content"))
|
||||||
page_content_size = self.calculate_content_size(page_content)
|
page_content_size = self.calculate_content_size(page_content)
|
||||||
self.source_aggregate_size += page_content_size
|
self.source_aggregate_size += page_content_size
|
||||||
|
@ -169,7 +169,9 @@ def get_full_path(path: str) -> str:
|
|||||||
or (path in ["unknown", "-", "in-memory"])
|
or (path in ["unknown", "-", "in-memory"])
|
||||||
):
|
):
|
||||||
return path
|
return path
|
||||||
full_path = pathlib.Path(path).resolve()
|
full_path = pathlib.Path(path)
|
||||||
|
if full_path.exists():
|
||||||
|
full_path = full_path.resolve()
|
||||||
return str(full_path)
|
return str(full_path)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user