mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-02 09:40:26 +00:00
community[patch]: Ingest source, owner and full_path if present in Document's metadata. (#20949)
Description: The PebbloSafeLoader should first check the Document's metadata for owner, full_path and size, and fall back to its own logic only when they are absent. Dependencies: None. Documentation: NA. Signed-off-by: Rahul Tripathi <rauhl.psit.ec@gmail.com> Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
This commit is contained in:
parent
790ea75cf7
commit
955cf186d2
@ -157,16 +157,19 @@ class PebbloSafeLoader(BaseLoader):
|
||||
doc_content = [doc.dict() for doc in loaded_docs]
|
||||
docs = []
|
||||
for doc in doc_content:
|
||||
doc_authorized_identities = doc.get("metadata", {}).get(
|
||||
"authorized_identities", []
|
||||
)
|
||||
doc_metadata = doc.get("metadata", {})
|
||||
doc_authorized_identities = doc_metadata.get("authorized_identities", [])
|
||||
doc_source_path = get_full_path(
|
||||
doc.get("metadata", {}).get("source", self.source_path)
|
||||
doc_metadata.get(
|
||||
"full_path", doc_metadata.get("source", self.source_path)
|
||||
)
|
||||
)
|
||||
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
|
||||
doc_source_path
|
||||
doc_source_owner = doc_metadata.get(
|
||||
"owner", PebbloSafeLoader.get_file_owner_from_path(doc_source_path)
|
||||
)
|
||||
doc_source_size = doc_metadata.get(
|
||||
"size", self.get_source_size(doc_source_path)
|
||||
)
|
||||
doc_source_size = self.get_source_size(doc_source_path)
|
||||
page_content = str(doc.get("page_content"))
|
||||
page_content_size = self.calculate_content_size(page_content)
|
||||
self.source_aggregate_size += page_content_size
|
||||
|
@ -169,7 +169,9 @@ def get_full_path(path: str) -> str:
|
||||
or (path in ["unknown", "-", "in-memory"])
|
||||
):
|
||||
return path
|
||||
full_path = pathlib.Path(path).resolve()
|
||||
full_path = pathlib.Path(path)
|
||||
if full_path.exists():
|
||||
full_path = full_path.resolve()
|
||||
return str(full_path)
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user