mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 17:08:47 +00:00
[Community][minor]: Updating source path, and file path for SharePoint loader in PebbloSafeLoader (#25592)
- **Description:** Updating source path and file path in Pebblo safe loader for SharePoint apps during loading - **Issue:** NA - **Dependencies:** NA - **Tests:** NA - **Docs:** NA --------- Co-authored-by: dristy.cd <dristy@clouddefense.io>
This commit is contained in:
parent
745d1c2b8d
commit
fbb4761199
@ -252,11 +252,16 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
"""Add Pebblo specific metadata to documents."""
|
"""Add Pebblo specific metadata to documents."""
|
||||||
for doc in self.docs_with_id:
|
for doc in self.docs_with_id:
|
||||||
doc_metadata = doc.metadata
|
doc_metadata = doc.metadata
|
||||||
doc_metadata["full_path"] = get_full_path(
|
if self.loader.__class__.__name__ == "SharePointLoader":
|
||||||
doc_metadata.get(
|
doc_metadata["full_path"] = get_full_path(
|
||||||
"full_path", doc_metadata.get("source", self.source_path)
|
doc_metadata.get("source", self.source_path)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
doc_metadata["full_path"] = get_full_path(
|
||||||
|
doc_metadata.get(
|
||||||
|
"full_path", doc_metadata.get("source", self.source_path)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
|
doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
|
||||||
"pb_checksum", None
|
"pb_checksum", None
|
||||||
)
|
)
|
||||||
|
@ -488,7 +488,7 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
|||||||
source_owner = get_file_owner_from_path(source_path)
|
source_owner = get_file_owner_from_path(source_path)
|
||||||
# Prepare docs for classification
|
# Prepare docs for classification
|
||||||
docs, source_aggregate_size = self.prepare_docs_for_classification(
|
docs, source_aggregate_size = self.prepare_docs_for_classification(
|
||||||
docs_with_id, source_path
|
docs_with_id, source_path, loader_details
|
||||||
)
|
)
|
||||||
# Build payload for classification
|
# Build payload for classification
|
||||||
payload = self.build_classification_payload(
|
payload = self.build_classification_payload(
|
||||||
@ -659,7 +659,9 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def prepare_docs_for_classification(
|
def prepare_docs_for_classification(
|
||||||
docs_with_id: List[IndexedDocument], source_path: str
|
docs_with_id: List[IndexedDocument],
|
||||||
|
source_path: str,
|
||||||
|
loader_details: dict,
|
||||||
) -> Tuple[List[dict], int]:
|
) -> Tuple[List[dict], int]:
|
||||||
"""
|
"""
|
||||||
Prepare documents for classification.
|
Prepare documents for classification.
|
||||||
@ -667,22 +669,30 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
|||||||
Args:
|
Args:
|
||||||
docs_with_id (List[IndexedDocument]): List of documents to be classified.
|
docs_with_id (List[IndexedDocument]): List of documents to be classified.
|
||||||
source_path (str): Source path of the documents.
|
source_path (str): Source path of the documents.
|
||||||
|
loader_details (dict): Contains loader info.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[List[dict], int]: Documents and the aggregate size of the source.
|
Tuple[List[dict], int]: Documents and the aggregate size
|
||||||
|
of the source.
|
||||||
"""
|
"""
|
||||||
docs = []
|
docs = []
|
||||||
source_aggregate_size = 0
|
source_aggregate_size = 0
|
||||||
doc_content = [doc.dict() for doc in docs_with_id]
|
doc_content = [doc.dict() for doc in docs_with_id]
|
||||||
|
source_path_update = False
|
||||||
for doc in doc_content:
|
for doc in doc_content:
|
||||||
doc_metadata = doc.get("metadata", {})
|
doc_metadata = doc.get("metadata", {})
|
||||||
doc_authorized_identities = doc_metadata.get("authorized_identities", [])
|
doc_authorized_identities = doc_metadata.get("authorized_identities", [])
|
||||||
doc_source_path = get_full_path(
|
if loader_details["loader"] == "SharePointLoader":
|
||||||
doc_metadata.get(
|
doc_source_path = get_full_path(
|
||||||
"full_path",
|
doc_metadata.get("source", loader_details["source_path"])
|
||||||
doc_metadata.get("source", source_path),
|
)
|
||||||
|
else:
|
||||||
|
doc_source_path = get_full_path(
|
||||||
|
doc_metadata.get(
|
||||||
|
"full_path",
|
||||||
|
doc_metadata.get("source", source_path),
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
doc_source_owner = doc_metadata.get(
|
doc_source_owner = doc_metadata.get(
|
||||||
"owner", get_file_owner_from_path(doc_source_path)
|
"owner", get_file_owner_from_path(doc_source_path)
|
||||||
)
|
)
|
||||||
@ -710,6 +720,12 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
|||||||
),
|
),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
if (
|
||||||
|
loader_details["loader"] == "SharePointLoader"
|
||||||
|
and not source_path_update
|
||||||
|
):
|
||||||
|
loader_details["source_path"] = doc_metadata.get("source_full_url")
|
||||||
|
source_path_update = True
|
||||||
return docs, source_aggregate_size
|
return docs, source_aggregate_size
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
Loading…
Reference in New Issue
Block a user