[Community][minor]: Updating source path and file path for SharePoint loader in PebbloSafeLoader (#25592)

- **Description:** Update the source path and file path in PebbloSafeLoader
for SharePoint apps during loading
- **Issue:** NA
- **Dependencies:** NA
- **Tests:** NA
- **Docs:** NA

---------

Co-authored-by: dristy.cd <dristy@clouddefense.io>
This commit is contained in:
Dristy Srivastava 2024-08-26 18:08:40 +05:30 committed by GitHub
parent 745d1c2b8d
commit fbb4761199
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 33 additions and 12 deletions

View File

@ -252,11 +252,16 @@ class PebbloSafeLoader(BaseLoader):
"""Add Pebblo specific metadata to documents.""" """Add Pebblo specific metadata to documents."""
for doc in self.docs_with_id: for doc in self.docs_with_id:
doc_metadata = doc.metadata doc_metadata = doc.metadata
doc_metadata["full_path"] = get_full_path( if self.loader.__class__.__name__ == "SharePointLoader":
doc_metadata.get( doc_metadata["full_path"] = get_full_path(
"full_path", doc_metadata.get("source", self.source_path) doc_metadata.get("source", self.source_path)
)
else:
doc_metadata["full_path"] = get_full_path(
doc_metadata.get(
"full_path", doc_metadata.get("source", self.source_path)
)
) )
)
doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get( doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
"pb_checksum", None "pb_checksum", None
) )

View File

@ -488,7 +488,7 @@ class PebbloLoaderAPIWrapper(BaseModel):
source_owner = get_file_owner_from_path(source_path) source_owner = get_file_owner_from_path(source_path)
# Prepare docs for classification # Prepare docs for classification
docs, source_aggregate_size = self.prepare_docs_for_classification( docs, source_aggregate_size = self.prepare_docs_for_classification(
docs_with_id, source_path docs_with_id, source_path, loader_details
) )
# Build payload for classification # Build payload for classification
payload = self.build_classification_payload( payload = self.build_classification_payload(
@ -659,7 +659,9 @@ class PebbloLoaderAPIWrapper(BaseModel):
@staticmethod @staticmethod
def prepare_docs_for_classification( def prepare_docs_for_classification(
docs_with_id: List[IndexedDocument], source_path: str docs_with_id: List[IndexedDocument],
source_path: str,
loader_details: dict,
) -> Tuple[List[dict], int]: ) -> Tuple[List[dict], int]:
""" """
Prepare documents for classification. Prepare documents for classification.
@ -667,22 +669,30 @@ class PebbloLoaderAPIWrapper(BaseModel):
Args: Args:
docs_with_id (List[IndexedDocument]): List of documents to be classified. docs_with_id (List[IndexedDocument]): List of documents to be classified.
source_path (str): Source path of the documents. source_path (str): Source path of the documents.
loader_details (dict): Contains loader info.
Returns: Returns:
Tuple[List[dict], int]: Documents and the aggregate size of the source. Tuple[List[dict], int]: Documents and the aggregate size
of the source.
""" """
docs = [] docs = []
source_aggregate_size = 0 source_aggregate_size = 0
doc_content = [doc.dict() for doc in docs_with_id] doc_content = [doc.dict() for doc in docs_with_id]
source_path_update = False
for doc in doc_content: for doc in doc_content:
doc_metadata = doc.get("metadata", {}) doc_metadata = doc.get("metadata", {})
doc_authorized_identities = doc_metadata.get("authorized_identities", []) doc_authorized_identities = doc_metadata.get("authorized_identities", [])
doc_source_path = get_full_path( if loader_details["loader"] == "SharePointLoader":
doc_metadata.get( doc_source_path = get_full_path(
"full_path", doc_metadata.get("source", loader_details["source_path"])
doc_metadata.get("source", source_path), )
else:
doc_source_path = get_full_path(
doc_metadata.get(
"full_path",
doc_metadata.get("source", source_path),
)
) )
)
doc_source_owner = doc_metadata.get( doc_source_owner = doc_metadata.get(
"owner", get_file_owner_from_path(doc_source_path) "owner", get_file_owner_from_path(doc_source_path)
) )
@ -710,6 +720,12 @@ class PebbloLoaderAPIWrapper(BaseModel):
), ),
} }
) )
if (
loader_details["loader"] == "SharePointLoader"
and not source_path_update
):
loader_details["source_path"] = doc_metadata.get("source_full_url")
source_path_update = True
return docs, source_aggregate_size return docs, source_aggregate_size
@staticmethod @staticmethod