diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index 772a206a803..becb5d54859 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -252,11 +252,16 @@ class PebbloSafeLoader(BaseLoader): """Add Pebblo specific metadata to documents.""" for doc in self.docs_with_id: doc_metadata = doc.metadata - doc_metadata["full_path"] = get_full_path( - doc_metadata.get( - "full_path", doc_metadata.get("source", self.source_path) + if self.loader.__class__.__name__ == "SharePointLoader": + doc_metadata["full_path"] = get_full_path( + doc_metadata.get("source", self.source_path) + ) + else: + doc_metadata["full_path"] = get_full_path( + doc_metadata.get( + "full_path", doc_metadata.get("source", self.source_path) + ) ) - ) doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get( "pb_checksum", None ) diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py index 9e7f2b60d56..7f0dcfc5343 100644 --- a/libs/community/langchain_community/utilities/pebblo.py +++ b/libs/community/langchain_community/utilities/pebblo.py @@ -488,7 +488,7 @@ class PebbloLoaderAPIWrapper(BaseModel): source_owner = get_file_owner_from_path(source_path) # Prepare docs for classification docs, source_aggregate_size = self.prepare_docs_for_classification( - docs_with_id, source_path + docs_with_id, source_path, loader_details ) # Build payload for classification payload = self.build_classification_payload( @@ -659,7 +659,9 @@ class PebbloLoaderAPIWrapper(BaseModel): @staticmethod def prepare_docs_for_classification( - docs_with_id: List[IndexedDocument], source_path: str + docs_with_id: List[IndexedDocument], + source_path: str, + loader_details: dict, ) -> Tuple[List[dict], int]: """ Prepare documents for classification. @@ -667,22 +669,30 @@ class PebbloLoaderAPIWrapper(BaseModel): Args: docs_with_id (List[IndexedDocument]): List of documents to be classified. source_path (str): Source path of the documents. + loader_details (dict): Contains loader info. Returns: - Tuple[List[dict], int]: Documents and the aggregate size of the source. + Tuple[List[dict], int]: Documents and the aggregate size + of the source. """ docs = [] source_aggregate_size = 0 doc_content = [doc.dict() for doc in docs_with_id] + source_path_update = False for doc in doc_content: doc_metadata = doc.get("metadata", {}) doc_authorized_identities = doc_metadata.get("authorized_identities", []) - doc_source_path = get_full_path( - doc_metadata.get( - "full_path", - doc_metadata.get("source", source_path), + if loader_details["loader"] == "SharePointLoader": + doc_source_path = get_full_path( + doc_metadata.get("source", loader_details["source_path"]) + ) + else: + doc_source_path = get_full_path( + doc_metadata.get( + "full_path", + doc_metadata.get("source", source_path), + ) ) - ) doc_source_owner = doc_metadata.get( "owner", get_file_owner_from_path(doc_source_path) ) @@ -710,6 +720,12 @@ class PebbloLoaderAPIWrapper(BaseModel): ), } ) + if ( + loader_details["loader"] == "SharePointLoader" + and not source_path_update + ): + loader_details["source_path"] = doc_metadata.get("source_full_url") + source_path_update = True return docs, source_aggregate_size @staticmethod