mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 21:33:51 +00:00
community[minor]: Add support for non-file-based Document Loaders in PebbloSafeLoader (#19574)
**Description:** PebbloSafeLoader: Add support for non-file-based Document Loaders

This pull request enhances PebbloSafeLoader by introducing support for several non-file-based Document Loaders. With this update, PebbloSafeLoader now seamlessly integrates with the following loaders:

- GoogleDriveLoader
- SlackDirectoryLoader
- UnstructuredEmailLoader

**Issue:** NA

**Dependencies:**

- None

**Twitter handle:** @Raj__725

---------

Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
This commit is contained in:
parent
9954c6a38e
commit
0019d8a948
@ -125,7 +125,9 @@ class PebbloSafeLoader(BaseLoader):
|
||||
doc_content = [doc.dict() for doc in self.docs]
|
||||
docs = []
|
||||
for doc in doc_content:
|
||||
doc_source_path = get_full_path(doc.get("metadata", {}).get("source"))
|
||||
doc_source_path = get_full_path(
|
||||
doc.get("metadata", {}).get("source", self.source_path)
|
||||
)
|
||||
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
|
||||
doc_source_path
|
||||
)
|
||||
|
@ -29,11 +29,28 @@ file_loader = [
|
||||
"AmazonTextractPDFLoader",
|
||||
"CSVLoader",
|
||||
"UnstructuredExcelLoader",
|
||||
"UnstructuredEmailLoader",
|
||||
]
|
||||
dir_loader = [
|
||||
"DirectoryLoader",
|
||||
"S3DirLoader",
|
||||
"SlackDirectoryLoader",
|
||||
"PyPDFDirectoryLoader",
|
||||
"NotionDirectoryLoader",
|
||||
]
|
||||
dir_loader = ["DirectoryLoader", "S3DirLoader", "PyPDFDirectoryLoader"]
|
||||
in_memory = ["DataFrameLoader"]
|
||||
|
||||
LOADER_TYPE_MAPPING = {"file": file_loader, "dir": dir_loader, "in-memory": in_memory}
|
||||
in_memory = ["DataFrameLoader"]
|
||||
remote_db = [
|
||||
"NotionDBLoader",
|
||||
"GoogleDriveLoader",
|
||||
]
|
||||
|
||||
LOADER_TYPE_MAPPING = {
|
||||
"file": file_loader,
|
||||
"dir": dir_loader,
|
||||
"in-memory": in_memory,
|
||||
"remote_db": remote_db,
|
||||
}
|
||||
|
||||
SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
|
||||
|
||||
@ -159,7 +176,7 @@ def get_loader_type(loader: str) -> str:
|
||||
for loader_type, loaders in LOADER_TYPE_MAPPING.items():
|
||||
if loader in loaders:
|
||||
return loader_type
|
||||
return "unknown"
|
||||
return "unsupported"
|
||||
|
||||
|
||||
def get_loader_full_path(loader: BaseLoader) -> str:
|
||||
@ -172,6 +189,7 @@ def get_loader_full_path(loader: BaseLoader) -> str:
|
||||
from langchain_community.document_loaders import (
|
||||
DataFrameLoader,
|
||||
GCSFileLoader,
|
||||
NotionDBLoader,
|
||||
S3FileLoader,
|
||||
)
|
||||
|
||||
@ -188,15 +206,25 @@ def get_loader_full_path(loader: BaseLoader) -> str:
|
||||
location = f"gc://{loader.bucket}/{loader.blob}"
|
||||
elif isinstance(loader, S3FileLoader):
|
||||
location = f"s3://{loader.bucket}/{loader.key}"
|
||||
elif "source" in loader_dict:
|
||||
location = loader_dict["source"]
|
||||
if location and "channel" in loader_dict:
|
||||
channel = loader_dict["channel"]
|
||||
if channel:
|
||||
location = f"{location}/{channel}"
|
||||
elif "path" in loader_dict:
|
||||
location = loader_dict["path"]
|
||||
elif "file_path" in loader_dict:
|
||||
location = loader_dict["file_path"]
|
||||
elif "web_paths" in loader_dict:
|
||||
location = loader_dict["web_paths"][0]
|
||||
web_paths = loader_dict["web_paths"]
|
||||
if web_paths and isinstance(web_paths, list) and len(web_paths) > 0:
|
||||
location = web_paths[0]
|
||||
# For in-memory types:
|
||||
elif isinstance(loader, DataFrameLoader):
|
||||
location = "in-memory"
|
||||
elif isinstance(loader, NotionDBLoader):
|
||||
location = f"notiondb://{loader.database_id}"
|
||||
except Exception:
|
||||
pass
|
||||
return get_full_path(str(location))
|
||||
|
Loading…
Reference in New Issue
Block a user