mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 13:54:48 +00:00
community[minor]: Add support for non-file-based Document Loaders in PebbloSafeLoader (#19574)
**Description:** PebbloSafeLoader: Add support for non-file-based Document Loaders. This pull request enhances PebbloSafeLoader by introducing support for several non-file-based Document Loaders. With this update, PebbloSafeLoader now seamlessly integrates with the following loaders: GoogleDriveLoader, SlackDirectoryLoader, and UnstructuredEmailLoader. **Issue:** NA **Dependencies:** None **Twitter handle:** @Raj__725 --------- Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
This commit is contained in:
parent
9954c6a38e
commit
0019d8a948
@ -125,7 +125,9 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
doc_content = [doc.dict() for doc in self.docs]
|
doc_content = [doc.dict() for doc in self.docs]
|
||||||
docs = []
|
docs = []
|
||||||
for doc in doc_content:
|
for doc in doc_content:
|
||||||
doc_source_path = get_full_path(doc.get("metadata", {}).get("source"))
|
doc_source_path = get_full_path(
|
||||||
|
doc.get("metadata", {}).get("source", self.source_path)
|
||||||
|
)
|
||||||
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
|
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
|
||||||
doc_source_path
|
doc_source_path
|
||||||
)
|
)
|
||||||
|
@ -29,11 +29,28 @@ file_loader = [
|
|||||||
"AmazonTextractPDFLoader",
|
"AmazonTextractPDFLoader",
|
||||||
"CSVLoader",
|
"CSVLoader",
|
||||||
"UnstructuredExcelLoader",
|
"UnstructuredExcelLoader",
|
||||||
|
"UnstructuredEmailLoader",
|
||||||
|
]
|
||||||
|
dir_loader = [
|
||||||
|
"DirectoryLoader",
|
||||||
|
"S3DirLoader",
|
||||||
|
"SlackDirectoryLoader",
|
||||||
|
"PyPDFDirectoryLoader",
|
||||||
|
"NotionDirectoryLoader",
|
||||||
]
|
]
|
||||||
dir_loader = ["DirectoryLoader", "S3DirLoader", "PyPDFDirectoryLoader"]
|
|
||||||
in_memory = ["DataFrameLoader"]
|
|
||||||
|
|
||||||
LOADER_TYPE_MAPPING = {"file": file_loader, "dir": dir_loader, "in-memory": in_memory}
|
in_memory = ["DataFrameLoader"]
|
||||||
|
remote_db = [
|
||||||
|
"NotionDBLoader",
|
||||||
|
"GoogleDriveLoader",
|
||||||
|
]
|
||||||
|
|
||||||
|
LOADER_TYPE_MAPPING = {
|
||||||
|
"file": file_loader,
|
||||||
|
"dir": dir_loader,
|
||||||
|
"in-memory": in_memory,
|
||||||
|
"remote_db": remote_db,
|
||||||
|
}
|
||||||
|
|
||||||
SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
|
SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
|
||||||
|
|
||||||
@ -159,7 +176,7 @@ def get_loader_type(loader: str) -> str:
|
|||||||
for loader_type, loaders in LOADER_TYPE_MAPPING.items():
|
for loader_type, loaders in LOADER_TYPE_MAPPING.items():
|
||||||
if loader in loaders:
|
if loader in loaders:
|
||||||
return loader_type
|
return loader_type
|
||||||
return "unknown"
|
return "unsupported"
|
||||||
|
|
||||||
|
|
||||||
def get_loader_full_path(loader: BaseLoader) -> str:
|
def get_loader_full_path(loader: BaseLoader) -> str:
|
||||||
@ -172,6 +189,7 @@ def get_loader_full_path(loader: BaseLoader) -> str:
|
|||||||
from langchain_community.document_loaders import (
|
from langchain_community.document_loaders import (
|
||||||
DataFrameLoader,
|
DataFrameLoader,
|
||||||
GCSFileLoader,
|
GCSFileLoader,
|
||||||
|
NotionDBLoader,
|
||||||
S3FileLoader,
|
S3FileLoader,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -188,15 +206,25 @@ def get_loader_full_path(loader: BaseLoader) -> str:
|
|||||||
location = f"gc://{loader.bucket}/{loader.blob}"
|
location = f"gc://{loader.bucket}/{loader.blob}"
|
||||||
elif isinstance(loader, S3FileLoader):
|
elif isinstance(loader, S3FileLoader):
|
||||||
location = f"s3://{loader.bucket}/{loader.key}"
|
location = f"s3://{loader.bucket}/{loader.key}"
|
||||||
|
elif "source" in loader_dict:
|
||||||
|
location = loader_dict["source"]
|
||||||
|
if location and "channel" in loader_dict:
|
||||||
|
channel = loader_dict["channel"]
|
||||||
|
if channel:
|
||||||
|
location = f"{location}/{channel}"
|
||||||
elif "path" in loader_dict:
|
elif "path" in loader_dict:
|
||||||
location = loader_dict["path"]
|
location = loader_dict["path"]
|
||||||
elif "file_path" in loader_dict:
|
elif "file_path" in loader_dict:
|
||||||
location = loader_dict["file_path"]
|
location = loader_dict["file_path"]
|
||||||
elif "web_paths" in loader_dict:
|
elif "web_paths" in loader_dict:
|
||||||
location = loader_dict["web_paths"][0]
|
web_paths = loader_dict["web_paths"]
|
||||||
|
if web_paths and isinstance(web_paths, list) and len(web_paths) > 0:
|
||||||
|
location = web_paths[0]
|
||||||
# For in-memory types:
|
# For in-memory types:
|
||||||
elif isinstance(loader, DataFrameLoader):
|
elif isinstance(loader, DataFrameLoader):
|
||||||
location = "in-memory"
|
location = "in-memory"
|
||||||
|
elif isinstance(loader, NotionDBLoader):
|
||||||
|
location = f"notiondb://{loader.database_id}"
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return get_full_path(str(location))
|
return get_full_path(str(location))
|
||||||
|
Loading…
Reference in New Issue
Block a user