From cd81e8a8f22d7f4c99db884632df769231538d2b Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 23 Aug 2023 19:09:10 -0400 Subject: [PATCH] Add exclude to GenericLoader.from_filesystem (#9539) support exclude param in GenericLoader.from_filesystem --------- Co-authored-by: Kyle Pancamo <50267605+KylePancamo@users.noreply.github.com> Co-authored-by: Bagatur --- .../langchain/document_loaders/concurrent.py | 21 +++++++++++++++++-- .../langchain/document_loaders/generic.py | 8 ++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/concurrent.py b/libs/langchain/langchain/document_loaders/concurrent.py index 2044dcc7fbb..bb55d670000 100644 --- a/libs/langchain/langchain/document_loaders/concurrent.py +++ b/libs/langchain/langchain/document_loaders/concurrent.py @@ -44,6 +44,7 @@ class ConcurrentLoader(GenericLoader): path: _PathLike, *, glob: str = "**/[!.]*", + exclude: Sequence[str] = (), suffixes: Optional[Sequence[str]] = None, show_progress: bool = False, parser: Union[DEFAULT, BaseBlobParser] = "default", @@ -52,12 +53,28 @@ class ConcurrentLoader(GenericLoader): """ Create a concurrent generic document loader using a filesystem blob loader. + + + Args: + path: The path to the directory to load documents from. + glob: The glob pattern to use to find documents. + suffixes: The suffixes to use to filter documents. If None, all files + matching the glob will be loaded. + exclude: A list of patterns to exclude from the loader. + show_progress: Whether to show a progress bar or not (requires tqdm). + Proxies to the file system loader. + parser: A blob parser which knows how to parse blobs into documents + num_workers: Max number of concurrent workers to use. 
""" blob_loader = FileSystemBlobLoader( - path, glob=glob, suffixes=suffixes, show_progress=show_progress + path, + glob=glob, + exclude=exclude, + suffixes=suffixes, + show_progress=show_progress, ) if isinstance(parser, str): blob_parser = get_parser(parser) else: blob_parser = parser - return cls(blob_loader, blob_parser, num_workers) + return cls(blob_loader, blob_parser, num_workers=num_workers) diff --git a/libs/langchain/langchain/document_loaders/generic.py b/libs/langchain/langchain/document_loaders/generic.py index 2728e0ae9fe..26d7577a332 100644 --- a/libs/langchain/langchain/document_loaders/generic.py +++ b/libs/langchain/langchain/document_loaders/generic.py @@ -105,6 +105,7 @@ class GenericLoader(BaseLoader): path: _PathLike, *, glob: str = "**/[!.]*", + exclude: Sequence[str] = (), suffixes: Optional[Sequence[str]] = None, show_progress: bool = False, parser: Union[DEFAULT, BaseBlobParser] = "default", @@ -116,6 +117,7 @@ class GenericLoader(BaseLoader): glob: The glob pattern to use to find documents. suffixes: The suffixes to use to filter documents. If None, all files matching the glob will be loaded. + exclude: A list of patterns to exclude from the loader. show_progress: Whether to show a progress bar or not (requires tqdm). Proxies to the file system loader. parser: A blob parser which knows how to parse blobs into documents @@ -124,7 +126,11 @@ class GenericLoader(BaseLoader): A generic document loader. """ blob_loader = FileSystemBlobLoader( - path, glob=glob, suffixes=suffixes, show_progress=show_progress + path, + glob=glob, + exclude=exclude, + suffixes=suffixes, + show_progress=show_progress, ) if isinstance(parser, str): blob_parser = get_parser(parser)