Add exclude to GenericLoader.from_filesystem (#9539)

Support an `exclude` parameter in GenericLoader.from_filesystem; the patterns are forwarded to the underlying FileSystemBlobLoader.

---------

Co-authored-by: Kyle Pancamo <50267605+KylePancamo@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Eugene Yurtsev 2023-08-23 19:09:10 -04:00 committed by GitHub
parent 278ef0bdcf
commit cd81e8a8f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 3 deletions

View File

@ -44,6 +44,7 @@ class ConcurrentLoader(GenericLoader):
path: _PathLike, path: _PathLike,
*, *,
glob: str = "**/[!.]*", glob: str = "**/[!.]*",
exclude: Sequence[str] = (),
suffixes: Optional[Sequence[str]] = None, suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False, show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default", parser: Union[DEFAULT, BaseBlobParser] = "default",
@ -52,12 +53,28 @@ class ConcurrentLoader(GenericLoader):
""" """
Create a concurrent generic document loader using a Create a concurrent generic document loader using a
filesystem blob loader. filesystem blob loader.
Args:
path: The path to the directory to load documents from.
glob: The glob pattern to use to find documents.
suffixes: The suffixes to use to filter documents. If None, all files
matching the glob will be loaded.
exclude: A list of patterns to exclude from the loader.
show_progress: Whether to show a progress bar or not (requires tqdm).
Proxies to the file system loader.
parser: A blob parser which knows how to parse blobs into documents
num_workers: Max number of concurrent workers to use.
""" """
blob_loader = FileSystemBlobLoader( blob_loader = FileSystemBlobLoader(
path, glob=glob, suffixes=suffixes, show_progress=show_progress path,
glob=glob,
exclude=exclude,
suffixes=suffixes,
show_progress=show_progress,
) )
if isinstance(parser, str): if isinstance(parser, str):
blob_parser = get_parser(parser) blob_parser = get_parser(parser)
else: else:
blob_parser = parser blob_parser = parser
return cls(blob_loader, blob_parser, num_workers) return cls(blob_loader, blob_parser, num_workers=num_workers)

View File

@ -105,6 +105,7 @@ class GenericLoader(BaseLoader):
path: _PathLike, path: _PathLike,
*, *,
glob: str = "**/[!.]*", glob: str = "**/[!.]*",
exclude: Sequence[str] = (),
suffixes: Optional[Sequence[str]] = None, suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False, show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default", parser: Union[DEFAULT, BaseBlobParser] = "default",
@ -116,6 +117,7 @@ class GenericLoader(BaseLoader):
glob: The glob pattern to use to find documents. glob: The glob pattern to use to find documents.
suffixes: The suffixes to use to filter documents. If None, all files suffixes: The suffixes to use to filter documents. If None, all files
matching the glob will be loaded. matching the glob will be loaded.
exclude: A list of patterns to exclude from the loader.
show_progress: Whether to show a progress bar or not (requires tqdm). show_progress: Whether to show a progress bar or not (requires tqdm).
Proxies to the file system loader. Proxies to the file system loader.
parser: A blob parser which knows how to parse blobs into documents parser: A blob parser which knows how to parse blobs into documents
@ -124,7 +126,11 @@ class GenericLoader(BaseLoader):
A generic document loader. A generic document loader.
""" """
blob_loader = FileSystemBlobLoader( blob_loader = FileSystemBlobLoader(
path, glob=glob, suffixes=suffixes, show_progress=show_progress path,
glob=glob,
exclude=exclude,
suffixes=suffixes,
show_progress=show_progress,
) )
if isinstance(parser, str): if isinstance(parser, str):
blob_parser = get_parser(parser) blob_parser = get_parser(parser)