Add excludes to FileSystemBlobLoader (#9064)

Add option to specify exclude patterns.

https://github.com/langchain-ai/langchain/discussions/9059
This commit is contained in:
Eugene Yurtsev 2023-08-10 14:56:58 -04:00 committed by GitHub
parent 6c70f491ba
commit b7bc8ec87f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 107 additions and 48 deletions

View File

@ -54,6 +54,7 @@ class FileSystemBlobLoader(BlobLoader):
path: Union[str, Path], path: Union[str, Path],
*, *,
glob: str = "**/[!.]*", glob: str = "**/[!.]*",
exclude: Sequence[str] = (),
suffixes: Optional[Sequence[str]] = None, suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False, show_progress: bool = False,
) -> None: ) -> None:
@ -63,6 +64,7 @@ class FileSystemBlobLoader(BlobLoader):
path: Path to directory to load from path: Path to directory to load from
glob: Glob pattern relative to the specified path glob: Glob pattern relative to the specified path
by default set to pick up all non-hidden files by default set to pick up all non-hidden files
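exclude: Glob patterns for paths to exclude from the results. A path is
skipped if it matches any of the patterns. Patterns are tested with
`Path.match`, so relative patterns are matched from the right.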
suffixes: Provide to keep only files with these suffixes suffixes: Provide to keep only files with these suffixes
Useful when wanting to keep files with different suffixes Useful when wanting to keep files with different suffixes
Suffixes must include the dot, e.g. ".txt" Suffixes must include the dot, e.g. ".txt"
@ -77,11 +79,21 @@ class FileSystemBlobLoader(BlobLoader):
# Recursively load all text files in a directory. # Recursively load all text files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt") loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
# Recursively load all files in a directory, except for py or pyc files.
loader = FileSystemBlobLoader(
"/path/to/directory",
glob="**/*",
exclude=["**/*.py", "**/*.pyc"]
)
# Recursively load all non-hidden files in a directory. # Recursively load all non-hidden files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*") loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")
# Load all files in a directory without recursion. # Load all files in a directory without recursion.
loader = FileSystemBlobLoader("/path/to/directory", glob="*") loader = FileSystemBlobLoader("/path/to/directory", glob="*")
# Load all files in a directory without recursion.
""" """
if isinstance(path, Path): if isinstance(path, Path):
_path = path _path = path
@ -94,6 +106,7 @@ class FileSystemBlobLoader(BlobLoader):
self.glob = glob self.glob = glob
self.suffixes = set(suffixes or []) self.suffixes = set(suffixes or [])
self.show_progress = show_progress self.show_progress = show_progress
self.exclude = exclude
def yield_blobs( def yield_blobs(
self, self,
@ -110,6 +123,9 @@ class FileSystemBlobLoader(BlobLoader):
"""Yield paths that match the requested pattern.""" """Yield paths that match the requested pattern."""
paths = self.path.glob(self.glob) paths = self.path.glob(self.glob)
for path in paths: for path in paths:
if self.exclude:
if any(path.match(glob) for glob in self.exclude):
continue
if path.is_file(): if path.is_file():
if self.suffixes and path.suffix not in self.suffixes: if self.suffixes and path.suffix not in self.suffixes:
continue continue

View File

@ -2,7 +2,7 @@
import os import os
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from typing import Generator, Sequence from typing import Generator
import pytest import pytest
@ -42,56 +42,98 @@ def toy_dir() -> Generator[Path, None, None]:
yield Path(temp_dir) yield Path(temp_dir)
@pytest.mark.parametrize( _TEST_CASES = [
"glob, suffixes, relative_filenames", {
[ "glob": "**/[!.]*",
( "suffixes": None,
"**/[!.]*", "exclude": (),
None, "relative_filenames": [
[ "test.html",
"test.html", "test.txt",
"test.txt", "some_dir/nested_file.txt",
"some_dir/nested_file.txt", "some_dir/other_dir/more_nested.txt",
"some_dir/other_dir/more_nested.txt", ],
], },
), {
("*", None, ["test.html", "test.txt", ".hidden_file"]), "glob": "*",
("**/*.html", None, ["test.html"]), "suffixes": None,
("*/*.txt", None, ["some_dir/nested_file.txt"]), "exclude": (),
( "relative_filenames": ["test.html", "test.txt", ".hidden_file"],
"**/*.txt", },
None, {
[ "glob": "**/*.html",
"test.txt", "suffixes": None,
"some_dir/nested_file.txt", "exclude": (),
"some_dir/other_dir/more_nested.txt", "relative_filenames": ["test.html"],
], },
), {
( "glob": "*/*.txt",
"**/*", "suffixes": None,
[".txt"], "exclude": (),
[ "relative_filenames": ["some_dir/nested_file.txt"],
"test.txt", },
"some_dir/nested_file.txt", {
"some_dir/other_dir/more_nested.txt", "glob": "**/*.txt",
], "suffixes": None,
), "exclude": (),
("meeeeeeow", None, []), "relative_filenames": [
("*", [".html", ".txt"], ["test.html", "test.txt"]), "test.txt",
], "some_dir/nested_file.txt",
) "some_dir/other_dir/more_nested.txt",
def test_file_names_exist( ],
toy_dir: str, },
glob: str, {
suffixes: Sequence[str], "glob": "**/*",
relative_filenames: Sequence[str], "suffixes": [".txt"],
) -> None: "exclude": (),
"relative_filenames": [
"test.txt",
"some_dir/nested_file.txt",
"some_dir/other_dir/more_nested.txt",
],
},
{
"glob": "meeeeeeow",
"suffixes": None,
"exclude": (),
"relative_filenames": [],
},
{
"glob": "*",
"suffixes": [".html", ".txt"],
"exclude": (),
"relative_filenames": ["test.html", "test.txt"],
},
# Using exclude patterns
{
"glob": "**/*",
"suffixes": [".txt"],
"exclude": ("some_dir/*",),
"relative_filenames": ["test.txt", "some_dir/other_dir/more_nested.txt"],
},
# Using 2 exclude patterns, one of which is recursive
{
"glob": "**/*",
"suffixes": None,
"exclude": ("**/*.txt", ".hidden*"),
"relative_filenames": ["test.html"],
},
]
@pytest.mark.parametrize("params", _TEST_CASES)
def test_file_names_exist(toy_dir: str, params: dict) -> None:
"""Verify that the file names exist.""" """Verify that the file names exist."""
loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes) glob_pattern = params["glob"]
blobs = list(loader.yield_blobs()) suffixes = params["suffixes"]
exclude = params["exclude"]
relative_filenames = params["relative_filenames"]
assert loader.count_matching_files() == len(relative_filenames) loader = FileSystemBlobLoader(
toy_dir, glob=glob_pattern, suffixes=suffixes, exclude=exclude
)
blobs = list(loader.yield_blobs())
file_names = sorted(str(blob.path) for blob in blobs) file_names = sorted(str(blob.path) for blob in blobs)
@ -101,6 +143,7 @@ def test_file_names_exist(
) )
assert file_names == expected_filenames assert file_names == expected_filenames
assert loader.count_matching_files() == len(relative_filenames)
@pytest.mark.requires("tqdm") @pytest.mark.requires("tqdm")