mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-18 09:01:03 +00:00
Add excludes to FileSystemBlobLoader (#9064)
Add option to specify exclude patterns. https://github.com/langchain-ai/langchain/discussions/9059
This commit is contained in:
parent
6c70f491ba
commit
b7bc8ec87f
@ -54,6 +54,7 @@ class FileSystemBlobLoader(BlobLoader):
|
|||||||
path: Union[str, Path],
|
path: Union[str, Path],
|
||||||
*,
|
*,
|
||||||
glob: str = "**/[!.]*",
|
glob: str = "**/[!.]*",
|
||||||
|
exclude: Sequence[str] = (),
|
||||||
suffixes: Optional[Sequence[str]] = None,
|
suffixes: Optional[Sequence[str]] = None,
|
||||||
show_progress: bool = False,
|
show_progress: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -63,6 +64,7 @@ class FileSystemBlobLoader(BlobLoader):
|
|||||||
path: Path to directory to load from
|
path: Path to directory to load from
|
||||||
glob: Glob pattern relative to the specified path
|
glob: Glob pattern relative to the specified path
|
||||||
by default set to pick up all non-hidden files
|
by default set to pick up all non-hidden files
|
||||||
|
exclude: Glob patterns for paths to exclude from the results
|
||||||
suffixes: Provide to keep only files with these suffixes
|
suffixes: Provide to keep only files with these suffixes
|
||||||
Useful when wanting to keep files with different suffixes
|
Useful when wanting to keep files with different suffixes
|
||||||
Suffixes must include the dot, e.g. ".txt"
|
Suffixes must include the dot, e.g. ".txt"
|
||||||
@ -77,11 +79,21 @@ class FileSystemBlobLoader(BlobLoader):
|
|||||||
# Recursively load all text files in a directory.
|
# Recursively load all text files in a directory.
|
||||||
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
|
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
|
||||||
|
|
||||||
|
# Recursively load all files in a directory, except for py or pyc files.
|
||||||
|
loader = FileSystemBlobLoader(
|
||||||
|
"/path/to/directory",
|
||||||
|
glob="**/*",
|
||||||
|
exclude=["**/*.py", "**/*.pyc"]
|
||||||
|
)
|
||||||
|
|
||||||
# Recursively load all non-hidden files in a directory.
|
# Recursively load all non-hidden files in a directory.
|
||||||
loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")
|
loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")
|
||||||
|
|
||||||
# Load all files in a directory without recursion.
|
# Load all files in a directory without recursion.
|
||||||
loader = FileSystemBlobLoader("/path/to/directory", glob="*")
|
loader = FileSystemBlobLoader("/path/to/directory", glob="*")
|
||||||
|
|
||||||
|
# Load all files in a directory without recursion.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if isinstance(path, Path):
|
if isinstance(path, Path):
|
||||||
_path = path
|
_path = path
|
||||||
@ -94,6 +106,7 @@ class FileSystemBlobLoader(BlobLoader):
|
|||||||
self.glob = glob
|
self.glob = glob
|
||||||
self.suffixes = set(suffixes or [])
|
self.suffixes = set(suffixes or [])
|
||||||
self.show_progress = show_progress
|
self.show_progress = show_progress
|
||||||
|
self.exclude = exclude
|
||||||
|
|
||||||
def yield_blobs(
|
def yield_blobs(
|
||||||
self,
|
self,
|
||||||
@ -110,6 +123,9 @@ class FileSystemBlobLoader(BlobLoader):
|
|||||||
"""Yield paths that match the requested pattern."""
|
"""Yield paths that match the requested pattern."""
|
||||||
paths = self.path.glob(self.glob)
|
paths = self.path.glob(self.glob)
|
||||||
for path in paths:
|
for path in paths:
|
||||||
|
if self.exclude:
|
||||||
|
if any(path.match(glob) for glob in self.exclude):
|
||||||
|
continue
|
||||||
if path.is_file():
|
if path.is_file():
|
||||||
if self.suffixes and path.suffix not in self.suffixes:
|
if self.suffixes and path.suffix not in self.suffixes:
|
||||||
continue
|
continue
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Generator, Sequence
|
from typing import Generator
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -42,56 +42,98 @@ def toy_dir() -> Generator[Path, None, None]:
|
|||||||
yield Path(temp_dir)
|
yield Path(temp_dir)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
_TEST_CASES = [
|
||||||
"glob, suffixes, relative_filenames",
|
{
|
||||||
[
|
"glob": "**/[!.]*",
|
||||||
(
|
"suffixes": None,
|
||||||
"**/[!.]*",
|
"exclude": (),
|
||||||
None,
|
"relative_filenames": [
|
||||||
[
|
|
||||||
"test.html",
|
"test.html",
|
||||||
"test.txt",
|
"test.txt",
|
||||||
"some_dir/nested_file.txt",
|
"some_dir/nested_file.txt",
|
||||||
"some_dir/other_dir/more_nested.txt",
|
"some_dir/other_dir/more_nested.txt",
|
||||||
],
|
],
|
||||||
),
|
},
|
||||||
("*", None, ["test.html", "test.txt", ".hidden_file"]),
|
{
|
||||||
("**/*.html", None, ["test.html"]),
|
"glob": "*",
|
||||||
("*/*.txt", None, ["some_dir/nested_file.txt"]),
|
"suffixes": None,
|
||||||
(
|
"exclude": (),
|
||||||
"**/*.txt",
|
"relative_filenames": ["test.html", "test.txt", ".hidden_file"],
|
||||||
None,
|
},
|
||||||
[
|
{
|
||||||
|
"glob": "**/*.html",
|
||||||
|
"suffixes": None,
|
||||||
|
"exclude": (),
|
||||||
|
"relative_filenames": ["test.html"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"glob": "*/*.txt",
|
||||||
|
"suffixes": None,
|
||||||
|
"exclude": (),
|
||||||
|
"relative_filenames": ["some_dir/nested_file.txt"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"glob": "**/*.txt",
|
||||||
|
"suffixes": None,
|
||||||
|
"exclude": (),
|
||||||
|
"relative_filenames": [
|
||||||
"test.txt",
|
"test.txt",
|
||||||
"some_dir/nested_file.txt",
|
"some_dir/nested_file.txt",
|
||||||
"some_dir/other_dir/more_nested.txt",
|
"some_dir/other_dir/more_nested.txt",
|
||||||
],
|
],
|
||||||
),
|
},
|
||||||
(
|
{
|
||||||
"**/*",
|
"glob": "**/*",
|
||||||
[".txt"],
|
"suffixes": [".txt"],
|
||||||
[
|
"exclude": (),
|
||||||
|
"relative_filenames": [
|
||||||
"test.txt",
|
"test.txt",
|
||||||
"some_dir/nested_file.txt",
|
"some_dir/nested_file.txt",
|
||||||
"some_dir/other_dir/more_nested.txt",
|
"some_dir/other_dir/more_nested.txt",
|
||||||
],
|
],
|
||||||
),
|
},
|
||||||
("meeeeeeow", None, []),
|
{
|
||||||
("*", [".html", ".txt"], ["test.html", "test.txt"]),
|
"glob": "meeeeeeow",
|
||||||
],
|
"suffixes": None,
|
||||||
)
|
"exclude": (),
|
||||||
def test_file_names_exist(
|
"relative_filenames": [],
|
||||||
toy_dir: str,
|
},
|
||||||
glob: str,
|
{
|
||||||
suffixes: Sequence[str],
|
"glob": "*",
|
||||||
relative_filenames: Sequence[str],
|
"suffixes": [".html", ".txt"],
|
||||||
) -> None:
|
"exclude": (),
|
||||||
|
"relative_filenames": ["test.html", "test.txt"],
|
||||||
|
},
|
||||||
|
# Using exclude patterns
|
||||||
|
{
|
||||||
|
"glob": "**/*",
|
||||||
|
"suffixes": [".txt"],
|
||||||
|
"exclude": ("some_dir/*",),
|
||||||
|
"relative_filenames": ["test.txt", "some_dir/other_dir/more_nested.txt"],
|
||||||
|
},
|
||||||
|
# Using 2 exclude patterns, one of which is recursive
|
||||||
|
{
|
||||||
|
"glob": "**/*",
|
||||||
|
"suffixes": None,
|
||||||
|
"exclude": ("**/*.txt", ".hidden*"),
|
||||||
|
"relative_filenames": ["test.html"],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("params", _TEST_CASES)
|
||||||
|
def test_file_names_exist(toy_dir: str, params: dict) -> None:
|
||||||
"""Verify that the file names exist."""
|
"""Verify that the file names exist."""
|
||||||
|
|
||||||
loader = FileSystemBlobLoader(toy_dir, glob=glob, suffixes=suffixes)
|
glob_pattern = params["glob"]
|
||||||
blobs = list(loader.yield_blobs())
|
suffixes = params["suffixes"]
|
||||||
|
exclude = params["exclude"]
|
||||||
|
relative_filenames = params["relative_filenames"]
|
||||||
|
|
||||||
assert loader.count_matching_files() == len(relative_filenames)
|
loader = FileSystemBlobLoader(
|
||||||
|
toy_dir, glob=glob_pattern, suffixes=suffixes, exclude=exclude
|
||||||
|
)
|
||||||
|
blobs = list(loader.yield_blobs())
|
||||||
|
|
||||||
file_names = sorted(str(blob.path) for blob in blobs)
|
file_names = sorted(str(blob.path) for blob in blobs)
|
||||||
|
|
||||||
@ -101,6 +143,7 @@ def test_file_names_exist(
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert file_names == expected_filenames
|
assert file_names == expected_filenames
|
||||||
|
assert loader.count_matching_files() == len(relative_filenames)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.requires("tqdm")
|
@pytest.mark.requires("tqdm")
|
||||||
|
Loading…
Reference in New Issue
Block a user