From b4fa847a904d3b3ce7edb0952d95d87ae66ae8f4 Mon Sep 17 00:00:00 2001 From: Nejc Habjan Date: Fri, 16 Feb 2024 15:42:42 +0100 Subject: [PATCH] community[minor]: add exclude parameter to DirectoryLoader (#17316) - **Description:** adds an `exclude` parameter to the DirectoryLoader class, based on similar behavior in GenericLoader - **Issue:** discussed in https://github.com/langchain-ai/langchain/discussions/9059 and I think in some other issues that I cannot find at the moment :bow: - **Dependencies:** None - **Twitter handle:** don't have one sorry! Just https://github.com/nejch --------- Co-authored-by: Eugene Yurtsev --- .../document_loaders/directory.py | 40 ++++++++++++++++++- .../document_loaders/test_directory.py | 36 +++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/directory.py b/libs/community/langchain_community/document_loaders/directory.py index 7cfa456487c..3cb2ad1309a 100644 --- a/libs/community/langchain_community/document_loaders/directory.py +++ b/libs/community/langchain_community/document_loaders/directory.py @@ -2,7 +2,7 @@ import concurrent import logging import random from pathlib import Path -from typing import Any, List, Optional, Type, Union +from typing import Any, List, Optional, Sequence, Type, Union from langchain_core.documents import Document @@ -41,6 +41,7 @@ class DirectoryLoader(BaseLoader): use_multithreading: bool = False, max_concurrency: int = 4, *, + exclude: Union[Sequence[str], str] = (), sample_size: int = 0, randomize_sample: bool = False, sample_seed: Union[int, None] = None, @@ -51,6 +52,8 @@ class DirectoryLoader(BaseLoader): path: Path to directory. glob: Glob pattern to use to find files. Defaults to "**/[!.]*" (all files except hidden). + exclude: A pattern or list of patterns to exclude from results. + Use glob syntax. silent_errors: Whether to silently ignore errors. Defaults to False. load_hidden: Whether to load hidden files. 
Defaults to False. loader_cls: Loader class to use for loading files. @@ -64,11 +67,38 @@ class DirectoryLoader(BaseLoader): directory. randomize_sample: Shuffle the files to get a random sample. sample_seed: set the seed of the random shuffle for reproducibility. + + Examples: + + .. code-block:: python + from langchain_community.document_loaders import DirectoryLoader + + # Load all non-hidden files in a directory. + loader = DirectoryLoader("/path/to/directory") + + # Load all text files in a directory without recursion. + loader = DirectoryLoader("/path/to/directory", glob="*.txt") + + # Recursively load all text files in a directory. + loader = DirectoryLoader( + "/path/to/directory", glob="*.txt", recursive=True + ) + + # Load all files in a directory, except for py files. + loader = DirectoryLoader("/path/to/directory", exclude="*.py") + + # Load all files in a directory, except for py or pyc files. + loader = DirectoryLoader( + "/path/to/directory", exclude=["*.py", "*.pyc"] + ) """ if loader_kwargs is None: loader_kwargs = {} + if isinstance(exclude, str): + exclude = (exclude,) self.path = path self.glob = glob + self.exclude = exclude self.load_hidden = load_hidden self.loader_cls = loader_cls self.loader_kwargs = loader_kwargs @@ -118,7 +148,13 @@ class DirectoryLoader(BaseLoader): raise ValueError(f"Expected directory, got file: '{self.path}'") docs: List[Document] = [] - items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob)) + + paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob) + items = [ + path + for path in paths + if not (self.exclude and any(path.match(glob) for glob in self.exclude)) + ] if self.sample_size > 0: if self.randomize_sample: diff --git a/libs/community/tests/unit_tests/document_loaders/test_directory.py b/libs/community/tests/unit_tests/document_loaders/test_directory.py index dc028e4814c..f83e4bc2dfe 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_directory.py +++ 
b/libs/community/tests/unit_tests/document_loaders/test_directory.py @@ -1,4 +1,8 @@ +from pathlib import Path +from typing import Any, List + import pytest +from langchain_core.documents import Document from langchain_community.document_loaders import DirectoryLoader @@ -17,3 +21,35 @@ def test_raise_error_if_path_is_not_directory() -> None: loader.load() assert str(e.value) == f"Expected directory, got file: '{__file__}'" + + +class CustomLoader: + """Test loader. Mimics interface of existing file loader.""" + + def __init__(self, path: Path, **kwargs: Any) -> None: + """Initialize the loader.""" + self.path = path + + def load(self) -> List[Document]: + """Load documents.""" + with open(self.path, "r") as f: + return [Document(page_content=f.read())] + + +def test_exclude_ignores_matching_files(tmp_path: Path) -> None: + txt_file = tmp_path / "test.txt" + py_file = tmp_path / "test.py" + txt_file.touch() + py_file.touch() + loader = DirectoryLoader( + str(tmp_path), + exclude=["*.py"], + loader_cls=CustomLoader, # type: ignore + ) + data = loader.load() + assert len(data) == 1 + + +def test_exclude_as_string_converts_to_sequence() -> None: + loader = DirectoryLoader("./some_directory", exclude="*.py") + assert loader.exclude == ("*.py",)