community[minor]: add exclude parameter to DirectoryLoader (#17316)

- **Description:** adds an `exclude` parameter to the DirectoryLoader
class, based on similar behavior in GenericLoader
- **Issue:** discussed in
https://github.com/langchain-ai/langchain/discussions/9059 and I think
in some other issues that I cannot find at the moment 🙇
  - **Dependencies:** None
  - **Twitter handle:** don't have one, sorry! Just https://github.com/nejch

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Nejc Habjan 2024-02-16 15:42:42 +01:00 committed by GitHub
parent 8f14234afb
commit b4fa847a90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 74 additions and 2 deletions

View File

@ -2,7 +2,7 @@ import concurrent
import logging import logging
import random import random
from pathlib import Path from pathlib import Path
from typing import Any, List, Optional, Type, Union from typing import Any, List, Optional, Sequence, Type, Union
from langchain_core.documents import Document from langchain_core.documents import Document
@ -41,6 +41,7 @@ class DirectoryLoader(BaseLoader):
use_multithreading: bool = False, use_multithreading: bool = False,
max_concurrency: int = 4, max_concurrency: int = 4,
*, *,
exclude: Union[Sequence[str], str] = (),
sample_size: int = 0, sample_size: int = 0,
randomize_sample: bool = False, randomize_sample: bool = False,
sample_seed: Union[int, None] = None, sample_seed: Union[int, None] = None,
@ -51,6 +52,8 @@ class DirectoryLoader(BaseLoader):
path: Path to directory. path: Path to directory.
glob: Glob pattern to use to find files. Defaults to "**/[!.]*" glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
(all files except hidden). (all files except hidden).
exclude: A pattern or list of patterns to exclude from results.
Use glob syntax.
silent_errors: Whether to silently ignore errors. Defaults to False. silent_errors: Whether to silently ignore errors. Defaults to False.
load_hidden: Whether to load hidden files. Defaults to False. load_hidden: Whether to load hidden files. Defaults to False.
loader_cls: Loader class to use for loading files. loader_cls: Loader class to use for loading files.
@ -64,11 +67,38 @@ class DirectoryLoader(BaseLoader):
directory. directory.
randomize_sample: Shuffle the files to get a random sample. randomize_sample: Shuffle the files to get a random sample.
sample_seed: set the seed of the random shuffle for reproducibility. sample_seed: set the seed of the random shuffle for reproducibility.
Examples:
.. code-block:: python
from langchain_community.document_loaders import DirectoryLoader
# Load all non-hidden files in a directory.
loader = DirectoryLoader("/path/to/directory")
# Load all text files in a directory without recursion.
loader = DirectoryLoader("/path/to/directory", glob="*.txt")
# Recursively load all text files in a directory.
loader = DirectoryLoader(
"/path/to/directory", glob="*.txt", recursive=True
)
# Load all files in a directory, except for py files.
loader = DirectoryLoader("/path/to/directory", exclude="*.py")
# Load all files in a directory, except for py or pyc files.
loader = DirectoryLoader(
"/path/to/directory", exclude=["*.py", "*.pyc"]
)
""" """
if loader_kwargs is None: if loader_kwargs is None:
loader_kwargs = {} loader_kwargs = {}
if isinstance(exclude, str):
exclude = (exclude,)
self.path = path self.path = path
self.glob = glob self.glob = glob
self.exclude = exclude
self.load_hidden = load_hidden self.load_hidden = load_hidden
self.loader_cls = loader_cls self.loader_cls = loader_cls
self.loader_kwargs = loader_kwargs self.loader_kwargs = loader_kwargs
@ -118,7 +148,13 @@ class DirectoryLoader(BaseLoader):
raise ValueError(f"Expected directory, got file: '{self.path}'") raise ValueError(f"Expected directory, got file: '{self.path}'")
docs: List[Document] = [] docs: List[Document] = []
items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
items = [
path
for path in paths
if not (self.exclude and any(path.match(glob) for glob in self.exclude))
]
if self.sample_size > 0: if self.sample_size > 0:
if self.randomize_sample: if self.randomize_sample:

View File

@ -1,4 +1,8 @@
from pathlib import Path
from typing import Any, List
import pytest import pytest
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader from langchain_community.document_loaders import DirectoryLoader
@ -17,3 +21,35 @@ def test_raise_error_if_path_is_not_directory() -> None:
loader.load() loader.load()
assert str(e.value) == f"Expected directory, got file: '{__file__}'" assert str(e.value) == f"Expected directory, got file: '{__file__}'"
class CustomLoader:
    """Minimal loader used by the tests.

    Mirrors the constructor/``load`` shape of the existing file loaders so it
    can be passed as ``loader_cls``.
    """

    def __init__(self, path: Path, **kwargs: Any) -> None:
        """Store the file location; any extra keyword arguments are ignored."""
        self.path = path

    def load(self) -> List[Document]:
        """Read the file as text and return it wrapped in a single Document."""
        with open(self.path, "r") as handle:
            contents = handle.read()
        return [Document(page_content=contents)]
def test_exclude_ignores_matching_files(tmp_path: Path) -> None:
    """Files matching an ``exclude`` pattern are left out of the loaded docs."""
    # Create one file that matches the exclude pattern and one that does not.
    for filename in ("test.txt", "test.py"):
        (tmp_path / filename).touch()

    directory_loader = DirectoryLoader(
        str(tmp_path),
        exclude=["*.py"],
        loader_cls=CustomLoader,  # type: ignore
    )

    documents = directory_loader.load()
    assert len(documents) == 1
def test_exclude_as_string_converts_to_sequence() -> None:
    """A single string given as ``exclude`` is stored as a one-element tuple."""
    directory_loader = DirectoryLoader("./some_directory", exclude="*.py")
    expected = ("*.py",)
    assert directory_loader.exclude == expected