community[minor]: add exclude parameter to DirectoryLoader (#17316)

- **Description:** adds an `exclude` parameter to the DirectoryLoader
class, based on similar behavior in GenericLoader
- **Issue:** discussed in
https://github.com/langchain-ai/langchain/discussions/9059 and I think
in some other issues that I cannot find at the moment 🙇
  - **Dependencies:** None
  - **Twitter handle:** don't have one, sorry! Just https://github.com/nejch

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Nejc Habjan 2024-02-16 15:42:42 +01:00 committed by GitHub
parent 8f14234afb
commit b4fa847a90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 74 additions and 2 deletions

View File

@ -2,7 +2,7 @@ import concurrent
import logging
import random
from pathlib import Path
from typing import Any, List, Optional, Type, Union
from typing import Any, List, Optional, Sequence, Type, Union
from langchain_core.documents import Document
@ -41,6 +41,7 @@ class DirectoryLoader(BaseLoader):
use_multithreading: bool = False,
max_concurrency: int = 4,
*,
exclude: Union[Sequence[str], str] = (),
sample_size: int = 0,
randomize_sample: bool = False,
sample_seed: Union[int, None] = None,
@ -51,6 +52,8 @@ class DirectoryLoader(BaseLoader):
path: Path to directory.
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
(all files except hidden).
exclude: A pattern or list of patterns to exclude from results.
Use glob syntax.
silent_errors: Whether to silently ignore errors. Defaults to False.
load_hidden: Whether to load hidden files. Defaults to False.
loader_cls: Loader class to use for loading files.
@ -64,11 +67,38 @@ class DirectoryLoader(BaseLoader):
directory.
randomize_sample: Shuffle the files to get a random sample.
sample_seed: set the seed of the random shuffle for reproducibility.
Examples:
.. code-block:: python
from langchain_community.document_loaders import DirectoryLoader
# Load all non-hidden files in a directory.
loader = DirectoryLoader("/path/to/directory")
# Load all text files in a directory without recursion.
loader = DirectoryLoader("/path/to/directory", glob="*.txt")
# Recursively load all text files in a directory.
loader = DirectoryLoader(
"/path/to/directory", glob="*.txt", recursive=True
)
# Load all files in a directory, except for py files.
loader = DirectoryLoader("/path/to/directory", exclude="*.py")
# Load all files in a directory, except for py or pyc files.
loader = DirectoryLoader(
"/path/to/directory", exclude=["*.py", "*.pyc"]
)
"""
if loader_kwargs is None:
loader_kwargs = {}
if isinstance(exclude, str):
exclude = (exclude,)
self.path = path
self.glob = glob
self.exclude = exclude
self.load_hidden = load_hidden
self.loader_cls = loader_cls
self.loader_kwargs = loader_kwargs
@ -118,7 +148,13 @@ class DirectoryLoader(BaseLoader):
raise ValueError(f"Expected directory, got file: '{self.path}'")
docs: List[Document] = []
items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
items = [
path
for path in paths
if not (self.exclude and any(path.match(glob) for glob in self.exclude))
]
if self.sample_size > 0:
if self.randomize_sample:

View File

@ -1,4 +1,8 @@
from pathlib import Path
from typing import Any, List
import pytest
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader
@ -17,3 +21,35 @@ def test_raise_error_if_path_is_not_directory() -> None:
loader.load()
assert str(e.value) == f"Expected directory, got file: '{__file__}'"
class CustomLoader:
    """Test loader. Mimics interface of existing file loader.

    Takes a file path plus arbitrary keyword arguments (which are ignored)
    and exposes a ``load`` method returning the file's text as one document.
    """

    def __init__(self, path: Path, **kwargs: Any) -> None:
        """Initialize the loader.

        Args:
            path: Location of the file to read.
            **kwargs: Accepted for interface compatibility; not used.
        """
        self.path = path

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            A one-element list whose document holds the full file contents.
        """
        # Pin the encoding so the text read back does not depend on the
        # platform's default locale encoding.
        with open(self.path, encoding="utf-8") as f:
            return [Document(page_content=f.read())]
def test_exclude_ignores_matching_files(tmp_path: Path) -> None:
    """Files matching an ``exclude`` pattern are left out of the results.

    Each file gets distinct content so the assertion checks *which* file
    survived the filter, not merely how many documents came back.
    """
    txt_file = tmp_path / "test.txt"
    py_file = tmp_path / "test.py"
    txt_file.write_text("kept", encoding="utf-8")
    py_file.write_text("excluded", encoding="utf-8")
    loader = DirectoryLoader(
        str(tmp_path),
        exclude=["*.py"],
        loader_cls=CustomLoader,  # type: ignore
    )
    data = loader.load()
    # Only the .txt file's content may appear; a bare length check would also
    # pass if the wrong file had been filtered out.
    assert [doc.page_content for doc in data] == ["kept"]
def test_exclude_as_string_converts_to_sequence() -> None:
    """A bare string passed as ``exclude`` is stored as a one-element tuple."""
    pattern = "*.py"
    directory_loader = DirectoryLoader("./some_directory", exclude=pattern)
    expected = (pattern,)
    assert directory_loader.exclude == expected