mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 05:43:55 +00:00
community[minor]: add exclude parameter to DirectoryLoader (#17316)
- **Description:** adds an `exclude` parameter to the DirectoryLoader class, based on similar behavior in GenericLoader - **Issue:** discussed in https://github.com/langchain-ai/langchain/discussions/9059 and I think in some other issues that I cannot find at the moment 🙇 - **Dependencies:** None - **Twitter handle:** don't have one sorry! Just https://github/nejch --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
8f14234afb
commit
b4fa847a90
@ -2,7 +2,7 @@ import concurrent
|
||||
import logging
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Type, Union
|
||||
from typing import Any, List, Optional, Sequence, Type, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@ -41,6 +41,7 @@ class DirectoryLoader(BaseLoader):
|
||||
use_multithreading: bool = False,
|
||||
max_concurrency: int = 4,
|
||||
*,
|
||||
exclude: Union[Sequence[str], str] = (),
|
||||
sample_size: int = 0,
|
||||
randomize_sample: bool = False,
|
||||
sample_seed: Union[int, None] = None,
|
||||
@ -51,6 +52,8 @@ class DirectoryLoader(BaseLoader):
|
||||
path: Path to directory.
|
||||
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
|
||||
(all files except hidden).
|
||||
exclude: A pattern or list of patterns to exclude from results.
|
||||
Use glob syntax.
|
||||
silent_errors: Whether to silently ignore errors. Defaults to False.
|
||||
load_hidden: Whether to load hidden files. Defaults to False.
|
||||
loader_cls: Loader class to use for loading files.
|
||||
@ -64,11 +67,38 @@ class DirectoryLoader(BaseLoader):
|
||||
directory.
|
||||
randomize_sample: Shuffle the files to get a random sample.
|
||||
sample_seed: set the seed of the random shuffle for reproducibility.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code-block:: python
|
||||
from langchain_community.document_loaders import DirectoryLoader
|
||||
|
||||
# Load all non-hidden files in a directory.
|
||||
loader = DirectoryLoader("/path/to/directory")
|
||||
|
||||
# Load all text files in a directory without recursion.
|
||||
loader = DirectoryLoader("/path/to/directory", glob="*.txt")
|
||||
|
||||
# Recursively load all text files in a directory.
|
||||
loader = DirectoryLoader(
|
||||
"/path/to/directory", glob="*.txt", recursive=True
|
||||
)
|
||||
|
||||
# Load all files in a directory, except for py files.
|
||||
loader = DirectoryLoader("/path/to/directory", exclude="*.py")
|
||||
|
||||
# Load all files in a directory, except for py or pyc files.
|
||||
loader = DirectoryLoader(
|
||||
"/path/to/directory", exclude=["*.py", "*.pyc"]
|
||||
)
|
||||
"""
|
||||
if loader_kwargs is None:
|
||||
loader_kwargs = {}
|
||||
if isinstance(exclude, str):
|
||||
exclude = (exclude,)
|
||||
self.path = path
|
||||
self.glob = glob
|
||||
self.exclude = exclude
|
||||
self.load_hidden = load_hidden
|
||||
self.loader_cls = loader_cls
|
||||
self.loader_kwargs = loader_kwargs
|
||||
@ -118,7 +148,13 @@ class DirectoryLoader(BaseLoader):
|
||||
raise ValueError(f"Expected directory, got file: '{self.path}'")
|
||||
|
||||
docs: List[Document] = []
|
||||
items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
|
||||
|
||||
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
|
||||
items = [
|
||||
path
|
||||
for path in paths
|
||||
if not (self.exclude and any(path.match(glob) for glob in self.exclude))
|
||||
]
|
||||
|
||||
if self.sample_size > 0:
|
||||
if self.randomize_sample:
|
||||
|
@ -1,4 +1,8 @@
|
||||
from pathlib import Path
|
||||
from typing import Any, List
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders import DirectoryLoader
|
||||
|
||||
@ -17,3 +21,35 @@ def test_raise_error_if_path_is_not_directory() -> None:
|
||||
loader.load()
|
||||
|
||||
assert str(e.value) == f"Expected directory, got file: '{__file__}'"
|
||||
|
||||
|
||||
class CustomLoader:
|
||||
"""Test loader. Mimics interface of existing file loader."""
|
||||
|
||||
def __init__(self, path: Path, **kwargs: Any) -> None:
|
||||
"""Initialize the loader."""
|
||||
self.path = path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
with open(self.path, "r") as f:
|
||||
return [Document(page_content=f.read())]
|
||||
|
||||
|
||||
def test_exclude_ignores_matching_files(tmp_path: Path) -> None:
|
||||
txt_file = tmp_path / "test.txt"
|
||||
py_file = tmp_path / "test.py"
|
||||
txt_file.touch()
|
||||
py_file.touch()
|
||||
loader = DirectoryLoader(
|
||||
str(tmp_path),
|
||||
exclude=["*.py"],
|
||||
loader_cls=CustomLoader, # type: ignore
|
||||
)
|
||||
data = loader.load()
|
||||
assert len(data) == 1
|
||||
|
||||
|
||||
def test_exclude_as_string_converts_to_sequence() -> None:
|
||||
loader = DirectoryLoader("./some_directory", exclude="*.py")
|
||||
assert loader.exclude == ("*.py",)
|
||||
|
Loading…
Reference in New Issue
Block a user