mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 13:54:48 +00:00
community[minor]: add exclude parameter to DirectoryLoader (#17316)
- **Description:** adds an `exclude` parameter to the DirectoryLoader class, based on similar behavior in GenericLoader
- **Issue:** discussed in https://github.com/langchain-ai/langchain/discussions/9059 and, I think, in some other issues that I cannot find at the moment 🙇
- **Dependencies:** None
- **Twitter handle:** don't have one, sorry! Just https://github.com/nejch

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
8f14234afb
commit
b4fa847a90
@ -2,7 +2,7 @@ import concurrent
|
|||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List, Optional, Type, Union
|
from typing import Any, List, Optional, Sequence, Type, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -41,6 +41,7 @@ class DirectoryLoader(BaseLoader):
|
|||||||
use_multithreading: bool = False,
|
use_multithreading: bool = False,
|
||||||
max_concurrency: int = 4,
|
max_concurrency: int = 4,
|
||||||
*,
|
*,
|
||||||
|
exclude: Union[Sequence[str], str] = (),
|
||||||
sample_size: int = 0,
|
sample_size: int = 0,
|
||||||
randomize_sample: bool = False,
|
randomize_sample: bool = False,
|
||||||
sample_seed: Union[int, None] = None,
|
sample_seed: Union[int, None] = None,
|
||||||
@ -51,6 +52,8 @@ class DirectoryLoader(BaseLoader):
|
|||||||
path: Path to directory.
|
path: Path to directory.
|
||||||
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
|
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
|
||||||
(all files except hidden).
|
(all files except hidden).
|
||||||
|
exclude: A pattern or list of patterns to exclude from results.
|
||||||
|
Use glob syntax.
|
||||||
silent_errors: Whether to silently ignore errors. Defaults to False.
|
silent_errors: Whether to silently ignore errors. Defaults to False.
|
||||||
load_hidden: Whether to load hidden files. Defaults to False.
|
load_hidden: Whether to load hidden files. Defaults to False.
|
||||||
loader_cls: Loader class to use for loading files.
|
loader_cls: Loader class to use for loading files.
|
||||||
@ -64,11 +67,38 @@ class DirectoryLoader(BaseLoader):
|
|||||||
directory.
|
directory.
|
||||||
randomize_sample: Shuffle the files to get a random sample.
|
randomize_sample: Shuffle the files to get a random sample.
|
||||||
sample_seed: set the seed of the random shuffle for reproducibility.
|
sample_seed: set the seed of the random shuffle for reproducibility.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
from langchain_community.document_loaders import DirectoryLoader
|
||||||
|
|
||||||
|
# Load all non-hidden files in a directory.
|
||||||
|
loader = DirectoryLoader("/path/to/directory")
|
||||||
|
|
||||||
|
# Load all text files in a directory without recursion.
|
||||||
|
loader = DirectoryLoader("/path/to/directory", glob="*.txt")
|
||||||
|
|
||||||
|
# Recursively load all text files in a directory.
|
||||||
|
loader = DirectoryLoader(
|
||||||
|
"/path/to/directory", glob="*.txt", recursive=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Load all files in a directory, except for py files.
|
||||||
|
loader = DirectoryLoader("/path/to/directory", exclude="*.py")
|
||||||
|
|
||||||
|
# Load all files in a directory, except for py or pyc files.
|
||||||
|
loader = DirectoryLoader(
|
||||||
|
"/path/to/directory", exclude=["*.py", "*.pyc"]
|
||||||
|
)
|
||||||
"""
|
"""
|
||||||
if loader_kwargs is None:
|
if loader_kwargs is None:
|
||||||
loader_kwargs = {}
|
loader_kwargs = {}
|
||||||
|
if isinstance(exclude, str):
|
||||||
|
exclude = (exclude,)
|
||||||
self.path = path
|
self.path = path
|
||||||
self.glob = glob
|
self.glob = glob
|
||||||
|
self.exclude = exclude
|
||||||
self.load_hidden = load_hidden
|
self.load_hidden = load_hidden
|
||||||
self.loader_cls = loader_cls
|
self.loader_cls = loader_cls
|
||||||
self.loader_kwargs = loader_kwargs
|
self.loader_kwargs = loader_kwargs
|
||||||
@ -118,7 +148,13 @@ class DirectoryLoader(BaseLoader):
|
|||||||
raise ValueError(f"Expected directory, got file: '{self.path}'")
|
raise ValueError(f"Expected directory, got file: '{self.path}'")
|
||||||
|
|
||||||
docs: List[Document] = []
|
docs: List[Document] = []
|
||||||
items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
|
|
||||||
|
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
|
||||||
|
items = [
|
||||||
|
path
|
||||||
|
for path in paths
|
||||||
|
if not (self.exclude and any(path.match(glob) for glob in self.exclude))
|
||||||
|
]
|
||||||
|
|
||||||
if self.sample_size > 0:
|
if self.sample_size > 0:
|
||||||
if self.randomize_sample:
|
if self.randomize_sample:
|
||||||
|
@ -1,4 +1,8 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from langchain_community.document_loaders import DirectoryLoader
|
from langchain_community.document_loaders import DirectoryLoader
|
||||||
|
|
||||||
@ -17,3 +21,35 @@ def test_raise_error_if_path_is_not_directory() -> None:
|
|||||||
loader.load()
|
loader.load()
|
||||||
|
|
||||||
assert str(e.value) == f"Expected directory, got file: '{__file__}'"
|
assert str(e.value) == f"Expected directory, got file: '{__file__}'"
|
||||||
|
|
||||||
|
|
||||||
|
class CustomLoader:
|
||||||
|
"""Test loader. Mimics interface of existing file loader."""
|
||||||
|
|
||||||
|
def __init__(self, path: Path, **kwargs: Any) -> None:
|
||||||
|
"""Initialize the loader."""
|
||||||
|
self.path = path
|
||||||
|
|
||||||
|
def load(self) -> List[Document]:
|
||||||
|
"""Load documents."""
|
||||||
|
with open(self.path, "r") as f:
|
||||||
|
return [Document(page_content=f.read())]
|
||||||
|
|
||||||
|
|
||||||
|
def test_exclude_ignores_matching_files(tmp_path: Path) -> None:
|
||||||
|
txt_file = tmp_path / "test.txt"
|
||||||
|
py_file = tmp_path / "test.py"
|
||||||
|
txt_file.touch()
|
||||||
|
py_file.touch()
|
||||||
|
loader = DirectoryLoader(
|
||||||
|
str(tmp_path),
|
||||||
|
exclude=["*.py"],
|
||||||
|
loader_cls=CustomLoader, # type: ignore
|
||||||
|
)
|
||||||
|
data = loader.load()
|
||||||
|
assert len(data) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_exclude_as_string_converts_to_sequence() -> None:
|
||||||
|
loader = DirectoryLoader("./some_directory", exclude="*.py")
|
||||||
|
assert loader.exclude == ("*.py",)
|
||||||
|
Loading…
Reference in New Issue
Block a user