mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-17 23:41:46 +00:00
community: glob multiple patterns when using DirectoryLoader (#22852)
- **Description:** Updated *community.langchain_community.document_loaders.directory.py* to enable the use of multiple glob patterns in the `DirectoryLoader` class. The `glob` parameter now has type `list[str] | str` and still defaults to the same value as before. The class docstring was updated to reflect this, and a unit test named `test_directory_loader_glob_multiple` was added to *community.tests.unit_tests.document_loaders.test_directory.py*. This test also shows an example of how to use the new functionality.
- ~~Issue:~~ **Discussion Thread:** https://github.com/langchain-ai/langchain/discussions/18559
- **Dependencies:** None
- **Twitter handle:** N/A
- [x] **Add tests and docs**
  - Added test (described above)
  - Updated class docstring
- [x] **Lint and test**

---------

Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
This commit is contained in:
@@ -5,6 +5,7 @@ import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders import DirectoryLoader
|
||||
from langchain_community.document_loaders.text import TextLoader
|
||||
|
||||
|
||||
def test_raise_error_if_path_not_exist() -> None:
|
||||
@@ -23,7 +24,7 @@ def test_raise_error_if_path_is_not_directory() -> None:
|
||||
assert str(e.value) == f"Expected directory, got file: '{__file__}'"
|
||||
|
||||
|
||||
class CustomLoader:
|
||||
class CustomLoader(TextLoader):
|
||||
"""Test loader. Mimics interface of existing file loader."""
|
||||
|
||||
def __init__(self, path: Path, **kwargs: Any) -> None:
|
||||
@@ -56,3 +57,44 @@ def test_exclude_ignores_matching_files(tmp_path: Path) -> None:
|
||||
def test_exclude_as_string_converts_to_sequence() -> None:
    """A string ``exclude`` argument is exposed as a one-element tuple."""
    directory_loader = DirectoryLoader("./some_directory", exclude="*.py")
    expected_exclude = ("*.py",)
    assert directory_loader.exclude == expected_exclude
|
||||
|
||||
|
||||
class CustomLoaderMetadataOnly(CustomLoader):
    """Test loader that reports only the file path, via document metadata.

    Used by test_directory_loader_glob_multiple.
    """

    def load(self) -> List[Document]:
        """Return one empty-content document whose ``source`` is ``self.path``."""
        return [Document(page_content="", metadata={"source": self.path})]

    def lazy_load(self) -> Iterator[Document]:
        """Return an iterator over the documents produced by ``load``."""
        documents = self.load()
        return iter(documents)
|
||||
|
||||
|
||||
def test_directory_loader_glob_multiple() -> None:
    """Verify that globbing multiple patterns in a list works correctly.

    Builds one recursive glob per extension, loads the examples directory with
    ``CustomLoaderMetadataOnly`` and checks two things:

    * no loaded document has an extension outside the requested list;
    * every requested extension was loaded at least once.
    """
    # NOTE(review): relative path, so it resolves against the current working
    # directory -- confirm the test suite is always run from the package root.
    path_to_examples = "tests/examples/"
    list_extensions = [".rst", ".txt"]
    list_globs = [f"**/*{ext}" for ext in list_extensions]

    loader = DirectoryLoader(
        path=path_to_examples, glob=list_globs, loader_cls=CustomLoaderMetadataOnly
    )
    list_documents = loader.load()

    # The loader stores the file path under "source"; its suffix tells us which
    # glob pattern matched the file.
    loaded_extensions = {
        Path(doc.metadata.get("source", "")).suffix for doc in list_documents
    }
    expected_extensions = set(list_extensions)

    unexpected = sorted(loaded_extensions - expected_extensions)
    assert not unexpected, (
        f"Loaded file types that were not specified in the glob list: {unexpected}"
    )

    missing = sorted(expected_extensions - loaded_extensions)
    assert not missing, f"No files were loaded for extensions: {missing}"
|
||||
|
Reference in New Issue
Block a user