community: glob multiple patterns when using DirectoryLoader (#22852)

- **Description:** Updated
*community.langchain_community.document_loaders.directory.py* to enable
the use of multiple glob patterns in the `DirectoryLoader` class. Now,
the glob parameter is of type `list[str] | str` and still defaults to
the same value as before. I updated the docstring of the class to
reflect this, and added a unit test to
*community.tests.unit_tests.document_loaders.test_directory.py* named
`test_directory_loader_glob_multiple`. This test also shows an example
of how to use the new functionality.
- ~~Issue:~~**Discussion Thread:**
https://github.com/langchain-ai/langchain/discussions/18559
- **Dependencies:** None
- **Twitter handle:** N/A

- [x] **Add tests and docs**
    - Added test (described above)
    - Updated class docstring

- [x] **Lint and test**

---------

Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
This commit is contained in:
Finlay Macklon
2024-06-18 10:24:50 -06:00
committed by GitHub
parent 5564d9e404
commit 616d06d7fe
2 changed files with 61 additions and 6 deletions

View File

@@ -5,6 +5,7 @@ import pytest
from langchain_core.documents import Document
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.text import TextLoader
def test_raise_error_if_path_not_exist() -> None:
@@ -23,7 +24,7 @@ def test_raise_error_if_path_is_not_directory() -> None:
assert str(e.value) == f"Expected directory, got file: '{__file__}'"
class CustomLoader:
class CustomLoader(TextLoader):
"""Test loader. Mimics interface of existing file loader."""
def __init__(self, path: Path, **kwargs: Any) -> None:
@@ -56,3 +57,44 @@ def test_exclude_ignores_matching_files(tmp_path: Path) -> None:
def test_exclude_as_string_converts_to_sequence() -> None:
    """A string ``exclude`` argument is exposed as a one-element tuple."""
    directory_loader = DirectoryLoader("./some_directory", exclude="*.py")
    expected_exclude = ("*.py",)
    assert directory_loader.exclude == expected_exclude
class CustomLoaderMetadataOnly(CustomLoader):
    """Test loader that just returns the file path in metadata.

    Used by test_directory_loader_glob_multiple.
    """

    def load(self) -> List[Document]:
        """Return one empty-content document whose source is ``self.path``."""
        return [Document(page_content="", metadata={"source": self.path})]

    def lazy_load(self) -> Iterator[Document]:
        """Return an iterator over the documents produced by ``load``."""
        documents = self.load()
        return iter(documents)
def test_directory_loader_glob_multiple() -> None:
    """Verify that globbing multiple patterns in a list works correctly.

    Loads the examples directory with one recursive glob per extension and
    checks that (a) every loaded document has one of the requested extensions
    and (b) each requested extension was loaded at least once.
    """
    # NOTE: relative path, so it resolves against the current working directory.
    path_to_examples = "tests/examples/"
    list_extensions = [".rst", ".txt"]
    list_globs = [f"**/*{ext}" for ext in list_extensions]

    loader = DirectoryLoader(
        path=path_to_examples, glob=list_globs, loader_cls=CustomLoaderMetadataOnly
    )
    # The loader class stores each file's path under "source"; only the suffix
    # matters here, so duplicates collapse naturally in a set.
    loaded_extensions = {
        Path(doc.metadata.get("source", "")).suffix for doc in loader.load()
    }
    expected_extensions = set(list_extensions)

    # Nothing outside the requested extensions may be loaded.
    unexpected = loaded_extensions - expected_extensions
    assert not unexpected, (
        f"Loaded file types not specified in extensions list: {sorted(unexpected)}"
    )

    # Every requested extension must have matched at least one file.
    missing = expected_extensions - loaded_extensions
    assert not missing, f"No files were loaded for extensions: {sorted(missing)}"