community: glob multiple patterns when using DirectoryLoader (#22852)

- **Description:** Updated
*community.langchain_community.document_loaders.directory.py* to enable
the use of multiple glob patterns in the `DirectoryLoader` class. The
`glob` parameter now accepts either a single pattern string or a list/tuple
of pattern strings (`Union[List[str], Tuple[str], str]`) and still defaults
to `"**/[!.]*"`, the same value as before. I updated the docstring of the class to
reflect this, and added a unit test to
*community.tests.unit_tests.document_loaders.test_directory.py* named
`test_directory_loader_glob_multiple`. This test also shows an example
of how to use the new functionality.
- ~~Issue:~~ **Discussion Thread:**
https://github.com/langchain-ai/langchain/discussions/18559
- **Dependencies:** None
- **Twitter handle:** N/A

- [x] **Add tests and docs**
    - Added test (described above)
    - Updated class docstring

- [x] **Lint and test**

---------

Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
This commit is contained in:
Finlay Macklon
2024-06-18 10:24:50 -06:00
committed by GitHub
parent 5564d9e404
commit 616d06d7fe
2 changed files with 61 additions and 6 deletions

View File

@@ -2,7 +2,7 @@ import concurrent
import logging
import random
from pathlib import Path
from typing import Any, Callable, Iterator, List, Optional, Sequence, Type, Union
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Type, Union
from langchain_core.documents import Document
@@ -32,7 +32,7 @@ class DirectoryLoader(BaseLoader):
def __init__(
self,
path: str,
glob: str = "**/[!.]*",
glob: Union[List[str], Tuple[str], str] = "**/[!.]*",
silent_errors: bool = False,
load_hidden: bool = False,
loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
@@ -51,8 +51,8 @@ class DirectoryLoader(BaseLoader):
Args:
path: Path to directory.
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
(all files except hidden).
glob: A glob pattern or list of glob patterns to use to find files.
Defaults to "**/[!.]*" (all files except hidden).
exclude: A pattern or list of patterns to exclude from results.
Use glob syntax.
silent_errors: Whether to silently ignore errors. Defaults to False.
@@ -124,7 +124,20 @@ class DirectoryLoader(BaseLoader):
if not p.is_dir():
raise ValueError(f"Expected directory, got file: '{self.path}'")
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
# Glob each pattern in turn when a list or tuple is provided, e.g., to match multiple file extensions
if isinstance(self.glob, (list, tuple)):
paths = []
for pattern in self.glob:
paths.extend(
list(p.rglob(pattern) if self.recursive else p.glob(pattern))
)
elif isinstance(self.glob, str):
paths = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
else:
raise TypeError(
f"Expected glob to be str or sequence of str, but got {type(self.glob)}"
)
items = [
path
for path in paths