mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-05 21:12:48 +00:00
community: glob multiple patterns when using DirectoryLoader (#22852)
- **Description:** Updated *community.langchain_community.document_loaders.directory.py* to enable the use of multiple glob patterns in the `DirectoryLoader` class. The `glob` parameter now accepts either a single pattern string or a list/tuple of pattern strings, and still defaults to the same value as before (`"**/[!.]*"`); any other type raises a `TypeError`. I updated the docstring of the class to reflect this, and added a unit test to *community.tests.unit_tests.document_loaders.test_directory.py* named `test_directory_loader_glob_multiple`. This test also serves as an example of how to use the new functionality. - ~~Issue:~~ **Discussion Thread:** https://github.com/langchain-ai/langchain/discussions/18559 - **Dependencies:** None - **Twitter handle:** N/A - [x] **Add tests and docs** - Added test (described above) - Updated class docstring - [x] **Lint and test** --------- Co-authored-by: isaac hershenson <ihershenson@hmc.edu> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
This commit is contained in:
@@ -2,7 +2,7 @@ import concurrent
|
||||
import logging
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Iterator, List, Optional, Sequence, Type, Union
|
||||
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Type, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
@@ -32,7 +32,7 @@ class DirectoryLoader(BaseLoader):
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
glob: str = "**/[!.]*",
|
||||
glob: Union[List[str], Tuple[str], str] = "**/[!.]*",
|
||||
silent_errors: bool = False,
|
||||
load_hidden: bool = False,
|
||||
loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
|
||||
@@ -51,8 +51,8 @@ class DirectoryLoader(BaseLoader):
|
||||
|
||||
Args:
|
||||
path: Path to directory.
|
||||
glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
|
||||
(all files except hidden).
|
||||
glob: A glob pattern or list of glob patterns to use to find files.
|
||||
Defaults to "**/[!.]*" (all files except hidden).
|
||||
exclude: A pattern or list of patterns to exclude from results.
|
||||
Use glob syntax.
|
||||
silent_errors: Whether to silently ignore errors. Defaults to False.
|
||||
@@ -124,7 +124,20 @@ class DirectoryLoader(BaseLoader):
|
||||
if not p.is_dir():
|
||||
raise ValueError(f"Expected directory, got file: '{self.path}'")
|
||||
|
||||
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
|
||||
# Glob each pattern when a list or tuple is provided (e.g., multiple file extensions)
|
||||
if isinstance(self.glob, (list, tuple)):
|
||||
paths = []
|
||||
for pattern in self.glob:
|
||||
paths.extend(
|
||||
list(p.rglob(pattern) if self.recursive else p.glob(pattern))
|
||||
)
|
||||
elif isinstance(self.glob, str):
|
||||
paths = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
|
||||
else:
|
||||
raise TypeError(
|
||||
f"Expected glob to be str or sequence of str, but got {type(self.glob)}"
|
||||
)
|
||||
|
||||
items = [
|
||||
path
|
||||
for path in paths
|
||||
|
Reference in New Issue
Block a user