mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-04 04:28:58 +00:00
community[minor]: Implement DirectoryLoader lazy_load function (#19537)
Thank you for contributing to LangChain! - [x] **PR title**: "community: Implement DirectoryLoader lazy_load function" - [x] **Description**: The `lazy_load` function of the `DirectoryLoader` yields each document separately. If the given `loader_cls` of the `DirectoryLoader` also implemented `lazy_load`, it will be used to yield subdocuments of the file. - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access: `libs/community/tests/unit_tests/document_loaders/test_directory_loader.py` 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory: `docs/docs/integrations/document_loaders/directory.ipynb` - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17. --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
@@ -2,17 +2,18 @@ import concurrent
|
||||
import logging
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Sequence, Type, Union
|
||||
from typing import Any, Callable, Iterator, List, Optional, Sequence, Type, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
from langchain_community.document_loaders.csv_loader import CSVLoader
|
||||
from langchain_community.document_loaders.html_bs import BSHTMLLoader
|
||||
from langchain_community.document_loaders.text import TextLoader
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
FILE_LOADER_TYPE = Union[
|
||||
Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
|
||||
Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader], Type[CSVLoader]
|
||||
]
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -111,44 +112,18 @@ class DirectoryLoader(BaseLoader):
|
||||
self.randomize_sample = randomize_sample
|
||||
self.sample_seed = sample_seed
|
||||
|
||||
def load_file(
|
||||
self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
|
||||
) -> None:
|
||||
"""Load a file.
|
||||
|
||||
Args:
|
||||
item: File path.
|
||||
path: Directory path.
|
||||
docs: List of documents to append to.
|
||||
pbar: Progress bar. Defaults to None.
|
||||
|
||||
"""
|
||||
if item.is_file():
|
||||
if _is_visible(item.relative_to(path)) or self.load_hidden:
|
||||
try:
|
||||
logger.debug(f"Processing file: {str(item)}")
|
||||
sub_docs = self.loader_cls(str(item), **self.loader_kwargs).load()
|
||||
docs.extend(sub_docs)
|
||||
except Exception as e:
|
||||
if self.silent_errors:
|
||||
logger.warning(f"Error loading file {str(item)}: {e}")
|
||||
else:
|
||||
logger.error(f"Error loading file {str(item)}")
|
||||
raise e
|
||||
finally:
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
return list(self.lazy_load())
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
"""Load documents lazily."""
|
||||
p = Path(self.path)
|
||||
if not p.exists():
|
||||
raise FileNotFoundError(f"Directory not found: '{self.path}'")
|
||||
if not p.is_dir():
|
||||
raise ValueError(f"Expected directory, got file: '{self.path}'")
|
||||
|
||||
docs: List[Document] = []
|
||||
|
||||
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
|
||||
items = [
|
||||
path
|
||||
@@ -185,15 +160,62 @@ class DirectoryLoader(BaseLoader):
|
||||
)
|
||||
|
||||
if self.use_multithreading:
|
||||
futures = []
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=self.max_concurrency
|
||||
) as executor:
|
||||
executor.map(lambda i: self.load_file(i, p, docs, pbar), items)
|
||||
for i in items:
|
||||
futures.append(
|
||||
executor.submit(
|
||||
self._lazy_load_file_to_non_generator(self._lazy_load_file),
|
||||
i,
|
||||
p,
|
||||
pbar,
|
||||
)
|
||||
)
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
yield future.result()
|
||||
else:
|
||||
for i in items:
|
||||
self.load_file(i, p, docs, pbar)
|
||||
yield from self._lazy_load_file(i, p, pbar)
|
||||
|
||||
if pbar:
|
||||
pbar.close()
|
||||
|
||||
return docs
|
||||
def _lazy_load_file_to_non_generator(self, func: Callable) -> Callable:
|
||||
def non_generator(item: Path, path: Path, pbar: Optional[Any]) -> List:
|
||||
return [x for x in func(item, path, pbar)]
|
||||
|
||||
return non_generator
|
||||
|
||||
def _lazy_load_file(
|
||||
self, item: Path, path: Path, pbar: Optional[Any]
|
||||
) -> Iterator[Document]:
|
||||
"""Load a file.
|
||||
|
||||
Args:
|
||||
item: File path.
|
||||
path: Directory path.
|
||||
pbar: Progress bar. Defaults to None.
|
||||
|
||||
"""
|
||||
if item.is_file():
|
||||
if _is_visible(item.relative_to(path)) or self.load_hidden:
|
||||
try:
|
||||
logger.debug(f"Processing file: {str(item)}")
|
||||
loader = self.loader_cls(str(item), **self.loader_kwargs)
|
||||
try:
|
||||
for subdoc in loader.lazy_load():
|
||||
yield subdoc
|
||||
except NotImplementedError:
|
||||
for subdoc in loader.load():
|
||||
yield subdoc
|
||||
except Exception as e:
|
||||
if self.silent_errors:
|
||||
logger.warning(f"Error loading file {str(item)}: {e}")
|
||||
else:
|
||||
logger.error(f"Error loading file {str(item)}")
|
||||
raise e
|
||||
finally:
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
|
Reference in New Issue
Block a user