community[minor]: Implement DirectoryLoader lazy_load function (#19537)

Thank you for contributing to LangChain!

- [x] **PR title**: "community: Implement DirectoryLoader lazy_load
function"

- [x] **Description**: The `lazy_load` function of the `DirectoryLoader`
yields each document separately. If the given `loader_cls` of the
`DirectoryLoader` also implemented `lazy_load`, it will be used to yield
subdocuments of the file.

- [x] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access:
`libs/community/tests/unit_tests/document_loaders/test_directory_loader.py`
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory:
`docs/docs/integrations/document_loaders/directory.ipynb`


- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, hwchase17.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
DasDingoCodes
2024-03-29 15:46:52 +01:00
committed by GitHub
parent 6b2b511f68
commit 73eb3f8fd9
3 changed files with 161 additions and 36 deletions

View File

@@ -2,17 +2,18 @@ import concurrent
import logging
import random
from pathlib import Path
from typing import Any, List, Optional, Sequence, Type, Union
from typing import Any, Callable, Iterator, List, Optional, Sequence, Type, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.html_bs import BSHTMLLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
FILE_LOADER_TYPE = Union[
Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader], Type[CSVLoader]
]
logger = logging.getLogger(__name__)
@@ -111,44 +112,18 @@ class DirectoryLoader(BaseLoader):
self.randomize_sample = randomize_sample
self.sample_seed = sample_seed
def load_file(
self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
) -> None:
"""Load a file.
Args:
item: File path.
path: Directory path.
docs: List of documents to append to.
pbar: Progress bar. Defaults to None.
"""
if item.is_file():
if _is_visible(item.relative_to(path)) or self.load_hidden:
try:
logger.debug(f"Processing file: {str(item)}")
sub_docs = self.loader_cls(str(item), **self.loader_kwargs).load()
docs.extend(sub_docs)
except Exception as e:
if self.silent_errors:
logger.warning(f"Error loading file {str(item)}: {e}")
else:
logger.error(f"Error loading file {str(item)}")
raise e
finally:
if pbar:
pbar.update(1)
def load(self) -> List[Document]:
"""Load documents."""
return list(self.lazy_load())
def lazy_load(self) -> Iterator[Document]:
"""Load documents lazily."""
p = Path(self.path)
if not p.exists():
raise FileNotFoundError(f"Directory not found: '{self.path}'")
if not p.is_dir():
raise ValueError(f"Expected directory, got file: '{self.path}'")
docs: List[Document] = []
paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
items = [
path
@@ -185,15 +160,62 @@ class DirectoryLoader(BaseLoader):
)
if self.use_multithreading:
futures = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=self.max_concurrency
) as executor:
executor.map(lambda i: self.load_file(i, p, docs, pbar), items)
for i in items:
futures.append(
executor.submit(
self._lazy_load_file_to_non_generator(self._lazy_load_file),
i,
p,
pbar,
)
)
for future in concurrent.futures.as_completed(futures):
yield future.result()
else:
for i in items:
self.load_file(i, p, docs, pbar)
yield from self._lazy_load_file(i, p, pbar)
if pbar:
pbar.close()
return docs
def _lazy_load_file_to_non_generator(self, func: Callable) -> Callable:
def non_generator(item: Path, path: Path, pbar: Optional[Any]) -> List:
return [x for x in func(item, path, pbar)]
return non_generator
def _lazy_load_file(
self, item: Path, path: Path, pbar: Optional[Any]
) -> Iterator[Document]:
"""Load a file.
Args:
item: File path.
path: Directory path.
pbar: Progress bar. Defaults to None.
"""
if item.is_file():
if _is_visible(item.relative_to(path)) or self.load_hidden:
try:
logger.debug(f"Processing file: {str(item)}")
loader = self.loader_cls(str(item), **self.loader_kwargs)
try:
for subdoc in loader.lazy_load():
yield subdoc
except NotImplementedError:
for subdoc in loader.load():
yield subdoc
except Exception as e:
if self.silent_errors:
logger.warning(f"Error loading file {str(item)}: {e}")
else:
logger.error(f"Error loading file {str(item)}")
raise e
finally:
if pbar:
pbar.update(1)