diff --git a/langchain/document_loaders/generic.py b/langchain/document_loaders/generic.py index 4758ac5ed31..2475faa3da7 100644 --- a/langchain/document_loaders/generic.py +++ b/langchain/document_loaders/generic.py @@ -1,14 +1,15 @@ from __future__ import annotations from pathlib import Path -from typing import Iterator, Union, Optional, Sequence +from typing import Iterator, Union, Optional, Sequence, List from langchain.document_loaders.base import BaseLoader, BaseBlobParser from langchain.document_loaders.blob_loaders import FileSystemBlobLoader, BlobLoader +from langchain.document_loaders.parsers.registry import get_parser from langchain.schema import Document -from langchain.document_loaders.parsers.generic import MimeTypeBasedParser +from langchain.text_splitter import TextSplitter -PathLike = Union[str, Path] +_PathLike = Union[str, Path] class GenericLoader(BaseLoader): @@ -70,25 +71,40 @@ class GenericLoader(BaseLoader): for blob in self.blob_loader.yield_blobs(): yield from self.blob_parser.lazy_parse(blob) + def load(self) -> List[Document]: + """Load all documents.""" + return list(self.lazy_load()) + + def load_and_split( + self, text_splitter: Optional[TextSplitter] = None + ) -> List[Document]: + """Load all documents and split them into sentences.""" + raise NotImplementedError( + "Loading and splitting is not yet implemented for generic loaders. " + "When they will be implemented they will be added via the initializer. " + "This method should not be used going forward." + ) + @classmethod def from_filesystem( cls, - path: PathLike, + path: _PathLike, *, glob: str = "**/[!.]*", suffixes: Optional[Sequence[str]] = None, show_progress: bool = False, - blob_parser: , + parser: Union[str, BaseBlobParser] = "default", ) -> GenericLoader: """Create a generic document loader using a filesystem blob loader. 
Args: - blob_parser: A blob parser which knows how to parse blobs into documents + parser: A blob parser which knows how to parse blobs into documents path: The path to the directory to load documents from. glob: The glob pattern to use to find documents. suffixes: The suffixes to use to filter documents. If None, all files matching the glob will be loaded. show_progress: Whether to show a progress bar or not (requires tqdm). + Proxies to the file system loader. Returns: A generic document loader. @@ -96,5 +112,8 @@ class GenericLoader(BaseLoader): blob_loader = FileSystemBlobLoader( path, glob=glob, suffixes=suffixes, show_progress=show_progress ) - blob_parser = BaseBlobParser() + if isinstance(parser, str): + blob_parser = get_parser(parser) + else: + blob_parser = parser return cls(blob_loader, blob_parser) diff --git a/langchain/document_loaders/parsers/generic.py b/langchain/document_loaders/parsers/generic.py index f2458f7d78c..b5b137fdca5 100644 --- a/langchain/document_loaders/parsers/generic.py +++ b/langchain/document_loaders/parsers/generic.py @@ -2,7 +2,7 @@ This module contains some logic to help assemble more sophisticated parsers. """ -from typing import Iterator, Mapping, Optional +from typing import Iterator, Mapping, Optional, Any from langchain.document_loaders.base import BaseBlobParser from langchain.document_loaders.blob_loaders.schema import Blob @@ -34,6 +34,7 @@ class MimeTypeBasedParser(BaseBlobParser): def __init__( self, handlers: Mapping[str, BaseBlobParser], + *, fallback_parser: Optional[BaseBlobParser] = None, ) -> None: """Define a parser that uses mime-types to determine how to parse a blob. 
diff --git a/langchain/document_loaders/parsers/registry.py b/langchain/document_loaders/parsers/registry.py new file mode 100644 index 00000000000..9c9aad83ec8 --- /dev/null +++ b/langchain/document_loaders/parsers/registry.py @@ -0,0 +1,30 @@ +"""Module includes a registry of default parser configurations.""" +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.parsers.generic import MimeTypeBasedParser +from langchain.document_loaders.parsers.pdf import PyMuPDFParser +from langchain.document_loaders.parsers.txt import TextParser + + +def _get_default_parser() -> BaseBlobParser: + """Get default mime-type based parser.""" + return MimeTypeBasedParser( + handlers={ + "application/pdf": PyMuPDFParser(), + "text/plain": TextParser(), + }, + fallback_parser=None, + ) + + +_REGISTRY = { + "default": _get_default_parser, +} + +# PUBLIC API + + +def get_parser(parser_name: str) -> BaseBlobParser: + """Get a parser by parser name.""" + if parser_name not in _REGISTRY: + raise ValueError(f"Unknown parser combination: {parser_name}") + return _REGISTRY[parser_name]() diff --git a/langchain/document_loaders/parsers/txt.py b/langchain/document_loaders/parsers/txt.py new file mode 100644 index 00000000000..58bed56804e --- /dev/null +++ b/langchain/document_loaders/parsers/txt.py @@ -0,0 +1,12 @@ +"""Module for parsing text files.""" +from typing import Iterator + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob +from langchain.schema import Document + + +class TextParser(BaseBlobParser): + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazily parse the blob.""" + yield Document(page_content=blob.as_string(), metadata={"source": blob.source}) diff --git a/tests/unit_tests/document_loaders/test_generic_loader.py b/tests/unit_tests/document_loaders/test_generic_loader.py new file mode 100644 index 00000000000..298aa509820 --- /dev/null +++ 
b/tests/unit_tests/document_loaders/test_generic_loader.py @@ -0,0 +1,114 @@ +"""Test generic loader.""" +import os +import pytest +import tempfile +from pathlib import Path +from typing import Generator, Iterator + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob +from langchain.document_loaders.generic import GenericLoader +from langchain.document_loaders.blob_loaders import FileSystemBlobLoader +from langchain.schema import Document + + +@pytest.fixture +def toy_dir() -> Generator[Path, None, None]: + """Yield a pre-populated directory to test the blob loader.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create test.txt + with open(os.path.join(temp_dir, "test.txt"), "w") as test_txt: + test_txt.write("This is a test.txt file.") + + # Create test.html + with open(os.path.join(temp_dir, "test.html"), "w") as test_html: + test_html.write( + "