diff --git a/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py b/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py index ac1812f7957..0eea233741c 100644 --- a/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py +++ b/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py @@ -61,7 +61,8 @@ class FileSystemBlobLoader(BlobLoader): """Initialize with a path to directory and how to glob over it. Args: - path: Path to directory to load from + path: Path to directory to load from or path to file to load. + If a path to a file is provided, glob/exclude/suffixes are ignored. glob: Glob pattern relative to the specified path by default set to pick up all non-hidden files exclude: patterns to exclude from results, use glob syntax @@ -75,6 +76,10 @@ class FileSystemBlobLoader(BlobLoader): Examples: .. code-block:: python + from langchain.document_loaders.blob_loaders import FileSystemBlobLoader + + # Load a single file. + loader = FileSystemBlobLoader("/path/to/file.txt") # Recursively load all text files in a directory. 
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt") @@ -118,6 +123,10 @@ class FileSystemBlobLoader(BlobLoader): def _yield_paths(self) -> Iterable[Path]: """Yield paths that match the requested pattern.""" + if self.path.is_file(): + yield self.path + return + paths = self.path.glob(self.glob) for path in paths: if self.exclude: diff --git a/libs/langchain/langchain/document_loaders/concurrent.py b/libs/langchain/langchain/document_loaders/concurrent.py index 5870c794e45..3d124aeac9a 100644 --- a/libs/langchain/langchain/document_loaders/concurrent.py +++ b/libs/langchain/langchain/document_loaders/concurrent.py @@ -50,11 +50,9 @@ class ConcurrentLoader(GenericLoader): show_progress: bool = False, parser: Union[DEFAULT, BaseBlobParser] = "default", num_workers: int = 4, + parser_kwargs: Optional[dict] = None, ) -> ConcurrentLoader: - """ - Create a concurrent generic document loader using a - filesystem blob loader. - + """Create a concurrent generic document loader using a filesystem blob loader. Args: path: The path to the directory to load documents from. @@ -66,6 +64,7 @@ class ConcurrentLoader(GenericLoader): Proxies to the file system loader. parser: A blob parser which knows how to parse blobs into documents num_workers: Max number of concurrent workers to use. + parser_kwargs: Keyword arguments to pass to the parser. """ blob_loader = FileSystemBlobLoader( path, @@ -75,7 +74,11 @@ class ConcurrentLoader(GenericLoader): show_progress=show_progress, ) if isinstance(parser, str): - blob_parser = get_parser(parser) + if parser == "default" and cls.get_parser != GenericLoader.get_parser: + # There is an implementation of get_parser on the class, use it. 
+ blob_parser = cls.get_parser(**(parser_kwargs or {})) + else: + blob_parser = get_parser(parser) else: blob_parser = parser return cls(blob_loader, blob_parser, num_workers=num_workers) diff --git a/libs/langchain/langchain/document_loaders/generic.py b/libs/langchain/langchain/document_loaders/generic.py index 88d262faee6..0a3cb16ba71 100644 --- a/libs/langchain/langchain/document_loaders/generic.py +++ b/libs/langchain/langchain/document_loaders/generic.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import Iterator, List, Literal, Optional, Sequence, Union +from typing import Any, Iterator, List, Literal, Optional, Sequence, Union from langchain_core.documents import Document @@ -23,46 +23,61 @@ class GenericLoader(BaseLoader): Examples: - .. code-block:: python - - from langchain.document_loaders import GenericLoader - from langchain.document_loaders.blob_loaders import FileSystemBlobLoader - - loader = GenericLoader.from_filesystem( - path="path/to/directory", - glob="**/[!.]*", - suffixes=[".pdf"], - show_progress=True, - ) - - docs = loader.lazy_load() - next(docs) - - Example instantiations to change which files are loaded: - - .. code-block:: python - - # Recursively load all text files in a directory. - loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt") - - # Recursively load all non-hidden files in a directory. - loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*") - - # Load all files in a directory without recursion. - loader = GenericLoader.from_filesystem("/path/to/dir", glob="*") - - Example instantiations to change which parser is used: + Parse a specific PDF file: .. code-block:: python + from langchain.document_loaders import GenericLoader from langchain.document_loaders.parsers.pdf import PyPDFParser # Recursively load all text files in a directory. 
loader = GenericLoader.from_filesystem( - "/path/to/dir", - glob="**/*.pdf", + "my_lovely_pdf.pdf", parser=PyPDFParser() ) + + .. code-block:: python + + from langchain.document_loaders import GenericLoader + from langchain.document_loaders.blob_loaders import FileSystemBlobLoader + + + loader = GenericLoader.from_filesystem( + path="path/to/directory", + glob="**/[!.]*", + suffixes=[".pdf"], + show_progress=True, + ) + + docs = loader.lazy_load() + next(docs) + + Example instantiations to change which files are loaded: + + .. code-block:: python + + # Recursively load all text files in a directory. + loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt") + + # Recursively load all non-hidden files in a directory. + loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*") + + # Load all files in a directory without recursion. + loader = GenericLoader.from_filesystem("/path/to/dir", glob="*") + + Example instantiations to change which parser is used: + + .. code-block:: python + + from langchain.document_loaders.parsers.pdf import PyPDFParser + + # Recursively load all text files in a directory. + loader = GenericLoader.from_filesystem( + "/path/to/dir", + glob="**/*.pdf", + parser=PyPDFParser() + ) + """ def __init__( @@ -110,18 +125,26 @@ class GenericLoader(BaseLoader): suffixes: Optional[Sequence[str]] = None, show_progress: bool = False, parser: Union[DEFAULT, BaseBlobParser] = "default", + parser_kwargs: Optional[dict] = None, ) -> GenericLoader: """Create a generic document loader using a filesystem blob loader. Args: - path: The path to the directory to load documents from. + path: The path to the directory to load documents from OR the path to a + single file to load. If this is a file, glob, exclude, suffixes + will be ignored. glob: The glob pattern to use to find documents. suffixes: The suffixes to use to filter documents. If None, all files matching the glob will be loaded. 
exclude: A list of patterns to exclude from the loader. show_progress: Whether to show a progress bar or not (requires tqdm). Proxies to the file system loader. - parser: A blob parser which knows how to parse blobs into documents + parser: A blob parser which knows how to parse blobs into documents, + will instantiate a default parser if not provided. + The default can be overridden by either passing a parser or + overriding the `get_parser` static method (the latter + should be used with inheritance). + parser_kwargs: Keyword arguments forwarded to `cls.get_parser`. Returns: A generic document loader. @@ -134,7 +157,20 @@ class GenericLoader(BaseLoader): show_progress=show_progress, ) if isinstance(parser, str): - blob_parser = get_parser(parser) + if parser == "default": + try: + # If there is an implementation of get_parser on the class, use it. + blob_parser = cls.get_parser(**(parser_kwargs or {})) + except NotImplementedError: + # if not then use the global registry. + blob_parser = get_parser(parser) + else: + blob_parser = get_parser(parser) else: blob_parser = parser return cls(blob_loader, blob_parser) + + @staticmethod + def get_parser(**kwargs: Any) -> BaseBlobParser: + """Override this method to associate a default parser with the class.""" + raise NotImplementedError() diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_generic_loader.py b/libs/langchain/tests/unit_tests/document_loaders/test_generic_loader.py index 5603f87c1e5..20f82155650 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/test_generic_loader.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_generic_loader.py @@ -2,7 +2,7 @@ import os import tempfile from pathlib import Path -from typing import Generator, Iterator +from typing import Any, Generator, Iterator import pytest from langchain_core.documents import Document @@ -10,6 +10,7 @@ from langchain_core.documents import Document from langchain.document_loaders.base import BaseBlobParser from
langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader from langchain.document_loaders.generic import GenericLoader +from langchain.document_loaders.parsers.txt import TextParser @pytest.fixture @@ -78,6 +79,13 @@ def test_from_filesystem_classmethod(toy_dir: str) -> None: assert docs[0].page_content == "This is a test.txt file." +def test_from_filesystem_classmethod_with_path(toy_dir: str) -> None: + loader = GenericLoader.from_filesystem(os.path.join(toy_dir, "test.txt")) + docs = loader.load() + assert len(docs) == 1 + assert docs[0].page_content == "This is a test.txt file." + + def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None: """Test that glob parameter is taken into account.""" loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser()) @@ -112,3 +120,19 @@ def test_from_filesystem_using_default_parser(toy_dir: str) -> None: # Glob order seems to be deterministic with recursion. If this test becomes flaky, # we can sort the docs by page content. assert docs[0].page_content == "This is a test.txt file." + + +def test_specifying_parser_via_class_attribute(toy_dir: str) -> None: + class TextLoader(GenericLoader): + """Loader created for testing purposes.""" + + @staticmethod + def get_parser(**kwargs: Any) -> BaseBlobParser: + return TextParser() + + loader = TextLoader.from_filesystem(toy_dir, suffixes=[".txt"]) + docs = loader.load() + assert len(docs) == 3 + # Glob order seems to be deterministic with recursion. If this test becomes flaky, + # we can sort the docs by page content. + assert docs[0].page_content == "This is a test.txt file."