Mirror of https://github.com/hwchase17/langchain.git (synced 2025-09-10 07:21:03 +00:00)
Improve file system blob loader and generic loader (#14004)
* Add support for passing a specific file to the file system blob loader.
* Allow specifying a parser at the class level for the generic loader:

  ```python
  class AudioLoader(GenericLoader):
      @staticmethod
      def get_parser(**kwargs):
          return MyAudioParser(**kwargs)
  ```

The intent of the GenericLoader is to provide on-ramps from different sources (e.g., web, S3, file system). An alternative is to use pipelining syntax or to create a Pipeline:

```
FileSystemBlobLoader(...) | MyAudioParser
```

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
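As a concrete illustration of the new class-level parser hook, here is a minimal sketch modeled on the test added in this commit (the `TextOnlyLoader` name and the `/path/to/dir` path are placeholders; imports follow the `langchain.document_loaders` layout used throughout this diff):

```python
from typing import Any

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser


class TextOnlyLoader(GenericLoader):
    """Illustrative loader that always parses blobs as plain text."""

    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        # from_filesystem calls this hook when parser="default" (the default);
        # any parser_kwargs passed to from_filesystem arrive here as kwargs.
        return TextParser()


loader = TextOnlyLoader.from_filesystem("/path/to/dir", suffixes=[".txt"])
docs = loader.load()
```

Loaders that do not override `get_parser` fall back to the global parser registry, so existing callers keep their previous behavior.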
@@ -61,7 +61,8 @@ class FileSystemBlobLoader(BlobLoader):
         """Initialize with a path to directory and how to glob over it.
 
         Args:
-            path: Path to directory to load from
+            path: Path to directory to load from or path to file to load.
+                  If a path to a file is provided, glob/exclude/suffixes are ignored.
             glob: Glob pattern relative to the specified path
                 by default set to pick up all non-hidden files
             exclude: patterns to exclude from results, use glob syntax
@@ -75,6 +76,10 @@ class FileSystemBlobLoader(BlobLoader):
         Examples:
 
         .. code-block:: python
+            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
+
+            # Load a single file.
+            loader = FileSystemBlobLoader("/path/to/file.txt")
 
             # Recursively load all text files in a directory.
             loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
@@ -118,6 +123,10 @@ class FileSystemBlobLoader(BlobLoader):
 
     def _yield_paths(self) -> Iterable[Path]:
         """Yield paths that match the requested pattern."""
+        if self.path.is_file():
+            yield self.path
+            return
+
         paths = self.path.glob(self.glob)
         for path in paths:
             if self.exclude:
@@ -50,11 +50,9 @@ class ConcurrentLoader(GenericLoader):
         show_progress: bool = False,
         parser: Union[DEFAULT, BaseBlobParser] = "default",
         num_workers: int = 4,
+        parser_kwargs: Optional[dict] = None,
     ) -> ConcurrentLoader:
-        """
-        Create a concurrent generic document loader using a
-        filesystem blob loader.
-
+        """Create a concurrent generic document loader using a filesystem blob loader.
 
         Args:
             path: The path to the directory to load documents from.
@@ -66,6 +64,7 @@ class ConcurrentLoader(GenericLoader):
                 Proxies to the file system loader.
             parser: A blob parser which knows how to parse blobs into documents
             num_workers: Max number of concurrent workers to use.
+            parser_kwargs: Keyword arguments to pass to the parser.
         """
         blob_loader = FileSystemBlobLoader(
             path,
@@ -75,7 +74,11 @@ class ConcurrentLoader(GenericLoader):
             show_progress=show_progress,
         )
         if isinstance(parser, str):
-            blob_parser = get_parser(parser)
+            if parser == "default" and cls.get_parser != GenericLoader.get_parser:
+                # There is an implementation of get_parser on the class, use it.
+                blob_parser = cls.get_parser(**(parser_kwargs or {}))
+            else:
+                blob_parser = get_parser(parser)
         else:
             blob_parser = parser
         return cls(blob_loader, blob_parser, num_workers=num_workers)
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Iterator, List, Literal, Optional, Sequence, Union
+from typing import Any, Iterator, List, Literal, Optional, Sequence, Union
 
 from langchain_core.documents import Document
 
@@ -23,46 +23,61 @@ class GenericLoader(BaseLoader):
 
     Examples:
 
-        .. code-block:: python
+        Parse a specific PDF file:
 
-            from langchain.document_loaders import GenericLoader
-            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
-
-            loader = GenericLoader.from_filesystem(
-                path="path/to/directory",
-                glob="**/[!.]*",
-                suffixes=[".pdf"],
-                show_progress=True,
-            )
-
-            docs = loader.lazy_load()
-            next(docs)
-
-        Example instantiations to change which files are loaded:
-
-        .. code-block:: python
-
-            # Recursively load all text files in a directory.
-            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
-
-            # Recursively load all non-hidden files in a directory.
-            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
-
-            # Load all files in a directory without recursion.
-            loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
-
-        Example instantiations to change which parser is used:
-
         .. code-block:: python
 
+            from langchain.document_loaders import GenericLoader
             from langchain.document_loaders.parsers.pdf import PyPDFParser
 
             # Recursively load all text files in a directory.
             loader = GenericLoader.from_filesystem(
-                "/path/to/dir",
-                glob="**/*.pdf",
+                "my_lovely_pdf.pdf",
                 parser=PyPDFParser()
             )
+
+        .. code-block:: python
+
+            from langchain.document_loaders import GenericLoader
+            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
+
+
+            loader = GenericLoader.from_filesystem(
+                path="path/to/directory",
+                glob="**/[!.]*",
+                suffixes=[".pdf"],
+                show_progress=True,
+            )
+
+            docs = loader.lazy_load()
+            next(docs)
+
+        Example instantiations to change which files are loaded:
+
+        .. code-block:: python
+
+            # Recursively load all text files in a directory.
+            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
+
+            # Recursively load all non-hidden files in a directory.
+            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
+
+            # Load all files in a directory without recursion.
+            loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
+
+        Example instantiations to change which parser is used:
+
+        .. code-block:: python
+
+            from langchain.document_loaders.parsers.pdf import PyPDFParser
+
+            # Recursively load all text files in a directory.
+            loader = GenericLoader.from_filesystem(
+                "/path/to/dir",
+                glob="**/*.pdf",
+                parser=PyPDFParser()
+            )
+
     """
 
     def __init__(
@@ -110,18 +125,26 @@ class GenericLoader(BaseLoader):
         suffixes: Optional[Sequence[str]] = None,
         show_progress: bool = False,
         parser: Union[DEFAULT, BaseBlobParser] = "default",
+        parser_kwargs: Optional[dict] = None,
     ) -> GenericLoader:
         """Create a generic document loader using a filesystem blob loader.
 
         Args:
-            path: The path to the directory to load documents from.
+            path: The path to the directory to load documents from OR the path to a
+                single file to load. If this is a file, glob, exclude, suffixes
+                will be ignored.
             glob: The glob pattern to use to find documents.
             suffixes: The suffixes to use to filter documents. If None, all files
                 matching the glob will be loaded.
             exclude: A list of patterns to exclude from the loader.
             show_progress: Whether to show a progress bar or not (requires tqdm).
                 Proxies to the file system loader.
-            parser: A blob parser which knows how to parse blobs into documents
+            parser: A blob parser which knows how to parse blobs into documents,
+                will instantiate a default parser if not provided.
+                The default can be overridden by either passing a parser or
+                setting the class attribute `blob_parser` (the latter
+                should be used with inheritance).
+            parser_kwargs: Keyword arguments to pass to the parser.
 
         Returns:
             A generic document loader.
@@ -134,7 +157,20 @@ class GenericLoader(BaseLoader):
             show_progress=show_progress,
         )
         if isinstance(parser, str):
-            blob_parser = get_parser(parser)
+            if parser == "default":
+                try:
+                    # If there is an implementation of get_parser on the class, use it.
+                    blob_parser = cls.get_parser(**(parser_kwargs or {}))
+                except NotImplementedError:
+                    # if not then use the global registry.
+                    blob_parser = get_parser(parser)
+            else:
+                blob_parser = get_parser(parser)
         else:
             blob_parser = parser
         return cls(blob_loader, blob_parser)
+
+    @staticmethod
+    def get_parser(**kwargs: Any) -> BaseBlobParser:
+        """Override this method to associate a default parser with the class."""
+        raise NotImplementedError()
@@ -2,7 +2,7 @@
 import os
 import tempfile
 from pathlib import Path
-from typing import Generator, Iterator
+from typing import Any, Generator, Iterator
 
 import pytest
 from langchain_core.documents import Document
@@ -10,6 +10,7 @@ from langchain_core.documents import Document
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader
 from langchain.document_loaders.generic import GenericLoader
+from langchain.document_loaders.parsers.txt import TextParser
 
 
 @pytest.fixture
@@ -78,6 +79,13 @@ def test_from_filesystem_classmethod(toy_dir: str) -> None:
     assert docs[0].page_content == "This is a test.txt file."
 
 
+def test_from_filesystem_classmethod_with_path(toy_dir: str) -> None:
+    loader = GenericLoader.from_filesystem(os.path.join(toy_dir, "test.txt"))
+    docs = loader.load()
+    assert len(docs) == 1
+    assert docs[0].page_content == "This is a test.txt file."
+
+
 def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None:
     """Test that glob parameter is taken into account."""
     loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser())
@@ -112,3 +120,19 @@ def test_from_filesystem_using_default_parser(toy_dir: str) -> None:
     # Glob order seems to be deterministic with recursion. If this test becomes flaky,
     # we can sort the docs by page content.
     assert docs[0].page_content == "This is a test.txt file."
+
+
+def test_specifying_parser_via_class_attribute(toy_dir: str) -> None:
+    class TextLoader(GenericLoader):
+        """Parser created for testing purposes."""
+
+        @staticmethod
+        def get_parser(**kwargs: Any) -> BaseBlobParser:
+            return TextParser()
+
+    loader = TextLoader.from_filesystem(toy_dir, suffixes=[".txt"])
+    docs = loader.load()
+    assert len(docs) == 3
+    # Glob order seems to be deterministic with recursion. If this test becomes flaky,
+    # we can sort the docs by page content.
+    assert docs[0].page_content == "This is a test.txt file."
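To round things off, a small usage sketch of the single-file support introduced above, based on the docstring examples and the new `test_from_filesystem_classmethod_with_path` test (the paths are placeholders):

```python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
from langchain.document_loaders.generic import GenericLoader

# Point the blob loader at a single file; glob/exclude/suffixes are ignored.
blob_loader = FileSystemBlobLoader("/path/to/file.txt")
blobs = list(blob_loader.yield_blobs())

# Or go straight from a single file to documents via the generic loader.
loader = GenericLoader.from_filesystem("/path/to/file.txt")
docs = loader.load()
```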