Improve file system blob loader and generic loader (#14004)

* Add support for passing a specific file to the file system blob loader
* Allow a `GenericLoader` subclass to specify its default parser at the class
level by overriding `get_parser` (any `parser_kwargs` are forwarded to it), e.g.:

```python
class AudioLoader(GenericLoader):
    @staticmethod
    def get_parser(**kwargs):
        return MyAudioParser(**kwargs)
```
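
For example, a sketch of how such a subclass could then be used (`MyAudioParser` and the file path are illustrative, not part of this PR):

```python
# Hypothetical usage of the AudioLoader subclass defined above.
# "my_recording.mp3" is a placeholder path; since AudioLoader overrides
# get_parser, the default parser resolution picks up MyAudioParser.
loader = AudioLoader.from_filesystem("my_recording.mp3")
docs = loader.load()
```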

The intent of the GenericLoader is to provide on-ramps from different
sources (e.g., web, s3, file system).

An alternative is to use pipelining syntax or to create a Pipeline, e.g.:

```
FileSystemBlobLoader(...) | MyAudioParser
```
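
For comparison, the composition that already works today goes through the GenericLoader constructor directly (a sketch; MyAudioParser and the path are illustrative):

```python
# Explicitly pair a blob loader with a parser, as supported today.
loader = GenericLoader(
    FileSystemBlobLoader("/path/to/audio", glob="**/*.mp3"),
    MyAudioParser(),
)
```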

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Author: Eugene Yurtsev
Date: 2023-12-01 21:23:40 -05:00 (committed by GitHub)
Commit: 6d0209e0aa (parent: 700428593a)
4 changed files with 114 additions and 42 deletions


@@ -61,7 +61,8 @@ class FileSystemBlobLoader(BlobLoader):
"""Initialize with a path to directory and how to glob over it.
Args:
- path: Path to directory to load from
+ path: Path to directory to load from or path to file to load.
+     If a path to a file is provided, glob/exclude/suffixes are ignored.
glob: Glob pattern relative to the specified path
by default set to pick up all non-hidden files
exclude: patterns to exclude from results, use glob syntax
@@ -75,6 +76,10 @@ class FileSystemBlobLoader(BlobLoader):
Examples:
.. code-block:: python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
# Load a single file.
loader = FileSystemBlobLoader("/path/to/file.txt")
# Recursively load all text files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
@@ -118,6 +123,10 @@ class FileSystemBlobLoader(BlobLoader):
def _yield_paths(self) -> Iterable[Path]:
"""Yield paths that match the requested pattern."""
if self.path.is_file():
yield self.path
return
paths = self.path.glob(self.glob)
for path in paths:
if self.exclude:
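
A minimal usage sketch of the single-file behavior added above (the path is a placeholder):

```python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

# When pointed at a single file, glob/exclude/suffixes are ignored.
loader = FileSystemBlobLoader("/path/to/file.txt")
for blob in loader.yield_blobs():
    print(blob.path)
```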


@@ -50,11 +50,9 @@ class ConcurrentLoader(GenericLoader):
show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default",
num_workers: int = 4,
parser_kwargs: Optional[dict] = None,
) -> ConcurrentLoader:
"""
Create a concurrent generic document loader using a
filesystem blob loader.
"""Create a concurrent generic document loader using a filesystem blob loader.
Args:
path: The path to the directory to load documents from.
@@ -66,6 +64,7 @@ class ConcurrentLoader(GenericLoader):
Proxies to the file system loader.
parser: A blob parser which knows how to parse blobs into documents
num_workers: Max number of concurrent workers to use.
parser_kwargs: Keyword arguments to pass to the parser.
"""
blob_loader = FileSystemBlobLoader(
path,
@@ -75,7 +74,11 @@ class ConcurrentLoader(GenericLoader):
show_progress=show_progress,
)
if isinstance(parser, str):
- blob_parser = get_parser(parser)
+ if parser == "default" and cls.get_parser != GenericLoader.get_parser:
+     # There is an implementation of get_parser on the class, use it.
+     blob_parser = cls.get_parser(**(parser_kwargs or {}))
+ else:
+     blob_parser = get_parser(parser)
else:
blob_parser = parser
return cls(blob_loader, blob_parser, num_workers=num_workers)
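
A usage sketch for the updated factory (the path is a placeholder; the class-level default parser branch only applies to subclasses that override `get_parser`):

```python
from langchain.document_loaders.concurrent import ConcurrentLoader

# Load text files concurrently; parser stays at "default", so it is
# resolved from the global parser registry.
loader = ConcurrentLoader.from_filesystem(
    "/path/to/dir", glob="**/*.txt", num_workers=4
)
docs = loader.load()
```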


@@ -1,7 +1,7 @@
from __future__ import annotations
from pathlib import Path
- from typing import Iterator, List, Literal, Optional, Sequence, Union
+ from typing import Any, Iterator, List, Literal, Optional, Sequence, Union
from langchain_core.documents import Document
@@ -23,46 +23,61 @@ class GenericLoader(BaseLoader):
Examples:
.. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
loader = GenericLoader.from_filesystem(
path="path/to/directory",
glob="**/[!.]*",
suffixes=[".pdf"],
show_progress=True,
)
docs = loader.lazy_load()
next(docs)
Example instantiations to change which files are loaded:
.. code-block:: python
# Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
# Recursively load all non-hidden files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
Example instantiations to change which parser is used:
Parse a specific PDF file:
.. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.parsers.pdf import PyPDFParser
# Load a specific PDF file.
loader = GenericLoader.from_filesystem(
"my_lovely_pdf.pdf",
parser=PyPDFParser()
)
.. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
loader = GenericLoader.from_filesystem(
path="path/to/directory",
glob="**/[!.]*",
suffixes=[".pdf"],
show_progress=True,
)
docs = loader.lazy_load()
next(docs)
Example instantiations to change which files are loaded:
.. code-block:: python
# Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
# Recursively load all non-hidden files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
Example instantiations to change which parser is used:
.. code-block:: python
from langchain.document_loaders.parsers.pdf import PyPDFParser
# Recursively load all PDF files in a directory.
loader = GenericLoader.from_filesystem(
"/path/to/dir",
glob="**/*.pdf",
parser=PyPDFParser()
)
"""
def __init__(
@@ -110,18 +125,26 @@ class GenericLoader(BaseLoader):
suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default",
parser_kwargs: Optional[dict] = None,
) -> GenericLoader:
"""Create a generic document loader using a filesystem blob loader.
Args:
- path: The path to the directory to load documents from.
+ path: The path to the directory to load documents from OR the path to a
+     single file to load. If this is a file, glob, exclude, suffixes
+     will be ignored.
glob: The glob pattern to use to find documents.
suffixes: The suffixes to use to filter documents. If None, all files
matching the glob will be loaded.
exclude: A list of patterns to exclude from the loader.
show_progress: Whether to show a progress bar or not (requires tqdm).
Proxies to the file system loader.
- parser: A blob parser which knows how to parse blobs into documents
+ parser: A blob parser which knows how to parse blobs into documents;
+     a default parser will be instantiated if not provided.
+     The default can be overridden either by passing a parser or by
+     overriding the `get_parser` staticmethod (the latter
+     should be used with inheritance).
parser_kwargs: Keyword arguments to pass to the parser.
Returns:
A generic document loader.
@@ -134,7 +157,20 @@ class GenericLoader(BaseLoader):
show_progress=show_progress,
)
if isinstance(parser, str):
- blob_parser = get_parser(parser)
+ if parser == "default":
+     try:
+         # If there is an implementation of get_parser on the class, use it.
+         blob_parser = cls.get_parser(**(parser_kwargs or {}))
+     except NotImplementedError:
+         # If not, fall back to the global registry.
+         blob_parser = get_parser(parser)
+ else:
+     blob_parser = get_parser(parser)
else:
blob_parser = parser
return cls(blob_loader, blob_parser)
@staticmethod
def get_parser(**kwargs: Any) -> BaseBlobParser:
"""Override this method to associate a default parser with the class."""
raise NotImplementedError()
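
A sketch of the class-level default parser from the caller's side (MyTextLoader and the path are illustrative; TextParser is the existing plain-text parser):

```python
from typing import Any

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser


class MyTextLoader(GenericLoader):
    """Illustrative subclass that supplies its own default parser."""

    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        # Any parser_kwargs passed to from_filesystem() arrive here.
        return TextParser()


# parser stays at "default", so get_parser() above is used.
loader = MyTextLoader.from_filesystem("/path/to/dir", suffixes=[".txt"])
docs = loader.load()
```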


@@ -2,7 +2,7 @@
import os
import tempfile
from pathlib import Path
- from typing import Generator, Iterator
+ from typing import Any, Generator, Iterator
import pytest
from langchain_core.documents import Document
@@ -10,6 +10,7 @@ from langchain_core.documents import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser
@pytest.fixture
@@ -78,6 +79,13 @@ def test_from_filesystem_classmethod(toy_dir: str) -> None:
assert docs[0].page_content == "This is a test.txt file."
def test_from_filesystem_classmethod_with_path(toy_dir: str) -> None:
loader = GenericLoader.from_filesystem(os.path.join(toy_dir, "test.txt"))
docs = loader.load()
assert len(docs) == 1
assert docs[0].page_content == "This is a test.txt file."
def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None:
"""Test that glob parameter is taken into account."""
loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser())
@@ -112,3 +120,19 @@ def test_from_filesystem_using_default_parser(toy_dir: str) -> None:
# Glob order seems to be deterministic with recursion. If this test becomes flaky,
# we can sort the docs by page content.
assert docs[0].page_content == "This is a test.txt file."
def test_specifying_parser_via_class_attribute(toy_dir: str) -> None:
class TextLoader(GenericLoader):
"""Parser created for testing purposes."""
@staticmethod
def get_parser(**kwargs: Any) -> BaseBlobParser:
return TextParser()
loader = TextLoader.from_filesystem(toy_dir, suffixes=[".txt"])
docs = loader.load()
assert len(docs) == 3
# Glob order seems to be deterministic with recursion. If this test becomes flaky,
# we can sort the docs by page content.
assert docs[0].page_content == "This is a test.txt file."
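
An additional illustrative test sketch (not part of the commit) exercising parser_kwargs forwarding to the class-level get_parser; UpperCaseParser and UpperCaseLoader are invented for this example:

```python
class UpperCaseParser(BaseBlobParser):
    """Toy parser that optionally upper-cases a blob's text."""

    def __init__(self, uppercase: bool = False) -> None:
        self.uppercase = uppercase

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        text = blob.as_string()
        yield Document(page_content=text.upper() if self.uppercase else text)


class UpperCaseLoader(GenericLoader):
    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        # Receives the parser_kwargs forwarded by from_filesystem().
        return UpperCaseParser(**kwargs)


def test_parser_kwargs_are_forwarded_to_get_parser(toy_dir: str) -> None:
    loader = UpperCaseLoader.from_filesystem(
        toy_dir, suffixes=[".txt"], parser_kwargs={"uppercase": True}
    )
    docs = loader.load()
    assert docs
    assert all(doc.page_content == doc.page_content.upper() for doc in docs)
```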