Improve file system blob loader and generic loader (#14004)

* Add support for passing a specific file to the file system blob loader
* Allow specifying a class parameter for the parser for the generic
loader

```python

class AudioLoader(GenericLoader):
  @staticmethod
  def get_parser(**kwargs):
     return MyAudioParser(**kwargs)
```

The intent of the GenericLoader is to provide on-ramps for loading content
from different sources (e.g., web, s3, file system).

An alternative is to use pipelining syntax or to create a Pipeline

```
FileSystemBlobLoader(...) | MyAudioParser
```

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Eugene Yurtsev
2023-12-01 21:23:40 -05:00
committed by GitHub
parent 700428593a
commit 6d0209e0aa
4 changed files with 114 additions and 42 deletions

View File

@@ -61,7 +61,8 @@ class FileSystemBlobLoader(BlobLoader):
"""Initialize with a path to directory and how to glob over it. """Initialize with a path to directory and how to glob over it.
Args: Args:
path: Path to directory to load from path: Path to directory to load from or path to file to load.
If a path to a file is provided, glob/exclude/suffixes are ignored.
glob: Glob pattern relative to the specified path glob: Glob pattern relative to the specified path
by default set to pick up all non-hidden files by default set to pick up all non-hidden files
exclude: patterns to exclude from results, use glob syntax exclude: patterns to exclude from results, use glob syntax
@@ -75,6 +76,10 @@ class FileSystemBlobLoader(BlobLoader):
Examples: Examples:
.. code-block:: python .. code-block:: python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
# Load a single file.
loader = FileSystemBlobLoader("/path/to/file.txt")
# Recursively load all text files in a directory. # Recursively load all text files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt") loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
@@ -118,6 +123,10 @@ class FileSystemBlobLoader(BlobLoader):
def _yield_paths(self) -> Iterable[Path]: def _yield_paths(self) -> Iterable[Path]:
"""Yield paths that match the requested pattern.""" """Yield paths that match the requested pattern."""
if self.path.is_file():
yield self.path
return
paths = self.path.glob(self.glob) paths = self.path.glob(self.glob)
for path in paths: for path in paths:
if self.exclude: if self.exclude:

View File

@@ -50,11 +50,9 @@ class ConcurrentLoader(GenericLoader):
show_progress: bool = False, show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default", parser: Union[DEFAULT, BaseBlobParser] = "default",
num_workers: int = 4, num_workers: int = 4,
parser_kwargs: Optional[dict] = None,
) -> ConcurrentLoader: ) -> ConcurrentLoader:
""" """Create a concurrent generic document loader using a filesystem blob loader.
Create a concurrent generic document loader using a
filesystem blob loader.
Args: Args:
path: The path to the directory to load documents from. path: The path to the directory to load documents from.
@@ -66,6 +64,7 @@ class ConcurrentLoader(GenericLoader):
Proxies to the file system loader. Proxies to the file system loader.
parser: A blob parser which knows how to parse blobs into documents parser: A blob parser which knows how to parse blobs into documents
num_workers: Max number of concurrent workers to use. num_workers: Max number of concurrent workers to use.
parser_kwargs: Keyword arguments to pass to the parser.
""" """
blob_loader = FileSystemBlobLoader( blob_loader = FileSystemBlobLoader(
path, path,
@@ -75,7 +74,11 @@ class ConcurrentLoader(GenericLoader):
show_progress=show_progress, show_progress=show_progress,
) )
if isinstance(parser, str): if isinstance(parser, str):
blob_parser = get_parser(parser) if parser == "default" and cls.get_parser != GenericLoader.get_parser:
# There is an implementation of get_parser on the class, use it.
blob_parser = cls.get_parser(**(parser_kwargs or {}))
else:
blob_parser = get_parser(parser)
else: else:
blob_parser = parser blob_parser = parser
return cls(blob_loader, blob_parser, num_workers=num_workers) return cls(blob_loader, blob_parser, num_workers=num_workers)

View File

@@ -1,7 +1,7 @@
from __future__ import annotations from __future__ import annotations
from pathlib import Path from pathlib import Path
from typing import Iterator, List, Literal, Optional, Sequence, Union from typing import Any, Iterator, List, Literal, Optional, Sequence, Union
from langchain_core.documents import Document from langchain_core.documents import Document
@@ -23,46 +23,61 @@ class GenericLoader(BaseLoader):
Examples: Examples:
.. code-block:: python Parse a specific PDF file:
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
loader = GenericLoader.from_filesystem(
path="path/to/directory",
glob="**/[!.]*",
suffixes=[".pdf"],
show_progress=True,
)
docs = loader.lazy_load()
next(docs)
Example instantiations to change which files are loaded:
.. code-block:: python
# Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
# Recursively load all non-hidden files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
Example instantiations to change which parser is used:
.. code-block:: python .. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.parsers.pdf import PyPDFParser from langchain.document_loaders.parsers.pdf import PyPDFParser
# Recursively load all text files in a directory. # Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem( loader = GenericLoader.from_filesystem(
"/path/to/dir", "my_lovely_pdf.pdf",
glob="**/*.pdf",
parser=PyPDFParser() parser=PyPDFParser()
) )
.. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
loader = GenericLoader.from_filesystem(
path="path/to/directory",
glob="**/[!.]*",
suffixes=[".pdf"],
show_progress=True,
)
docs = loader.lazy_load()
next(docs)
Example instantiations to change which files are loaded:
.. code-block:: python
# Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
# Recursively load all non-hidden files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
Example instantiations to change which parser is used:
.. code-block:: python
from langchain.document_loaders.parsers.pdf import PyPDFParser
# Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem(
"/path/to/dir",
glob="**/*.pdf",
parser=PyPDFParser()
)
""" """
def __init__( def __init__(
@@ -110,18 +125,26 @@ class GenericLoader(BaseLoader):
suffixes: Optional[Sequence[str]] = None, suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False, show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default", parser: Union[DEFAULT, BaseBlobParser] = "default",
parser_kwargs: Optional[dict] = None,
) -> GenericLoader: ) -> GenericLoader:
"""Create a generic document loader using a filesystem blob loader. """Create a generic document loader using a filesystem blob loader.
Args: Args:
path: The path to the directory to load documents from. path: The path to the directory to load documents from OR the path to a
single file to load. If this is a file, glob, exclude, suffixes
will be ignored.
glob: The glob pattern to use to find documents. glob: The glob pattern to use to find documents.
suffixes: The suffixes to use to filter documents. If None, all files suffixes: The suffixes to use to filter documents. If None, all files
matching the glob will be loaded. matching the glob will be loaded.
exclude: A list of patterns to exclude from the loader. exclude: A list of patterns to exclude from the loader.
show_progress: Whether to show a progress bar or not (requires tqdm). show_progress: Whether to show a progress bar or not (requires tqdm).
Proxies to the file system loader. Proxies to the file system loader.
parser: A blob parser which knows how to parse blobs into documents parser: A blob parser which knows how to parse blobs into documents,
will instantiate a default parser if not provided.
The default can be overridden by either passing a parser or
setting the class attribute `blob_parser` (the latter
should be used with inheritance).
parser_kwargs: Keyword arguments to pass to the parser.
Returns: Returns:
A generic document loader. A generic document loader.
@@ -134,7 +157,20 @@ class GenericLoader(BaseLoader):
show_progress=show_progress, show_progress=show_progress,
) )
if isinstance(parser, str): if isinstance(parser, str):
blob_parser = get_parser(parser) if parser == "default":
try:
# If there is an implementation of get_parser on the class, use it.
blob_parser = cls.get_parser(**(parser_kwargs or {}))
except NotImplementedError:
# if not then use the global registry.
blob_parser = get_parser(parser)
else:
blob_parser = get_parser(parser)
else: else:
blob_parser = parser blob_parser = parser
return cls(blob_loader, blob_parser) return cls(blob_loader, blob_parser)
@staticmethod
def get_parser(**kwargs: Any) -> BaseBlobParser:
"""Override this method to associate a default parser with the class."""
raise NotImplementedError()

View File

@@ -2,7 +2,7 @@
import os import os
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from typing import Generator, Iterator from typing import Any, Generator, Iterator
import pytest import pytest
from langchain_core.documents import Document from langchain_core.documents import Document
@@ -10,6 +10,7 @@ from langchain_core.documents import Document
from langchain.document_loaders.base import BaseBlobParser from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader
from langchain.document_loaders.generic import GenericLoader from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser
@pytest.fixture @pytest.fixture
@@ -78,6 +79,13 @@ def test_from_filesystem_classmethod(toy_dir: str) -> None:
assert docs[0].page_content == "This is a test.txt file." assert docs[0].page_content == "This is a test.txt file."
def test_from_filesystem_classmethod_with_path(toy_dir: str) -> None:
loader = GenericLoader.from_filesystem(os.path.join(toy_dir, "test.txt"))
docs = loader.load()
assert len(docs) == 1
assert docs[0].page_content == "This is a test.txt file."
def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None: def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None:
"""Test that glob parameter is taken into account.""" """Test that glob parameter is taken into account."""
loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser()) loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser())
@@ -112,3 +120,19 @@ def test_from_filesystem_using_default_parser(toy_dir: str) -> None:
# Glob order seems to be deterministic with recursion. If this test becomes flaky, # Glob order seems to be deterministic with recursion. If this test becomes flaky,
# we can sort the docs by page content. # we can sort the docs by page content.
assert docs[0].page_content == "This is a test.txt file." assert docs[0].page_content == "This is a test.txt file."
def test_specifying_parser_via_class_attribute(toy_dir: str) -> None:
class TextLoader(GenericLoader):
"""Parser created for testing purposes."""
@staticmethod
def get_parser(**kwargs: Any) -> BaseBlobParser:
return TextParser()
loader = TextLoader.from_filesystem(toy_dir, suffixes=[".txt"])
docs = loader.load()
assert len(docs) == 3
# Glob order seems to be deterministic with recursion. If this test becomes flaky,
# we can sort the docs by page content.
assert docs[0].page_content == "This is a test.txt file."