Improve file system blob loader and generic loader (#14004)

* Add support for passing a specific file to the file system blob loader
* Allow a `GenericLoader` subclass to specify its default parser at the class
level by overriding `get_parser` (any `parser_kwargs` are forwarded to it), e.g.:

```python
class AudioLoader(GenericLoader):
    @staticmethod
    def get_parser(**kwargs):
        return MyAudioParser(**kwargs)
```
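
For example, a sketch of how such a subclass could then be used (`MyAudioParser` and the file path are illustrative, not part of this PR):

```python
# Hypothetical usage of the AudioLoader subclass defined above.
# "my_recording.mp3" is a placeholder path; since AudioLoader overrides
# get_parser, the default parser resolution picks up MyAudioParser.
loader = AudioLoader.from_filesystem("my_recording.mp3")
docs = loader.load()
```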

The intent of the GenericLoader is to provide on-ramps from different
sources (e.g., web, s3, file system).

An alternative is to use pipelining syntax or to create a Pipeline, e.g.:

```
FileSystemBlobLoader(...) | MyAudioParser
```
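
For comparison, the composition that already works today goes through the GenericLoader constructor directly (a sketch; MyAudioParser and the path are illustrative):

```python
# Explicitly pair a blob loader with a parser, as supported today.
loader = GenericLoader(
    FileSystemBlobLoader("/path/to/audio", glob="**/*.mp3"),
    MyAudioParser(),
)
```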

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Author: Eugene Yurtsev
Date: 2023-12-01 21:23:40 -05:00 (committed by GitHub)
Commit: 6d0209e0aa (parent: 700428593a)
4 changed files with 114 additions and 42 deletions


@@ -61,7 +61,8 @@ class FileSystemBlobLoader(BlobLoader):
"""Initialize with a path to directory and how to glob over it.
Args:
- path: Path to directory to load from
+ path: Path to directory to load from or path to file to load.
+     If a path to a file is provided, glob/exclude/suffixes are ignored.
glob: Glob pattern relative to the specified path
by default set to pick up all non-hidden files
exclude: patterns to exclude from results, use glob syntax
@@ -75,6 +76,10 @@ class FileSystemBlobLoader(BlobLoader):
Examples:
.. code-block:: python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
# Load a single file.
loader = FileSystemBlobLoader("/path/to/file.txt")
# Recursively load all text files in a directory.
loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
@@ -118,6 +123,10 @@ class FileSystemBlobLoader(BlobLoader):
def _yield_paths(self) -> Iterable[Path]:
"""Yield paths that match the requested pattern."""
if self.path.is_file():
yield self.path
return
paths = self.path.glob(self.glob)
for path in paths:
if self.exclude:
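
A minimal usage sketch of the single-file behavior added above (the path is a placeholder):

```python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

# When pointed at a single file, glob/exclude/suffixes are ignored.
loader = FileSystemBlobLoader("/path/to/file.txt")
for blob in loader.yield_blobs():
    print(blob.path)
```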


@@ -50,11 +50,9 @@ class ConcurrentLoader(GenericLoader):
show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default",
num_workers: int = 4,
parser_kwargs: Optional[dict] = None,
) -> ConcurrentLoader:
"""
Create a concurrent generic document loader using a
filesystem blob loader.
"""Create a concurrent generic document loader using a filesystem blob loader.
Args:
path: The path to the directory to load documents from.
@@ -66,6 +64,7 @@ class ConcurrentLoader(GenericLoader):
Proxies to the file system loader.
parser: A blob parser which knows how to parse blobs into documents
num_workers: Max number of concurrent workers to use.
parser_kwargs: Keyword arguments to pass to the parser.
"""
blob_loader = FileSystemBlobLoader(
path,
@@ -75,7 +74,11 @@ class ConcurrentLoader(GenericLoader):
show_progress=show_progress,
)
if isinstance(parser, str):
- blob_parser = get_parser(parser)
+ if parser == "default" and cls.get_parser != GenericLoader.get_parser:
+     # There is an implementation of get_parser on the class, use it.
+     blob_parser = cls.get_parser(**(parser_kwargs or {}))
+ else:
+     blob_parser = get_parser(parser)
else:
blob_parser = parser
return cls(blob_loader, blob_parser, num_workers=num_workers)
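
A usage sketch for the updated factory (the path is a placeholder; the class-level default parser branch only applies to subclasses that override `get_parser`):

```python
from langchain.document_loaders.concurrent import ConcurrentLoader

# Load text files concurrently; parser stays at "default", so it is
# resolved from the global parser registry.
loader = ConcurrentLoader.from_filesystem(
    "/path/to/dir", glob="**/*.txt", num_workers=4
)
docs = loader.load()
```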


@@ -1,7 +1,7 @@
from __future__ import annotations
from pathlib import Path
- from typing import Iterator, List, Literal, Optional, Sequence, Union
+ from typing import Any, Iterator, List, Literal, Optional, Sequence, Union
from langchain_core.documents import Document
@@ -23,46 +23,61 @@ class GenericLoader(BaseLoader):
Examples:
.. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
loader = GenericLoader.from_filesystem(
path="path/to/directory",
glob="**/[!.]*",
suffixes=[".pdf"],
show_progress=True,
)
docs = loader.lazy_load()
next(docs)
Example instantiations to change which files are loaded:
.. code-block:: python
# Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
# Recursively load all non-hidden files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
Example instantiations to change which parser is used:
Parse a specific PDF file:
.. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.parsers.pdf import PyPDFParser
# Load a specific PDF file.
loader = GenericLoader.from_filesystem(
"my_lovely_pdf.pdf",
parser=PyPDFParser()
)
.. code-block:: python
from langchain.document_loaders import GenericLoader
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
loader = GenericLoader.from_filesystem(
path="path/to/directory",
glob="**/[!.]*",
suffixes=[".pdf"],
show_progress=True,
)
docs = loader.lazy_load()
next(docs)
Example instantiations to change which files are loaded:
.. code-block:: python
# Recursively load all text files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
# Recursively load all non-hidden files in a directory.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
# Load all files in a directory without recursion.
loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
Example instantiations to change which parser is used:
.. code-block:: python
from langchain.document_loaders.parsers.pdf import PyPDFParser
# Recursively load all PDF files in a directory.
loader = GenericLoader.from_filesystem(
"/path/to/dir",
glob="**/*.pdf",
parser=PyPDFParser()
)
"""
def __init__(
@@ -110,18 +125,26 @@ class GenericLoader(BaseLoader):
suffixes: Optional[Sequence[str]] = None,
show_progress: bool = False,
parser: Union[DEFAULT, BaseBlobParser] = "default",
parser_kwargs: Optional[dict] = None,
) -> GenericLoader:
"""Create a generic document loader using a filesystem blob loader.
Args:
- path: The path to the directory to load documents from.
+ path: The path to the directory to load documents from OR the path to a
+     single file to load. If this is a file, glob, exclude, suffixes
+     will be ignored.
glob: The glob pattern to use to find documents.
suffixes: The suffixes to use to filter documents. If None, all files
matching the glob will be loaded.
exclude: A list of patterns to exclude from the loader.
show_progress: Whether to show a progress bar or not (requires tqdm).
Proxies to the file system loader.
- parser: A blob parser which knows how to parse blobs into documents
+ parser: A blob parser which knows how to parse blobs into documents;
+     a default parser will be instantiated if not provided.
+     The default can be overridden either by passing a parser or by
+     overriding the `get_parser` staticmethod (the latter
+     should be used with inheritance).
parser_kwargs: Keyword arguments to pass to the parser.
Returns:
A generic document loader.
@@ -134,7 +157,20 @@ class GenericLoader(BaseLoader):
show_progress=show_progress,
)
if isinstance(parser, str):
- blob_parser = get_parser(parser)
+ if parser == "default":
+     try:
+         # If there is an implementation of get_parser on the class, use it.
+         blob_parser = cls.get_parser(**(parser_kwargs or {}))
+     except NotImplementedError:
+         # If not, fall back to the global registry.
+         blob_parser = get_parser(parser)
+ else:
+     blob_parser = get_parser(parser)
else:
blob_parser = parser
return cls(blob_loader, blob_parser)
@staticmethod
def get_parser(**kwargs: Any) -> BaseBlobParser:
"""Override this method to associate a default parser with the class."""
raise NotImplementedError()
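
A sketch of the class-level default parser from the caller's side (MyTextLoader and the path are illustrative; TextParser is the existing plain-text parser):

```python
from typing import Any

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser


class MyTextLoader(GenericLoader):
    """Illustrative subclass that supplies its own default parser."""

    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        # Any parser_kwargs passed to from_filesystem() arrive here.
        return TextParser()


# parser stays at "default", so get_parser() above is used.
loader = MyTextLoader.from_filesystem("/path/to/dir", suffixes=[".txt"])
docs = loader.load()
```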


@@ -2,7 +2,7 @@
import os
import tempfile
from pathlib import Path
- from typing import Generator, Iterator
+ from typing import Any, Generator, Iterator
import pytest
from langchain_core.documents import Document
@@ -10,6 +10,7 @@ from langchain_core.documents import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser
@pytest.fixture
@@ -78,6 +79,13 @@ def test_from_filesystem_classmethod(toy_dir: str) -> None:
assert docs[0].page_content == "This is a test.txt file."
def test_from_filesystem_classmethod_with_path(toy_dir: str) -> None:
loader = GenericLoader.from_filesystem(os.path.join(toy_dir, "test.txt"))
docs = loader.load()
assert len(docs) == 1
assert docs[0].page_content == "This is a test.txt file."
def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None:
"""Test that glob parameter is taken into account."""
loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser())
@@ -112,3 +120,19 @@ def test_from_filesystem_using_default_parser(toy_dir: str) -> None:
# Glob order seems to be deterministic with recursion. If this test becomes flaky,
# we can sort the docs by page content.
assert docs[0].page_content == "This is a test.txt file."
def test_specifying_parser_via_class_attribute(toy_dir: str) -> None:
class TextLoader(GenericLoader):
"""Parser created for testing purposes."""
@staticmethod
def get_parser(**kwargs: Any) -> BaseBlobParser:
return TextParser()
loader = TextLoader.from_filesystem(toy_dir, suffixes=[".txt"])
docs = loader.load()
assert len(docs) == 3
# Glob order seems to be deterministic with recursion. If this test becomes flaky,
# we can sort the docs by page content.
assert docs[0].page_content == "This is a test.txt file."
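
An additional illustrative test sketch (not part of the commit) exercising parser_kwargs forwarding to the class-level get_parser; UpperCaseParser and UpperCaseLoader are invented for this example:

```python
class UpperCaseParser(BaseBlobParser):
    """Toy parser that optionally upper-cases a blob's text."""

    def __init__(self, uppercase: bool = False) -> None:
        self.uppercase = uppercase

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        text = blob.as_string()
        yield Document(page_content=text.upper() if self.uppercase else text)


class UpperCaseLoader(GenericLoader):
    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        # Receives the parser_kwargs forwarded by from_filesystem().
        return UpperCaseParser(**kwargs)


def test_parser_kwargs_are_forwarded_to_get_parser(toy_dir: str) -> None:
    loader = UpperCaseLoader.from_filesystem(
        toy_dir, suffixes=[".txt"], parser_kwargs={"uppercase": True}
    )
    docs = loader.load()
    assert docs
    assert all(doc.page_content == doc.page_content.upper() for doc in docs)
```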