Mirror of https://github.com/hwchase17/langchain.git (synced 2025-09-08 22:42:05 +00:00)
Improve file system blob loader and generic loader (#14004)
* Add support for passing a specific file to the file system blob loader.
* Allow specifying a class-level parser for the generic loader:

```python
class AudioLoader(GenericLoader):
    @staticmethod
    def get_parser(**kwargs):
        return MyAudioParser(**kwargs)
```

The intent of the GenericLoader is to provide on-ramps from different sources (e.g., web, s3, file system). An alternative is to use pipelining syntax or to create a Pipeline:

```
FileSystemBlobLoader(...) | MyAudioParser
```

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
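As a rough sketch of how the two additions combine (the `TextFileLoader` subclass and the file path below are illustrative, not part of the commit; `GenericLoader`, `TextParser`, and the import paths appear in the diff that follows):

```python
from typing import Any

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser


class TextFileLoader(GenericLoader):
    """Illustrative loader that binds a default parser to the class."""

    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        # Used when from_filesystem is called with parser="default" (the default).
        return TextParser()


# Pointing the loader at a single file now works; glob/exclude/suffixes are ignored.
loader = TextFileLoader.from_filesystem("/path/to/file.txt")
docs = loader.load()
```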
@@ -61,7 +61,8 @@ class FileSystemBlobLoader(BlobLoader):
        """Initialize with a path to directory and how to glob over it.

        Args:
            path: Path to directory to load from
            path: Path to directory to load from or path to file to load.
                If a path to a file is provided, glob/exclude/suffixes are ignored.
            glob: Glob pattern relative to the specified path
                by default set to pick up all non-hidden files
            exclude: patterns to exclude from results, use glob syntax
@@ -75,6 +76,10 @@ class FileSystemBlobLoader(BlobLoader):
    Examples:

        .. code-block:: python

            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

            # Load a single file.
            loader = FileSystemBlobLoader("/path/to/file.txt")

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
@@ -118,6 +123,10 @@ class FileSystemBlobLoader(BlobLoader):
    def _yield_paths(self) -> Iterable[Path]:
        """Yield paths that match the requested pattern."""
        if self.path.is_file():
            yield self.path
            return

        paths = self.path.glob(self.glob)
        for path in paths:
            if self.exclude:
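A minimal sketch of the single-file short circuit shown in `_yield_paths` above (the file path is a placeholder):

```python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

# When given a file instead of a directory, exactly one blob is yielded
# and glob/exclude/suffixes are ignored.
loader = FileSystemBlobLoader("/path/to/file.txt")
blobs = list(loader.yield_blobs())
assert len(blobs) == 1
```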
@@ -50,11 +50,9 @@ class ConcurrentLoader(GenericLoader):
        show_progress: bool = False,
        parser: Union[DEFAULT, BaseBlobParser] = "default",
        num_workers: int = 4,
        parser_kwargs: Optional[dict] = None,
    ) -> ConcurrentLoader:
        """
        Create a concurrent generic document loader using a
        filesystem blob loader.

        """Create a concurrent generic document loader using a filesystem blob loader.

        Args:
            path: The path to the directory to load documents from.
@@ -66,6 +64,7 @@ class ConcurrentLoader(GenericLoader):
                Proxies to the file system loader.
            parser: A blob parser which knows how to parse blobs into documents
            num_workers: Max number of concurrent workers to use.
            parser_kwargs: Keyword arguments to pass to the parser.
        """
        blob_loader = FileSystemBlobLoader(
            path,
@@ -75,7 +74,11 @@ class ConcurrentLoader(GenericLoader):
            show_progress=show_progress,
        )
        if isinstance(parser, str):
            blob_parser = get_parser(parser)
            if parser == "default" and cls.get_parser != GenericLoader.get_parser:
                # There is an implementation of get_parser on the class, use it.
                blob_parser = cls.get_parser(**(parser_kwargs or {}))
            else:
                blob_parser = get_parser(parser)
        else:
            blob_parser = parser
        return cls(blob_loader, blob_parser, num_workers=num_workers)
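A hedged usage sketch of the parser resolution shown in the hunk above; the module path `langchain.document_loaders.concurrent` and the directory below are assumptions not shown in this diff:

```python
from langchain.document_loaders.concurrent import ConcurrentLoader  # assumed module path

# With the default parser="default", a subclass that overrides get_parser()
# supplies the parser; otherwise the global parser registry is consulted.
loader = ConcurrentLoader.from_filesystem(
    "/path/to/dir",  # assumed directory
    glob="**/*.txt",
    num_workers=4,
)
docs = loader.load()
```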
@@ -1,7 +1,7 @@
from __future__ import annotations

from pathlib import Path
from typing import Iterator, List, Literal, Optional, Sequence, Union
from typing import Any, Iterator, List, Literal, Optional, Sequence, Union

from langchain_core.documents import Document
@@ -23,46 +23,61 @@ class GenericLoader(BaseLoader):
    Examples:

        .. code-block:: python

            from langchain.document_loaders import GenericLoader
            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader

            loader = GenericLoader.from_filesystem(
                path="path/to/directory",
                glob="**/[!.]*",
                suffixes=[".pdf"],
                show_progress=True,
            )

            docs = loader.lazy_load()
            next(docs)

        Example instantiations to change which files are loaded:

        .. code-block:: python

            # Recursively load all text files in a directory.
            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")

        Example instantiations to change which parser is used:
        Parse a specific PDF file:

        .. code-block:: python

            from langchain.document_loaders import GenericLoader
            from langchain.document_loaders.parsers.pdf import PyPDFParser

            # Recursively load all text files in a directory.
            loader = GenericLoader.from_filesystem(
                "/path/to/dir",
                glob="**/*.pdf",
                "my_lovely_pdf.pdf",
                parser=PyPDFParser()
            )

        .. code-block:: python

            from langchain.document_loaders import GenericLoader
            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader


            loader = GenericLoader.from_filesystem(
                path="path/to/directory",
                glob="**/[!.]*",
                suffixes=[".pdf"],
                show_progress=True,
            )

            docs = loader.lazy_load()
            next(docs)

        Example instantiations to change which files are loaded:

        .. code-block:: python

            # Recursively load all text files in a directory.
            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")

        Example instantiations to change which parser is used:

        .. code-block:: python

            from langchain.document_loaders.parsers.pdf import PyPDFParser

            # Recursively load all text files in a directory.
            loader = GenericLoader.from_filesystem(
                "/path/to/dir",
                glob="**/*.pdf",
                parser=PyPDFParser()
            )

    """

    def __init__(
@@ -110,18 +125,26 @@ class GenericLoader(BaseLoader):
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
        parser: Union[DEFAULT, BaseBlobParser] = "default",
        parser_kwargs: Optional[dict] = None,
    ) -> GenericLoader:
        """Create a generic document loader using a filesystem blob loader.

        Args:
            path: The path to the directory to load documents from.
            path: The path to the directory to load documents from OR the path to a
                single file to load. If this is a file, glob, exclude, suffixes
                will be ignored.
            glob: The glob pattern to use to find documents.
            suffixes: The suffixes to use to filter documents. If None, all files
                matching the glob will be loaded.
            exclude: A list of patterns to exclude from the loader.
            show_progress: Whether to show a progress bar or not (requires tqdm).
                Proxies to the file system loader.
            parser: A blob parser which knows how to parse blobs into documents
            parser: A blob parser which knows how to parse blobs into documents,
                will instantiate a default parser if not provided.
                The default can be overridden by either passing a parser or
                setting the class attribute `blob_parser` (the latter
                should be used with inheritance).
            parser_kwargs: Keyword arguments to pass to the parser.

        Returns:
            A generic document loader.
@@ -134,7 +157,20 @@ class GenericLoader(BaseLoader):
            show_progress=show_progress,
        )
        if isinstance(parser, str):
            blob_parser = get_parser(parser)
            if parser == "default":
                try:
                    # If there is an implementation of get_parser on the class, use it.
                    blob_parser = cls.get_parser(**(parser_kwargs or {}))
                except NotImplementedError:
                    # if not then use the global registry.
                    blob_parser = get_parser(parser)
            else:
                blob_parser = get_parser(parser)
        else:
            blob_parser = parser
        return cls(blob_loader, blob_parser)

    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        """Override this method to associate a default parser with the class."""
        raise NotImplementedError()
@@ -2,7 +2,7 @@
import os
import tempfile
from pathlib import Path
from typing import Generator, Iterator
from typing import Any, Generator, Iterator

import pytest
from langchain_core.documents import Document
@@ -10,6 +10,7 @@ from langchain_core.documents import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser


@pytest.fixture
@@ -78,6 +79,13 @@ def test_from_filesystem_classmethod(toy_dir: str) -> None:
    assert docs[0].page_content == "This is a test.txt file."


def test_from_filesystem_classmethod_with_path(toy_dir: str) -> None:
    loader = GenericLoader.from_filesystem(os.path.join(toy_dir, "test.txt"))
    docs = loader.load()
    assert len(docs) == 1
    assert docs[0].page_content == "This is a test.txt file."


def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None:
    """Test that glob parameter is taken into account."""
    loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser())
@@ -112,3 +120,19 @@ def test_from_filesystem_using_default_parser(toy_dir: str) -> None:
    # Glob order seems to be deterministic with recursion. If this test becomes flaky,
    # we can sort the docs by page content.
    assert docs[0].page_content == "This is a test.txt file."


def test_specifying_parser_via_class_attribute(toy_dir: str) -> None:
    class TextLoader(GenericLoader):
        """Parser created for testing purposes."""

        @staticmethod
        def get_parser(**kwargs: Any) -> BaseBlobParser:
            return TextParser()

    loader = TextLoader.from_filesystem(toy_dir, suffixes=[".txt"])
    docs = loader.load()
    assert len(docs) == 3
    # Glob order seems to be deterministic with recursion. If this test becomes flaky,
    # we can sort the docs by page content.
    assert docs[0].page_content == "This is a test.txt file."