Mirror of https://github.com/hwchase17/langchain.git (synced 2025-09-10 07:21:03 +00:00)
Improve file system blob loader and generic loader (#14004)
* Add support for passing a specific file to the file system blob loader.
* Allow specifying a parser at the class level for the generic loader:

  ```python
  class AudioLoader(GenericLoader):
      @staticmethod
      def get_parser(**kwargs):
          return MyAudioParser(**kwargs)
  ```

The intent of the GenericLoader is to provide on-ramps from different sources (e.g., web, S3, file system). An alternative is to use pipelining syntax or to create a Pipeline:

```
FileSystemBlobLoader(...) | MyAudioParser
```

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
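As a concrete illustration of the new class-level parser hook, here is a minimal sketch modeled on the test added in this commit (the `TextOnlyLoader` name and the `/path/to/dir` path are placeholders; imports follow the `langchain.document_loaders` layout used throughout this diff):

```python
from typing import Any

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser


class TextOnlyLoader(GenericLoader):
    """Illustrative loader that always parses blobs as plain text."""

    @staticmethod
    def get_parser(**kwargs: Any) -> BaseBlobParser:
        # from_filesystem calls this hook when parser="default" (the default);
        # any parser_kwargs passed to from_filesystem arrive here as kwargs.
        return TextParser()


loader = TextOnlyLoader.from_filesystem("/path/to/dir", suffixes=[".txt"])
docs = loader.load()
```

Loaders that do not override `get_parser` fall back to the global parser registry, so existing callers keep their previous behavior.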
@@ -61,7 +61,8 @@ class FileSystemBlobLoader(BlobLoader):
         """Initialize with a path to directory and how to glob over it.
 
         Args:
-            path: Path to directory to load from
+            path: Path to directory to load from or path to file to load.
+                  If a path to a file is provided, glob/exclude/suffixes are ignored.
             glob: Glob pattern relative to the specified path
                 by default set to pick up all non-hidden files
             exclude: patterns to exclude from results, use glob syntax
@@ -75,6 +76,10 @@ class FileSystemBlobLoader(BlobLoader):
         Examples:
 
         .. code-block:: python
+            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
+
+            # Load a single file.
+            loader = FileSystemBlobLoader("/path/to/file.txt")
 
             # Recursively load all text files in a directory.
             loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")
@@ -118,6 +123,10 @@ class FileSystemBlobLoader(BlobLoader):
 
     def _yield_paths(self) -> Iterable[Path]:
         """Yield paths that match the requested pattern."""
+        if self.path.is_file():
+            yield self.path
+            return
+
         paths = self.path.glob(self.glob)
         for path in paths:
             if self.exclude:
@@ -50,11 +50,9 @@ class ConcurrentLoader(GenericLoader):
         show_progress: bool = False,
         parser: Union[DEFAULT, BaseBlobParser] = "default",
         num_workers: int = 4,
+        parser_kwargs: Optional[dict] = None,
     ) -> ConcurrentLoader:
-        """
-        Create a concurrent generic document loader using a
-        filesystem blob loader.
-
+        """Create a concurrent generic document loader using a filesystem blob loader.
 
         Args:
             path: The path to the directory to load documents from.
@@ -66,6 +64,7 @@ class ConcurrentLoader(GenericLoader):
                 Proxies to the file system loader.
             parser: A blob parser which knows how to parse blobs into documents
             num_workers: Max number of concurrent workers to use.
+            parser_kwargs: Keyword arguments to pass to the parser.
         """
         blob_loader = FileSystemBlobLoader(
             path,
@@ -75,7 +74,11 @@ class ConcurrentLoader(GenericLoader):
             show_progress=show_progress,
         )
         if isinstance(parser, str):
-            blob_parser = get_parser(parser)
+            if parser == "default" and cls.get_parser != GenericLoader.get_parser:
+                # There is an implementation of get_parser on the class, use it.
+                blob_parser = cls.get_parser(**(parser_kwargs or {}))
+            else:
+                blob_parser = get_parser(parser)
         else:
             blob_parser = parser
         return cls(blob_loader, blob_parser, num_workers=num_workers)
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Iterator, List, Literal, Optional, Sequence, Union
+from typing import Any, Iterator, List, Literal, Optional, Sequence, Union
 
 from langchain_core.documents import Document
 
@@ -23,46 +23,61 @@ class GenericLoader(BaseLoader):
 
     Examples:
 
-        .. code-block:: python
+        Parse a specific PDF file:
 
-            from langchain.document_loaders import GenericLoader
-            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
-
-            loader = GenericLoader.from_filesystem(
-                path="path/to/directory",
-                glob="**/[!.]*",
-                suffixes=[".pdf"],
-                show_progress=True,
-            )
-
-            docs = loader.lazy_load()
-            next(docs)
-
-        Example instantiations to change which files are loaded:
-
-        .. code-block:: python
-
-            # Recursively load all text files in a directory.
-            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
-
-            # Recursively load all non-hidden files in a directory.
-            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
-
-            # Load all files in a directory without recursion.
-            loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
-
-        Example instantiations to change which parser is used:
-
         .. code-block:: python
 
+            from langchain.document_loaders import GenericLoader
             from langchain.document_loaders.parsers.pdf import PyPDFParser
 
             # Recursively load all text files in a directory.
             loader = GenericLoader.from_filesystem(
-                "/path/to/dir",
-                glob="**/*.pdf",
+                "my_lovely_pdf.pdf",
                 parser=PyPDFParser()
             )
+
+        .. code-block:: python
+
+            from langchain.document_loaders import GenericLoader
+            from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
+
+
+            loader = GenericLoader.from_filesystem(
+                path="path/to/directory",
+                glob="**/[!.]*",
+                suffixes=[".pdf"],
+                show_progress=True,
+            )
+
+            docs = loader.lazy_load()
+            next(docs)
+
+        Example instantiations to change which files are loaded:
+
+        .. code-block:: python
+
+            # Recursively load all text files in a directory.
+            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt")
+
+            # Recursively load all non-hidden files in a directory.
+            loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*")
+
+            # Load all files in a directory without recursion.
+            loader = GenericLoader.from_filesystem("/path/to/dir", glob="*")
+
+        Example instantiations to change which parser is used:
+
+        .. code-block:: python
+
+            from langchain.document_loaders.parsers.pdf import PyPDFParser
+
+            # Recursively load all text files in a directory.
+            loader = GenericLoader.from_filesystem(
+                "/path/to/dir",
+                glob="**/*.pdf",
+                parser=PyPDFParser()
+            )
+
     """
 
     def __init__(
@@ -110,18 +125,26 @@ class GenericLoader(BaseLoader):
         suffixes: Optional[Sequence[str]] = None,
         show_progress: bool = False,
         parser: Union[DEFAULT, BaseBlobParser] = "default",
+        parser_kwargs: Optional[dict] = None,
     ) -> GenericLoader:
         """Create a generic document loader using a filesystem blob loader.
 
         Args:
-            path: The path to the directory to load documents from.
+            path: The path to the directory to load documents from OR the path to a
+                single file to load. If this is a file, glob, exclude, suffixes
+                will be ignored.
             glob: The glob pattern to use to find documents.
             suffixes: The suffixes to use to filter documents. If None, all files
                 matching the glob will be loaded.
             exclude: A list of patterns to exclude from the loader.
             show_progress: Whether to show a progress bar or not (requires tqdm).
                 Proxies to the file system loader.
-            parser: A blob parser which knows how to parse blobs into documents
+            parser: A blob parser which knows how to parse blobs into documents,
+                will instantiate a default parser if not provided.
+                The default can be overridden by either passing a parser or
+                setting the class attribute `blob_parser` (the latter
+                should be used with inheritance).
+            parser_kwargs: Keyword arguments to pass to the parser.
 
         Returns:
             A generic document loader.
@@ -134,7 +157,20 @@ class GenericLoader(BaseLoader):
             show_progress=show_progress,
         )
         if isinstance(parser, str):
-            blob_parser = get_parser(parser)
+            if parser == "default":
+                try:
+                    # If there is an implementation of get_parser on the class, use it.
+                    blob_parser = cls.get_parser(**(parser_kwargs or {}))
+                except NotImplementedError:
+                    # if not then use the global registry.
+                    blob_parser = get_parser(parser)
+            else:
+                blob_parser = get_parser(parser)
         else:
             blob_parser = parser
         return cls(blob_loader, blob_parser)
+
+    @staticmethod
+    def get_parser(**kwargs: Any) -> BaseBlobParser:
+        """Override this method to associate a default parser with the class."""
+        raise NotImplementedError()
@@ -2,7 +2,7 @@
 import os
 import tempfile
 from pathlib import Path
-from typing import Generator, Iterator
+from typing import Any, Generator, Iterator
 
 import pytest
 from langchain_core.documents import Document
@@ -10,6 +10,7 @@ from langchain_core.documents import Document
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader
 from langchain.document_loaders.generic import GenericLoader
+from langchain.document_loaders.parsers.txt import TextParser
 
 
 @pytest.fixture
@@ -78,6 +79,13 @@ def test_from_filesystem_classmethod(toy_dir: str) -> None:
     assert docs[0].page_content == "This is a test.txt file."
 
 
+def test_from_filesystem_classmethod_with_path(toy_dir: str) -> None:
+    loader = GenericLoader.from_filesystem(os.path.join(toy_dir, "test.txt"))
+    docs = loader.load()
+    assert len(docs) == 1
+    assert docs[0].page_content == "This is a test.txt file."
+
+
 def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None:
     """Test that glob parameter is taken into account."""
     loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser())
@@ -112,3 +120,19 @@ def test_from_filesystem_using_default_parser(toy_dir: str) -> None:
     # Glob order seems to be deterministic with recursion. If this test becomes flaky,
     # we can sort the docs by page content.
     assert docs[0].page_content == "This is a test.txt file."
+
+
+def test_specifying_parser_via_class_attribute(toy_dir: str) -> None:
+    class TextLoader(GenericLoader):
+        """Parser created for testing purposes."""
+
+        @staticmethod
+        def get_parser(**kwargs: Any) -> BaseBlobParser:
+            return TextParser()
+
+    loader = TextLoader.from_filesystem(toy_dir, suffixes=[".txt"])
+    docs = loader.load()
+    assert len(docs) == 3
+    # Glob order seems to be deterministic with recursion. If this test becomes flaky,
+    # we can sort the docs by page content.
+    assert docs[0].page_content == "This is a test.txt file."
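To round things off, a small usage sketch of the single-file support introduced above, based on the docstring examples and the new `test_from_filesystem_classmethod_with_path` test (the paths are placeholders):

```python
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
from langchain.document_loaders.generic import GenericLoader

# Point the blob loader at a single file; glob/exclude/suffixes are ignored.
blob_loader = FileSystemBlobLoader("/path/to/file.txt")
blobs = list(blob_loader.yield_blobs())

# Or go straight from a single file to documents via the generic loader.
loader = GenericLoader.from_filesystem("/path/to/file.txt")
docs = loader.load()
```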