mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 22:29:51 +00:00
Add BlobParser abstraction (#3979)
This PR adds the BlobParser abstraction. It follows the proposal described here: https://github.com/hwchase17/langchain/pull/2833#issuecomment-1509097756
This commit is contained in:
parent
5ca13cc1f0
commit
423f497168
@ -1,9 +1,10 @@
|
|||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
|
import abc
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Iterable, List, Optional
|
from typing import Iterable, Iterator, List, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||||
|
|
||||||
|
|
||||||
@ -44,3 +45,44 @@ class BaseLoader(ABC):
|
|||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
f"{self.__class__.__name__} does not implement lazy_load()"
|
f"{self.__class__.__name__} does not implement lazy_load()"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class BaseBlobParser(abc.ABC):
|
||||||
|
"""Abstract interface for blob parsers.
|
||||||
|
|
||||||
|
A blob parser provides a way to parse raw data stored in a blob into one
|
||||||
|
or more documents.
|
||||||
|
|
||||||
|
The parser can be composed with blob loaders, making it easy to re-use
|
||||||
|
a parser independent of how the blob was originally loaded.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@abc.abstractmethod
|
||||||
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||||
|
"""Lazy parsing interface.
|
||||||
|
|
||||||
|
Subclasses are required to implement this method.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
blob: Blob instance
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Generator of documents
|
||||||
|
"""
|
||||||
|
|
||||||
|
def parse(self, blob: Blob) -> List[Document]:
|
||||||
|
"""Eagerly parse the blob into a document or documents.
|
||||||
|
|
||||||
|
This is a convenience method for interactive development environments.
|
||||||
|
|
||||||
|
Production applications should favor the lazy_parse method instead.
|
||||||
|
|
||||||
|
Subclasses should generally not override this parse method.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
blob: Blob instance
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of documents
|
||||||
|
"""
|
||||||
|
return list(self.lazy_parse(blob))
|
||||||
|
28
tests/unit_tests/document_loader/test_base.py
Normal file
28
tests/unit_tests/document_loader/test_base.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
"""Test the base blob parser interface of document loaders."""
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
from langchain.document_loaders.base import BaseBlobParser
|
||||||
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
from langchain.schema import Document
|
||||||
|
|
||||||
|
|
||||||
|
def test_base_blob_parser() -> None:
|
||||||
|
"""Verify that the eager method is hooked up to the lazy method by default."""
|
||||||
|
|
||||||
|
class MyParser(BaseBlobParser):
|
||||||
|
"""A simple parser that returns a single document."""
|
||||||
|
|
||||||
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||||
|
"""Lazy parsing interface."""
|
||||||
|
yield Document(
|
||||||
|
page_content="foo",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser = MyParser()
|
||||||
|
|
||||||
|
assert isinstance(parser.lazy_parse(Blob(data="who?")), Iterator)
|
||||||
|
|
||||||
|
# We're verifying that the eager method is hooked up to the lazy method by default.
|
||||||
|
docs = parser.parse(Blob(data="who?"))
|
||||||
|
assert len(docs) == 1
|
||||||
|
assert docs[0].page_content == "foo"
|
Loading…
Reference in New Issue
Block a user