diff --git a/langchain/document_loaders/base.py b/langchain/document_loaders/base.py index b89f8785abb..3d1a12fc54a 100644 --- a/langchain/document_loaders/base.py +++ b/langchain/document_loaders/base.py @@ -1,9 +1,10 @@ """Abstract interface for document loader implementations.""" - +import abc from abc import ABC, abstractmethod -from typing import Iterable, List, Optional +from typing import Iterable, Iterator, List, Optional -from langchain.docstore.document import Document +from langchain.document_loaders.blob_loaders import Blob +from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter @@ -44,3 +45,44 @@ class BaseLoader(ABC): raise NotImplementedError( f"{self.__class__.__name__} does not implement lazy_load()" ) + + +class BaseBlobParser(abc.ABC): + """Abstract interface for blob parsers. + + A blob parser provides a way to parse raw data stored in a blob into one + or more documents. + + The parser can be composed with blob loaders, making it easy to re-use + a parser independent of how the blob was originally loaded. + """ + + @abc.abstractmethod + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazy parsing interface. + + Subclasses are required to implement this method. + + Args: + blob: Blob instance + + Returns: + Generator of documents + """ + + def parse(self, blob: Blob) -> List[Document]: + """Eagerly parse the blob into a document or documents. + + This is a convenience method for interactive development environments. + + Production applications should favor the lazy_parse method instead. + + Subclasses should generally not override this parse method. 
+ + Args: + blob: Blob instance + + Returns: + List of documents + """ + return list(self.lazy_parse(blob)) diff --git a/tests/unit_tests/document_loader/test_base.py b/tests/unit_tests/document_loader/test_base.py new file mode 100644 index 00000000000..544113993c2 --- /dev/null +++ b/tests/unit_tests/document_loader/test_base.py @@ -0,0 +1,28 @@ +"""Test Base Schema of documents.""" +from typing import Iterator + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob +from langchain.schema import Document + + +def test_base_blob_parser() -> None: + """Verify that the eager method is hooked up to the lazy method by default.""" + + class MyParser(BaseBlobParser): + """A simple parser that returns a single document.""" + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazy parsing interface.""" + yield Document( + page_content="foo", + ) + + parser = MyParser() + + assert isinstance(parser.lazy_parse(Blob(data="who?")), Iterator) + + # We're verifying that the eager method is hooked up to the lazy method by default. + docs = parser.parse(Blob(data="who?")) + assert len(docs) == 1 + assert docs[0].page_content == "foo"