mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-16 17:53:37 +00:00
Add lazy iteration interface to document loaders (#3659)
Adds a lazy iteration interface for document loaders (plan: https://github.com/hwchase17/langchain/pull/2833). The `load` method is kept as is for backwards compatibility: it returns a materialized list of documents, and downstream users may rely on that fact. A new method that returns an iterable is introduced to handle lazy loading. --------- Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
This commit is contained in:
parent
8a54217e7b
commit
2052e70664
@ -1,15 +1,25 @@
|
|||||||
"""Base loader class."""
|
"""Abstract interface for document loader implementations."""
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import List, Optional
|
from typing import Iterable, List, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||||
|
|
||||||
|
|
||||||
class BaseLoader(ABC):
|
class BaseLoader(ABC):
|
||||||
"""Base loader class."""
|
"""Interface for loading documents.
|
||||||
|
|
||||||
|
Implementations should implement the lazy-loading method using generators
|
||||||
|
to avoid loading all documents into memory at once.
|
||||||
|
|
||||||
|
The `load` method will remain as is for backwards compatibility, but its
|
||||||
|
implementation should be just `list(self.lazy_load())`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Sub-classes should implement this method
|
||||||
|
# as return list(self.lazy_load()).
|
||||||
|
# This method returns a List which is materialized in memory.
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load data into document objects."""
|
"""Load data into document objects."""
|
||||||
@ -24,3 +34,13 @@ class BaseLoader(ABC):
|
|||||||
_text_splitter = text_splitter
|
_text_splitter = text_splitter
|
||||||
docs = self.load()
|
docs = self.load()
|
||||||
return _text_splitter.split_documents(docs)
|
return _text_splitter.split_documents(docs)
|
||||||
|
|
||||||
|
# Attention: This method will be upgraded into an abstractmethod once it's
|
||||||
|
# implemented in all the existing subclasses.
|
||||||
|
def lazy_load(
|
||||||
|
self,
|
||||||
|
) -> Iterable[Document]:
|
||||||
|
"""A lazy loader for document content."""
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"{self.__class__.__name__} does not implement lazy_load()"
|
||||||
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user