mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 00:47:27 +00:00
Add lazy iteration interface to document loaders (#3659)
Adds a lazy iteration interface for document loaders, following the plan in https://github.com/hwchase17/langchain/pull/2833. The `load` method is kept as is for backwards compatibility: it returns a materialized list of documents, and downstream users may rely on that fact. A new method that returns an iterable is introduced to handle lazy loading. --------- Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
This commit is contained in:
parent
8a54217e7b
commit
2052e70664
@ -1,15 +1,25 @@
|
||||
"""Base loader class."""
|
||||
"""Abstract interface for document loader implementations."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Optional
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||
|
||||
|
||||
class BaseLoader(ABC):
|
||||
"""Base loader class."""
|
||||
"""Interface for loading documents.
|
||||
|
||||
Implementations should implement the lazy-loading method using generators
|
||||
to avoid loading all documents into memory at once.
|
||||
|
||||
The `load` method will remain as is for backwards compatibility, but its
|
||||
implementation should be just `list(self.lazy_load())`.
|
||||
"""
|
||||
|
||||
# Sub-classes should implement this method
|
||||
# as `return list(self.lazy_load())`.
|
||||
# This method returns a List which is materialized in memory.
|
||||
@abstractmethod
def load(self) -> List[Document]:
    """Load the data and return it as a list of Document objects.

    This method is abstract: every concrete loader must override it.
    """
|
||||
@ -24,3 +34,13 @@ class BaseLoader(ABC):
|
||||
_text_splitter = text_splitter
|
||||
docs = self.load()
|
||||
return _text_splitter.split_documents(docs)
|
||||
|
||||
# Attention: This method will be upgraded into an abstractmethod once it's
|
||||
# implemented in all the existing subclasses.
|
||||
def lazy_load(self) -> Iterable[Document]:
    """Lazily load document content.

    This base implementation is only a placeholder and never yields
    anything; it reports which concrete class is missing the method.

    Raises:
        NotImplementedError: Always, with a message naming the class of
            the instance the method was called on.
    """
    loader_name = self.__class__.__name__
    raise NotImplementedError(f"{loader_name} does not implement lazy_load()")
|
||||
|
Loading…
Reference in New Issue
Block a user