diff --git a/libs/community/langchain_community/document_loaders/mhtml.py b/libs/community/langchain_community/document_loaders/mhtml.py index 75cace9124f..8652ed9e147 100644 --- a/libs/community/langchain_community/document_loaders/mhtml.py +++ b/libs/community/langchain_community/document_loaders/mhtml.py @@ -1,6 +1,6 @@ import email import logging -from typing import Dict, List, Union +from typing import Dict, Iterator, Union from langchain_core.documents import Document @@ -44,11 +44,11 @@ class MHTMLLoader(BaseLoader): self.bs_kwargs = bs_kwargs self.get_text_separator = get_text_separator - def load(self) -> List[Document]: - from bs4 import BeautifulSoup - + def lazy_load(self) -> Iterator[Document]: """Load MHTML document into document objects.""" + from bs4 import BeautifulSoup + with open(self.file_path, "r", encoding=self.open_encoding) as f: message = email.message_from_string(f.read()) parts = message.get_payload() @@ -72,5 +72,5 @@ class MHTMLLoader(BaseLoader): "source": self.file_path, "title": title, } - return [Document(page_content=text, metadata=metadata)] - return [] + yield Document(page_content=text, metadata=metadata) + return