From 6cd7607816f676de62225040a70a3c89f9b6088c Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Thu, 7 Mar 2024 17:50:18 +0100 Subject: [PATCH] community[patch]: Implement lazy_load() for MHTMLLoader (#18648) Covered by `tests/unit_tests/document_loaders/test_mhtml.py` --- .../langchain_community/document_loaders/mhtml.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/mhtml.py b/libs/community/langchain_community/document_loaders/mhtml.py index 75cace9124f..8652ed9e147 100644 --- a/libs/community/langchain_community/document_loaders/mhtml.py +++ b/libs/community/langchain_community/document_loaders/mhtml.py @@ -1,6 +1,6 @@ import email import logging -from typing import Dict, List, Union +from typing import Dict, Iterator, Union from langchain_core.documents import Document @@ -44,11 +44,11 @@ class MHTMLLoader(BaseLoader): self.bs_kwargs = bs_kwargs self.get_text_separator = get_text_separator - def load(self) -> List[Document]: - from bs4 import BeautifulSoup - + def lazy_load(self) -> Iterator[Document]: """Load MHTML document into document objects.""" + from bs4 import BeautifulSoup + with open(self.file_path, "r", encoding=self.open_encoding) as f: message = email.message_from_string(f.read()) parts = message.get_payload() @@ -72,5 +72,5 @@ class MHTMLLoader(BaseLoader): "source": self.file_path, "title": title, } - return [Document(page_content=text, metadata=metadata)] - return [] + yield Document(page_content=text, metadata=metadata) + return