community[patch]: Implement lazy_load() for MHTMLLoader (#18648)

The new `lazy_load()` implementation is covered by the existing unit tests in `tests/unit_tests/document_loaders/test_mhtml.py`.
This commit is contained in:
Christophe Bornet 2024-03-07 17:50:18 +01:00 committed by GitHub
parent 9745b5894d
commit 6cd7607816
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,6 +1,6 @@
import email import email
import logging import logging
from typing import Dict, List, Union from typing import Dict, Iterator, Union
from langchain_core.documents import Document from langchain_core.documents import Document
@ -44,11 +44,11 @@ class MHTMLLoader(BaseLoader):
self.bs_kwargs = bs_kwargs self.bs_kwargs = bs_kwargs
self.get_text_separator = get_text_separator self.get_text_separator = get_text_separator
def load(self) -> List[Document]: def lazy_load(self) -> Iterator[Document]:
from bs4 import BeautifulSoup
"""Load MHTML document into document objects.""" """Load MHTML document into document objects."""
from bs4 import BeautifulSoup
with open(self.file_path, "r", encoding=self.open_encoding) as f: with open(self.file_path, "r", encoding=self.open_encoding) as f:
message = email.message_from_string(f.read()) message = email.message_from_string(f.read())
parts = message.get_payload() parts = message.get_payload()
@ -72,5 +72,5 @@ class MHTMLLoader(BaseLoader):
"source": self.file_path, "source": self.file_path,
"title": title, "title": title,
} }
return [Document(page_content=text, metadata=metadata)] yield Document(page_content=text, metadata=metadata)
return [] return