mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-10 17:46:15 +00:00
community[patch]: Implement lazy_load() for MHTMLLoader (#18648)
This change is covered by `tests/unit_tests/document_loaders/test_mhtml.py`.
This commit is contained in:
parent
9745b5894d
commit
6cd7607816
@ -1,6 +1,6 @@
|
|||||||
import email
|
import email
|
||||||
import logging
|
import logging
|
||||||
from typing import Dict, List, Union
|
from typing import Dict, Iterator, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -44,11 +44,11 @@ class MHTMLLoader(BaseLoader):
|
|||||||
self.bs_kwargs = bs_kwargs
|
self.bs_kwargs = bs_kwargs
|
||||||
self.get_text_separator = get_text_separator
|
self.get_text_separator = get_text_separator
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
"""Load MHTML document into document objects."""
|
"""Load MHTML document into document objects."""
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
with open(self.file_path, "r", encoding=self.open_encoding) as f:
|
with open(self.file_path, "r", encoding=self.open_encoding) as f:
|
||||||
message = email.message_from_string(f.read())
|
message = email.message_from_string(f.read())
|
||||||
parts = message.get_payload()
|
parts = message.get_payload()
|
||||||
@ -72,5 +72,5 @@ class MHTMLLoader(BaseLoader):
|
|||||||
"source": self.file_path,
|
"source": self.file_path,
|
||||||
"title": title,
|
"title": title,
|
||||||
}
|
}
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
yield Document(page_content=text, metadata=metadata)
|
||||||
return []
|
return
|
||||||
|
Loading…
Reference in New Issue
Block a user