mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-15 22:44:36 +00:00
Added a MHTML document loader (#6311)
MHTML is a very interesting format since it's used both for emails and for archived webpages. Some scraping projects want to store pages on disk to process them later, and MHTML is perfect for that use case. This is heavily inspired by the BeautifulSoup HTML loader, but extracts the HTML part from the MHTML file. --------- Co-authored-by: rlm <pexpresss31@gmail.com>
This commit is contained in:
committed by
GitHub
parent
05eec99269
commit
87802c86d9
25
tests/unit_tests/document_loaders/test_mhtml.py
Normal file
25
tests/unit_tests/document_loaders/test_mhtml.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders.mhtml import MHTMLLoader
|
||||
|
||||
# Directory containing this test module.
HERE = Path(__file__).parent
# Shared example fixtures: two directory levels above this module, then
# down into "integration_tests/examples".
EXAMPLES = HERE.parent.parent.joinpath("integration_tests", "examples")
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4", "lxml")
def test_mhtml_loader() -> None:
    """Load the example ``.mht`` file and check the resulting document.

    The loader is given the path of ``example.mht`` inside ``EXAMPLES`` as a
    string. The test expects exactly one document whose metadata carries the
    page title and the source path, and whose text contains the page heading.
    """
    example_path = EXAMPLES / "example.mht"
    documents = MHTMLLoader(str(example_path)).load()

    # The whole archive is expected to collapse into a single document.
    assert len(documents) == 1

    document = documents[0]
    meta, text = document.metadata, document.page_content

    assert meta["title"] == "LangChain"
    # "source" must echo back the exact string handed to the loader.
    assert meta["source"] == str(example_path)
    assert "LANG CHAIN 🦜️🔗Official Home Page" in text
|
Reference in New Issue
Block a user