Added an MHTML document loader (#6311)

MHTML is a very interesting format since it's used both for emails and for archived webpages. Some scraping projects want to store pages on disk to process them later, and MHTML is perfect for that use case. This is heavily inspired by the BeautifulSoup HTML loader, but it extracts the HTML part from the MHTML file. --------- Co-authored-by: rlm <pexpresss31@gmail.com>
2025-09-15 22:44:36 +00:00 · 2023-06-25 22:12:08 +02:00
parent 05eec99269
commit 87802c86d9
5 changed files with 275 additions and 0 deletions
--- a/tests/unit_tests/document_loaders/test_mhtml.py
+++ b/tests/unit_tests/document_loaders/test_mhtml.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.mhtml import MHTMLLoader
+
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
+
+
+@pytest.mark.requires("bs4", "lxml")
+def test_mhtml_loader() -> None:
+    """Load example.mht and check the document count, metadata, and content."""
+    file_path = EXAMPLES / "example.mht"
+    loader = MHTMLLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    metadata = docs[0].metadata
+    content = docs[0].page_content
+
+    assert metadata["title"] == "LangChain"
+    assert metadata["source"] == str(file_path)
+    assert "LANG CHAIN 🦜️🔗Official Home Page" in content