langchain/tests/integration_tests/document_loaders/test_bshtml.py
Daniel Chalef b157e0c1c3
Add HTML document_loader that includes page title metadata (#1720)
This `BSHTMLLoader` document_loader loads an HTML document, extracts
text and adds the page title to the returned Document's metadata. The
loader uses the already installed bs4 package to extract both text
content and the page title.

Included in this PR is an example HTML file and an integration test that
tests against this file.

---------

Co-authored-by: Daniel Chalef <daniel.chalef@private.org>
2023-03-16 21:47:17 -07:00

18 lines
460 B
Python

from pathlib import Path
from langchain.document_loaders.html_bs import BSHTMLLoader
def test_bs_html_loader() -> None:
    """Check that BSHTMLLoader yields one document with title and source metadata.

    The fixture is ``examples/example.html``, resolved relative to the
    directory two levels above this test module.
    """
    # parents[1] is the same directory as .parent.parent of this file.
    html_path = Path(__file__).parents[1] / "examples/example.html"
    source = str(html_path)

    documents = BSHTMLLoader(source).load()

    # The loader is expected to return exactly one document for the file.
    assert len(documents) == 1
    (document,) = documents

    info = document.metadata
    assert info["title"] == "Chew dad's slippers"
    assert info["source"] == source