mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-23 16:08:10 +00:00
This `BSHTMLLoader` document loader loads an HTML document, extracts its text, and adds the page title to the returned Document's metadata. The loader uses the already-installed `bs4` package to extract both the text content and the page title. Included in this PR are an example HTML file and an integration test that runs against this file. --------- Co-authored-by: Daniel Chalef <daniel.chalef@private.org>
18 lines
460 B
Python
18 lines
460 B
Python
from pathlib import Path
|
|
|
|
from langchain.document_loaders.html_bs import BSHTMLLoader
|
|
|
|
|
|
def test_bs_html_loader() -> None:
    """Check that BSHTMLLoader loads the example HTML file as one document.

    The example file lives in the sibling ``examples`` directory. The single
    loaded document must carry the expected ``title`` in its metadata, and its
    ``source`` must be the exact path string handed to the loader.
    """
    html_path = Path(__file__).parent.parent / "examples/example.html"
    source = str(html_path)

    documents = BSHTMLLoader(source).load()
    assert len(documents) == 1

    doc_metadata = documents[0].metadata
    assert doc_metadata["title"] == "Chew dad's slippers"
    assert doc_metadata["source"] == source
|