Add get_text_separator parameter to BSHTMLLoader (#3551)

By default, get_text doesn't separate the content of different HTML tags.
Adding an option to specify a separator helps with document splitting.
This commit is contained in:
Maciej Bryński 2023-04-27 01:10:16 +02:00 committed by GitHub
parent 568c4f0d81
commit aa345a4bb7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 2 deletions

View File

@ -17,6 +17,7 @@ class BSHTMLLoader(BaseLoader):
file_path: str, file_path: str,
open_encoding: Union[str, None] = None, open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None, bs_kwargs: Union[dict, None] = None,
get_text_separator: str = "",
) -> None: ) -> None:
"""Initialise with path, and optionally, file encoding to use, and any kwargs """Initialise with path, and optionally, file encoding to use, and any kwargs
to pass to the BeautifulSoup object.""" to pass to the BeautifulSoup object."""
@ -33,6 +34,7 @@ class BSHTMLLoader(BaseLoader):
if bs_kwargs is None: if bs_kwargs is None:
bs_kwargs = {"features": "lxml"} bs_kwargs = {"features": "lxml"}
self.bs_kwargs = bs_kwargs self.bs_kwargs = bs_kwargs
self.get_text_separator = get_text_separator
def load(self) -> List[Document]: def load(self) -> List[Document]:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -41,7 +43,7 @@ class BSHTMLLoader(BaseLoader):
with open(self.file_path, "r", encoding=self.open_encoding) as f: with open(self.file_path, "r", encoding=self.open_encoding) as f:
soup = BeautifulSoup(f, **self.bs_kwargs) soup = BeautifulSoup(f, **self.bs_kwargs)
text = soup.get_text() text = soup.get_text(self.get_text_separator)
if soup.title: if soup.title:
title = str(soup.title.string) title = str(soup.title.string)

View File

@ -9,15 +9,17 @@ from langchain.document_loaders.html_bs import BSHTMLLoader
def test_bs_html_loader() -> None: def test_bs_html_loader() -> None:
"""Test unstructured loader.""" """Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/example.html" file_path = Path(__file__).parent.parent / "examples/example.html"
loader = BSHTMLLoader(str(file_path)) loader = BSHTMLLoader(str(file_path), get_text_separator="|")
docs = loader.load() docs = loader.load()
assert len(docs) == 1 assert len(docs) == 1
metadata = docs[0].metadata metadata = docs[0].metadata
content = docs[0].page_content
assert metadata["title"] == "Chew dad's slippers" assert metadata["title"] == "Chew dad's slippers"
assert metadata["source"] == str(file_path) assert metadata["source"] == str(file_path)
assert content[:2] == "\n|"
@pytest.mark.skipif( @pytest.mark.skipif(