Fix BeautifulSoupTransformer: no more duplicates and correct order of tags + tests (#12596)

This commit is contained in:
Peter Vandenabeele 2023-11-11 09:56:37 +01:00 committed by GitHub
parent 937d7c41f3
commit 7f1964b264
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 187 additions and 39 deletions

View File

@ -1,4 +1,4 @@
from typing import Any, List, Sequence from typing import Any, Iterator, List, Sequence, cast
from langchain.schema import BaseDocumentTransformer, Document from langchain.schema import BaseDocumentTransformer, Document
@ -98,18 +98,15 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser") soup = BeautifulSoup(html_content, "html.parser")
text_parts = [] text_parts: List[str] = []
for tag in tags: for element in soup.find_all():
elements = soup.find_all(tag) if element.name in tags:
for element in elements: # Extract all navigable strings recursively from this element.
if tag == "a": text_parts += get_navigable_strings(element)
href = element.get("href")
if href: # To avoid duplicate text, remove all descendants from the soup.
text_parts.append(f"{element.get_text()} ({href})") element.decompose()
else:
text_parts.append(element.get_text())
else:
text_parts.append(element.get_text())
return " ".join(text_parts) return " ".join(text_parts)
@staticmethod @staticmethod
@ -126,13 +123,7 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
lines = content.split("\n") lines = content.split("\n")
stripped_lines = [line.strip() for line in lines] stripped_lines = [line.strip() for line in lines]
non_empty_lines = [line for line in stripped_lines if line] non_empty_lines = [line for line in stripped_lines if line]
seen = set() cleaned_content = " ".join(non_empty_lines)
deduped_lines = []
for line in non_empty_lines:
if line not in seen:
seen.add(line)
deduped_lines.append(line)
cleaned_content = " ".join(deduped_lines)
return cleaned_content return cleaned_content
async def atransform_documents( async def atransform_documents(
@ -141,3 +132,16 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
**kwargs: Any, **kwargs: Any,
) -> Sequence[Document]: ) -> Sequence[Document]:
raise NotImplementedError raise NotImplementedError
def get_navigable_strings(element: Any) -> Iterator[str]:
    """Yield the stripped text fragments found under ``element``, in document order.

    Child tags are walked recursively. Text that sits directly inside an
    ``<a>`` tag with a non-empty ``href`` is yielded as ``"text (href)"``.
    Text nodes that are empty after stripping are skipped.

    Args:
        element: The node to walk; it is treated as a ``bs4`` ``Tag``.

    Yields:
        Non-empty, whitespace-stripped text fragments.
    """
    from bs4 import NavigableString, Tag

    for child in cast(Tag, element).children:
        if isinstance(child, Tag):
            yield from get_navigable_strings(child)
        elif isinstance(child, NavigableString):
            # NOTE(review): bs4 comments are NavigableString subclasses, so
            # their text is presumably yielded here too - confirm if intended.
            text = child.strip()
            if not text:
                # Yielding "" here would make a later " ".join(...) emit
                # doubled spaces (or a bare " (href)" inside an anchor).
                continue
            if (element.name == "a") and (href := element.get("href")):
                yield f"{text} ({href})"
            else:
                yield text

View File

@ -15,14 +15,54 @@ def test_transform_empty_html() -> None:
@pytest.mark.requires("bs4") @pytest.mark.requires("bs4")
def test_extract_paragraph() -> None: def test_extract_paragraphs() -> None:
bs_transformer = BeautifulSoupTransformer() bs_transformer = BeautifulSoupTransformer()
paragraphs_html = "<html><p>First paragraph.</p><p>Second paragraph.</p><html>" paragraphs_html = (
"<html><h1>Header</h1><p>First paragraph.</p>"
"<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
)
documents = [Document(page_content=paragraphs_html)] documents = [Document(page_content=paragraphs_html)]
docs_transformed = bs_transformer.transform_documents(documents) docs_transformed = bs_transformer.transform_documents(documents)
assert docs_transformed[0].page_content == "First paragraph. Second paragraph." assert docs_transformed[0].page_content == "First paragraph. Second paragraph."
@pytest.mark.requires("bs4")
def test_strip_whitespace() -> None:
    """The trailing space inside the second <p> must not appear in the output."""
    html_with_padding = (
        "<html><h1>Header</h1><p><span>First</span> paragraph.</p>"
        "<p>Second paragraph. </p></html>"
    )
    transformer = BeautifulSoupTransformer()
    transformed = transformer.transform_documents(
        [Document(page_content=html_with_padding)]
    )
    assert transformed[0].page_content == "First paragraph. Second paragraph."
@pytest.mark.requires("bs4")
def test_extract_html() -> None:
    """Extracting "html" and "p" keeps loose text and nested tags in page order."""
    source_html = (
        "<html>Begin of html tag"
        "<h1>Header</h1>"
        "<p>First paragraph.</p>"
        "Middle of html tag"
        "<p>Second paragraph.</p>"
        "End of html tag"
        "</html>"
    )
    expected_text = (
        "Begin of html tag "
        "Header First paragraph. "
        "Middle of html tag "
        "Second paragraph. "
        "End of html tag"
    )
    transformer = BeautifulSoupTransformer()
    transformed = transformer.transform_documents(
        [Document(page_content=source_html)], tags_to_extract=["html", "p"]
    )
    assert transformed[0].page_content == expected_text
@pytest.mark.requires("bs4") @pytest.mark.requires("bs4")
def test_remove_style() -> None: def test_remove_style() -> None:
bs_transformer = BeautifulSoupTransformer() bs_transformer = BeautifulSoupTransformer()
@ -30,21 +70,97 @@ def test_remove_style() -> None:
"<html><style>my_funky_style</style><p>First paragraph.</p></html>" "<html><style>my_funky_style</style><p>First paragraph.</p></html>"
) )
documents = [Document(page_content=with_style_html)] documents = [Document(page_content=with_style_html)]
docs_transformed = bs_transformer.transform_documents(documents) docs_transformed = bs_transformer.transform_documents(
documents, tags_to_extract=["html"]
)
assert docs_transformed[0].page_content == "First paragraph." assert docs_transformed[0].page_content == "First paragraph."
@pytest.mark.requires("bs4")
def test_remove_nested_tags() -> None:
    """Unwanted tags take priority over tags to extract.

    A tag to extract that sits _inside_ an unwanted tag is removed with it
    (e.g. a <p> inside a <table> when <table> is unwanted). An unwanted tag
    that sits _inside_ a tag to extract is removed, while the rest of the
    extracted tag stays.
    """
    nested_html = (
        "<html><style>my_funky_style</style>"
        "<table><td><p>First paragraph, inside a table.</p></td></table>"
        "<p>Second paragraph<table><td> with a cell </td></table>.</p>"
        "</html>"
    )
    transformer = BeautifulSoupTransformer()
    transformed = transformer.transform_documents(
        [Document(page_content=nested_html)],
        unwanted_tags=["script", "style", "table"],
    )
    assert transformed[0].page_content == "Second paragraph."
@pytest.mark.requires("bs4") @pytest.mark.requires("bs4")
def test_remove_unwanted_lines() -> None: def test_remove_unwanted_lines() -> None:
bs_transformer = BeautifulSoupTransformer() bs_transformer = BeautifulSoupTransformer()
with_lines_html = "<html>\n\n<p>First \n\n paragraph.</p>\n</html>\n\n" with_lines_html = "<html>\n\n<p>First \n\n paragraph.</p>\n</html>\n\n"
documents = [Document(page_content=with_lines_html)] documents = [Document(page_content=with_lines_html)]
docs_transformed = bs_transformer.transform_documents(documents) docs_transformed = bs_transformer.transform_documents(documents, remove_lines=True)
assert docs_transformed[0].page_content == "First paragraph." assert docs_transformed[0].page_content == "First paragraph."
# FIXME: This test proves that the order of the tags is NOT preserved. @pytest.mark.requires("bs4")
# Documenting the current behavior here, but this should be fixed. def test_do_not_remove_repeated_content() -> None:
bs_transformer = BeautifulSoupTransformer()
with_lines_html = "<p>1\n1\n1\n1</p>"
documents = [Document(page_content=with_lines_html)]
docs_transformed = bs_transformer.transform_documents(documents)
assert docs_transformed[0].page_content == "1 1 1 1"
@pytest.mark.requires("bs4")
def test_extract_nested_tags() -> None:
    """Text of nested tags is extracted once each, in page order."""
    source_html = (
        "<html><div class='some_style'>"
        "<p><span>First</span> paragraph.</p>"
        "<p>Second <div>paragraph.</div></p>"
        "<p><p>Third paragraph.</p></p>"
        "</div></html>"
    )
    expected_text = "First paragraph. Second paragraph. Third paragraph."
    transformer = BeautifulSoupTransformer()
    transformed = transformer.transform_documents(
        [Document(page_content=source_html)]
    )
    assert transformed[0].page_content == expected_text
@pytest.mark.requires("bs4")
def test_extract_more_nested_tags() -> None:
    """A list nested in a paragraph is extracted in place, between its siblings."""
    source_html = (
        "<html><div class='some_style'>"
        "<p><span>First</span> paragraph.</p>"
        "<p>Second paragraph.</p>"
        "<p>Third paragraph with a list:"
        "<ul>"
        "<li>First list item.</li>"
        "<li>Second list item.</li>"
        "</ul>"
        "</p>"
        "<p>Fourth paragraph.</p>"
        "</div></html>"
    )
    expected_text = (
        "First paragraph. Second paragraph. "
        "Third paragraph with a list: "
        "First list item. Second list item. "
        "Fourth paragraph."
    )
    transformer = BeautifulSoupTransformer()
    transformed = transformer.transform_documents(
        [Document(page_content=source_html)]
    )
    assert transformed[0].page_content == expected_text
@pytest.mark.requires("bs4") @pytest.mark.requires("bs4")
def test_transform_keeps_order() -> None: def test_transform_keeps_order() -> None:
bs_transformer = BeautifulSoupTransformer() bs_transformer = BeautifulSoupTransformer()
@ -56,33 +172,61 @@ def test_transform_keeps_order() -> None:
) )
documents = [Document(page_content=multiple_tags_html)] documents = [Document(page_content=multiple_tags_html)]
# order of "p" and "h1" in the "tags_to_extract" parameter is important here: # Order of "p" and "h1" in the "tags_to_extract" parameter is NOT important here:
# it will first extract all "p" tags, then all "h1" tags, breaking the order # it will keep the order of the original HTML.
# of the HTML.
docs_transformed_p_then_h1 = bs_transformer.transform_documents( docs_transformed_p_then_h1 = bs_transformer.transform_documents(
documents, tags_to_extract=["p", "h1"] documents, tags_to_extract=["p", "h1"]
) )
assert ( assert (
docs_transformed_p_then_h1[0].page_content docs_transformed_p_then_h1[0].page_content
== "First paragraph. Second paragraph. First heading. Second heading." == "First heading. First paragraph. Second heading. Second paragraph."
) )
# Recreating `documents` because transform_documents() modifies it. # Recreating `documents` because transform_documents() modifies it.
documents = [Document(page_content=multiple_tags_html)] documents = [Document(page_content=multiple_tags_html)]
# changing the order of "h1" and "p" in "tags_to_extract" flips the order of # changing the order of "h1" and "p" in "tags_to_extract" does NOT flip the order
# the extracted tags: # of the extracted tags:
docs_transformed_h1_then_p = bs_transformer.transform_documents( docs_transformed_h1_then_p = bs_transformer.transform_documents(
documents, tags_to_extract=["h1", "p"] documents, tags_to_extract=["h1", "p"]
) )
assert ( assert (
docs_transformed_h1_then_p[0].page_content docs_transformed_h1_then_p[0].page_content
== "First heading. Second heading. First paragraph. Second paragraph." == "First heading. First paragraph. Second heading. Second paragraph."
) )
# The correct result should be:
# @pytest.mark.requires("bs4")
# "First heading. First paragraph. Second heading. Second paragraph." def test_extracts_href() -> None:
# bs_transformer = BeautifulSoupTransformer()
# That is the order in the original HTML, that should be preserved to preserve multiple_tags_html = (
# the semantic "meaning" of the text. "<h1>First heading.</h1>"
"<p>First paragraph with an <a href='http://example.com'>example</a></p>"
"<p>Second paragraph with an <a>a tag without href</a></p>"
)
documents = [Document(page_content=multiple_tags_html)]
docs_transformed = bs_transformer.transform_documents(
documents, tags_to_extract=["p"]
)
assert docs_transformed[0].page_content == (
"First paragraph with an example (http://example.com) "
"Second paragraph with an a tag without href"
)
@pytest.mark.requires("bs4")
def test_invalid_html() -> None:
    """Malformed HTML is transformed without raising.

    An unclosed <h1> still yields its text; an unterminated <html ...> tag
    yields an empty string.
    """
    unclosed_heading_html = "<html><h1>First heading."
    unterminated_tag_html = "<html 1234 xyz"
    transformer = BeautifulSoupTransformer()
    transformed = transformer.transform_documents(
        [
            Document(page_content=unclosed_heading_html),
            Document(page_content=unterminated_tag_html),
        ],
        tags_to_extract=["h1"],
    )
    assert transformed[0].page_content == "First heading."
    assert transformed[1].page_content == ""