mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-08 06:00:41 +00:00
Fix Html2TextTransformer for shallow copy (#14197)
<!-- Thank you for contributing to LangChain! Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes (if applicable), - **Dependencies:** any dependencies required for this change, - **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below), - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --> Hi, Html2TextTransformer has some unintended behavior. The current code **directly modifies the original documents that are passed as arguments to the function.** As a result, the caller's input documents are changed along with the function's return value. **To resolve this, the transformer now builds new Document objects (with a copy of each document's metadata) instead of mutating the inputs, and I added unit tests as well.** Reference link: [Shallow vs Deep Copying of Python Objects](https://realpython.com/copying-python-objects/) Thanks! ☺️
This commit is contained in:
parent
818252b1f8
commit
9938086df0
@ -39,9 +39,14 @@ class Html2TextTransformer(BaseDocumentTransformer):
|
||||
h.ignore_links = self.ignore_links
|
||||
h.ignore_images = self.ignore_images
|
||||
|
||||
new_documents = []
|
||||
|
||||
for d in documents:
|
||||
d.page_content = h.handle(d.page_content)
|
||||
return documents
|
||||
new_document = Document(
|
||||
page_content=h.handle(d.page_content), metadata={**d.metadata}
|
||||
)
|
||||
new_documents.append(new_document)
|
||||
return new_documents
|
||||
|
||||
async def atransform_documents(
|
||||
self,
|
||||
|
@ -0,0 +1,111 @@
|
||||
"""Unit tests for html2text document transformer."""
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain.document_transformers import Html2TextTransformer
|
||||
|
||||
|
||||
@pytest.mark.requires("html2text")
def test_transform_empty_html() -> None:
    """An HTML document with no content is converted to two newlines."""
    transformer = Html2TextTransformer()
    source_docs = [Document(page_content="<html></html>")]

    converted = transformer.transform_documents(source_docs)

    first_text = converted[0].page_content
    assert first_text == "\n\n"
|
||||
|
||||
|
||||
@pytest.mark.requires("html2text")
def test_extract_paragraphs() -> None:
    """Headings and paragraphs come out as Markdown blocks in source order."""
    transformer = Html2TextTransformer()
    source_html = (
        "<html><h1>Header</h1><p>First paragraph.</p>"
        "<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
    )
    # Every block in the expected output ends with a blank line.
    expected_text = "".join(
        [
            "# Header\n\n",
            "First paragraph.\n\n",
            "Second paragraph.\n\n",
            "# Ignore at end\n\n",
        ]
    )

    converted = transformer.transform_documents([Document(page_content=source_html)])

    assert converted[0].page_content == expected_text
|
||||
|
||||
|
||||
@pytest.mark.requires("html2text")
def test_extract_html() -> None:
    """Bare text between tags is kept as its own block next to tagged content."""
    transformer = Html2TextTransformer()
    source_html = "".join(
        [
            "<html>Begin of html tag",
            "<h1>Header</h1>",
            "<p>First paragraph.</p>",
            "Middle of html tag",
            "<p>Second paragraph.</p>",
            "End of html tag",
            "</html>",
        ]
    )
    expected_text = "".join(
        [
            "Begin of html tag\n\n",
            "# Header\n\n",
            "First paragraph.\n\n",
            "Middle of html tag\n\n",
            "Second paragraph.\n\n",
            "End of html tag\n\n",
        ]
    )

    converted = transformer.transform_documents([Document(page_content=source_html)])

    assert converted[0].page_content == expected_text
|
||||
|
||||
|
||||
@pytest.mark.requires("html2text")
def test_remove_style() -> None:
    """The contents of a <style> element do not appear in the converted text."""
    transformer = Html2TextTransformer()
    styled_doc = Document(
        page_content=(
            "<html><style>my_funky_style</style><p>First paragraph.</p></html>"
        )
    )

    converted = transformer.transform_documents([styled_doc])

    # Only the paragraph survives; the style payload is dropped.
    assert converted[0].page_content == "First paragraph.\n\n"
|
||||
|
||||
|
||||
@pytest.mark.requires("html2text")
def test_ignore_links() -> None:
    """Check the ``ignore_links`` option and that input documents are not mutated.

    With ``ignore_links=False`` anchors are rendered as Markdown links; with
    ``ignore_links=True`` only the anchor text is kept. The same ``documents``
    list is transformed twice, so the second run is only meaningful if the
    first run left the inputs untouched; that is asserted explicitly.
    """
    html2text_transformer = Html2TextTransformer(ignore_links=False)
    multiple_tags_html = (
        "<h1>First heading.</h1>"
        "<p>First paragraph with an <a href='http://example.com'>example</a></p>"
    )
    documents = [Document(page_content=multiple_tags_html)]

    docs_transformed = html2text_transformer.transform_documents(documents)
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an [example](http://example.com)\n\n"
    )
    # The transformer must return new documents rather than rewrite its inputs.
    assert documents[0].page_content == multiple_tags_html

    html2text_transformer = Html2TextTransformer(ignore_links=True)
    docs_transformed = html2text_transformer.transform_documents(documents)
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an example\n\n"
    )
    assert documents[0].page_content == multiple_tags_html
|
||||
|
||||
|
||||
@pytest.mark.requires("html2text")
def test_ignore_images() -> None:
    """Check the ``ignore_images`` option and that input documents are not mutated.

    With ``ignore_images=False`` the ``<img>`` tag is expected to be rendered
    as a Markdown image built from its ``alt`` and ``src`` attributes; with
    ``ignore_images=True`` the image is dropped. The same ``documents`` list is
    transformed twice, so the inputs are asserted to be unchanged after each
    run.
    """
    html2text_transformer = Html2TextTransformer(ignore_images=False)
    multiple_tags_html = (
        "<h1>First heading.</h1>"
        "<p>First paragraph with an "
        "<img src='example.jpg' alt='Example image' width='500' height='600'></p>"
    )
    documents = [Document(page_content=multiple_tags_html)]

    docs_transformed = html2text_transformer.transform_documents(documents)
    # The image must be present here; otherwise this case would not differ
    # from the ignore_images=True case below.
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an ![Example image](example.jpg)\n\n"
    )
    # The transformer must return new documents rather than rewrite its inputs.
    assert documents[0].page_content == multiple_tags_html

    html2text_transformer = Html2TextTransformer(ignore_images=True)
    docs_transformed = html2text_transformer.transform_documents(documents)
    assert docs_transformed[0].page_content == (
        "# First heading.\n\n"
        "First paragraph with an\n\n"
    )
    assert documents[0].page_content == multiple_tags_html
|
Loading…
Reference in New Issue
Block a user