diff --git a/libs/langchain/langchain/document_transformers/html2text.py b/libs/langchain/langchain/document_transformers/html2text.py index 4d59e73ab2e..8329d14b95e 100644 --- a/libs/langchain/langchain/document_transformers/html2text.py +++ b/libs/langchain/langchain/document_transformers/html2text.py @@ -39,9 +39,14 @@ class Html2TextTransformer(BaseDocumentTransformer): h.ignore_links = self.ignore_links h.ignore_images = self.ignore_images + new_documents = [] + for d in documents: - d.page_content = h.handle(d.page_content) - return documents + new_document = Document( + page_content=h.handle(d.page_content), metadata={**d.metadata} + ) + new_documents.append(new_document) + return new_documents async def atransform_documents( self, diff --git a/libs/langchain/tests/unit_tests/document_transformers/test_html2text_transformer.py b/libs/langchain/tests/unit_tests/document_transformers/test_html2text_transformer.py new file mode 100644 index 00000000000..eba0b8b79e5 --- /dev/null +++ b/libs/langchain/tests/unit_tests/document_transformers/test_html2text_transformer.py @@ -0,0 +1,111 @@ +"""Unit tests for html2text document transformer.""" +import pytest +from langchain_core.documents import Document + +from langchain.document_transformers import Html2TextTransformer + + +@pytest.mark.requires("html2text") +def test_transform_empty_html() -> None: + html2text_transformer = Html2TextTransformer() + empty_html = "" + documents = [Document(page_content=empty_html)] + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "\n\n" + + +@pytest.mark.requires("html2text") +def test_extract_paragraphs() -> None: + html2text_transformer = Html2TextTransformer() + paragraphs_html = ( + "

<html><h1>Header</h1><p>First paragraph.</p>" + "<p>Second paragraph.</p><h1>Ignore at end</h1></html>
" + ) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == ( + "# Header\n\n" + "First paragraph.\n\n" + "Second paragraph.\n\n" + "# Ignore at end\n\n" + ) + + +@pytest.mark.requires("html2text") +def test_extract_html() -> None: + html2text_transformer = Html2TextTransformer() + paragraphs_html = ( + "<html>Begin of html tag" + "<h1>Header</h1>" + "<p>First paragraph.</p>" + "Middle of html tag" + "<p>Second paragraph.</p>" + "End of html tag" + "</html>" + ) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == ( + "Begin of html tag\n\n" + "# Header\n\n" + "First paragraph.\n\n" + "Middle of html tag\n\n" + "Second paragraph.\n\n" + "End of html tag\n\n" + ) + + +@pytest.mark.requires("html2text") +def test_remove_style() -> None: + html2text_transformer = Html2TextTransformer() + with_style_html = ( + "

<html><style>my_funky_style</style><p>First paragraph.</p></html>
" + ) + documents = [Document(page_content=with_style_html)] + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "First paragraph.\n\n" + + +@pytest.mark.requires("html2text") +def test_ignore_links() -> None: + html2text_transformer = Html2TextTransformer(ignore_links=False) + multiple_tags_html = ( + "

<h1>First heading.</h1>" + "<p>First paragraph with an <a href='http://example.com'>example</a></p>
" + ) + documents = [Document(page_content=multiple_tags_html)] + + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == ( + "# First heading.\n\n" + "First paragraph with an [example](http://example.com)\n\n" + ) + + html2text_transformer = Html2TextTransformer(ignore_links=True) + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == ( + "# First heading.\n\n" "First paragraph with an example\n\n" + ) + + +@pytest.mark.requires("html2text") +def test_ignore_images() -> None: + html2text_transformer = Html2TextTransformer(ignore_images=False) + multiple_tags_html = ( + "

<h1>First heading.</h1>" + "<p>First paragraph with an " + "<img src='example.jpg' alt='Example image'></p>
" + ) + documents = [Document(page_content=multiple_tags_html)] + + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == ( + "# First heading.\n\n" + "First paragraph with an ![Example image](example.jpg)\n\n" + ) + + html2text_transformer = Html2TextTransformer(ignore_images=True) + docs_transformed = html2text_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == ( + "# First heading.\n\n" "First paragraph with an\n\n" + )