diff --git a/libs/community/langchain_community/document_transformers/markdownify.py b/libs/community/langchain_community/document_transformers/markdownify.py index cb8108deeaa..91c580e591d 100644 --- a/libs/community/langchain_community/document_transformers/markdownify.py +++ b/libs/community/langchain_community/document_transformers/markdownify.py @@ -74,10 +74,3 @@ class MarkdownifyTransformer(BaseDocumentTransformer): ) return converted_documents - - async def atransform_documents( - self, - documents: Sequence[Document], - **kwargs: Any, - ) -> Sequence[Document]: - raise NotImplementedError diff --git a/libs/community/tests/unit_tests/document_transformers/test_markdownify.py b/libs/community/tests/unit_tests/document_transformers/test_markdownify.py index 32ae9dda691..1ce407289dd 100644 --- a/libs/community/tests/unit_tests/document_transformers/test_markdownify.py +++ b/libs/community/tests/unit_tests/document_transformers/test_markdownify.py @@ -50,7 +50,8 @@ def test_extract_html() -> None: documents = [Document(page_content=basic_html)] docs_transformed = markdownify.transform_documents(documents) assert docs_transformed[0].page_content == ( - "Simple Test Page # Test Header\n\n " + "Simple Test Page " + "# Test Header\n\n " "First paragraph.\n\n " "Second paragraph.\n\n " "[Example Link](https://example.com)" @@ -105,7 +106,8 @@ def test_convert_tags() -> None: assert docs_transformed[0].page_content == ( "Header " "**1st paragraph.** " - "2nd paragraph. Here is [link](http://example.com) " + "2nd paragraph. " + "Here is [link](http://example.com) " "Ignore at end" ) @@ -137,3 +139,139 @@ def test_strip_convert_conflict_error() -> None: ) documents = [Document(page_content=paragraphs_html)] markdownify.transform_documents(documents) + + +# Async variants: exact duplicates of the above functions, using atransform_documents() +@pytest.mark.requires("markdownify") +async def test_empty_html_async() -> None: + markdownify = MarkdownifyTransformer() + empty_html = "" + documents = [Document(page_content=empty_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == "" + + +@pytest.mark.requires("markdownify") +async def test_extract_paragraphs_async() -> None: + markdownify = MarkdownifyTransformer() + paragraphs_html = ( + "

Header

First paragraph.

" + "

Second paragraph.

Ignore at end

" + ) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == ( + "# Header\n\n" "First paragraph.\n\n" "Second paragraph.\n\n" "# Ignore at end" + ) + + +@pytest.mark.requires("markdownify") +async def test_extract_html_async() -> None: + markdownify = MarkdownifyTransformer(skip="title") + basic_html = ( + "" + '' + "" + ' ' + " Simple Test Page" + "" + "" + "

Test Header

" + "

First paragraph.

" + "

Second paragraph.

" + ' Example Link' + "" + "" + ) + documents = [Document(page_content=basic_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == ( + "Simple Test Page " + "# Test Header\n\n " + "First paragraph.\n\n " + "Second paragraph.\n\n " + "[Example Link](https://example.com)" + ) + + +@pytest.mark.requires("markdownify") +async def test_strip_tags_async() -> None: + markdownify = MarkdownifyTransformer(strip="strong") + paragraphs_html = ( + "" + "

Header

" + "

1st paragraph.

" + '

2nd paragraph. Here is link

' + ' Sample Image' + "

Ignore at end

" + ) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == ( + "# Header\n\n " + "1st paragraph.\n\n " + "2nd paragraph. Here is [link](http://example.com)\n\n " + "![Sample Image](image.jpg)" + "# Ignore at end" + ) + + markdownify = MarkdownifyTransformer(strip=["strong", "a", "img"]) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == ( + "# Header\n\n " + "1st paragraph.\n\n " + "2nd paragraph. Here is link\n\n " + "# Ignore at end" + ) + + +@pytest.mark.requires("markdownify") +async def test_convert_tags_async() -> None: + markdownify = MarkdownifyTransformer(convert=["strong", "a"]) + paragraphs_html = ( + "" + "

Header

" + "

1st paragraph.

" + '

2nd paragraph. Here is link

' + ' Sample Image' + "

Ignore at end

" + ) + documents = [Document(page_content=paragraphs_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == ( + "Header " + "**1st paragraph.** " + "2nd paragraph. " + "Here is [link](http://example.com) " + "Ignore at end" + ) + + markdownify = MarkdownifyTransformer(convert="p") + documents = [Document(page_content=paragraphs_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == ( + "Header " + "1st paragraph.\n\n " + "2nd paragraph. Here is link\n\n " + "Ignore at end" + ) + + +@pytest.mark.requires("markdownify") +async def test_strip_convert_conflict_error_async() -> None: + with pytest.raises( + ValueError, + match="You may specify either tags to strip or tags to convert, but not both.", + ): + markdownify = MarkdownifyTransformer(strip="h1", convert=["strong", "a"]) + paragraphs_html = ( + "" + "

Header

" + "

1st paragraph.

" + '

2nd paragraph. Here is link

' + ' Sample Image' + "

Ignore at end

" + ) + documents = [Document(page_content=paragraphs_html)] + await markdownify.atransform_documents(documents)