diff --git a/libs/community/langchain_community/document_transformers/markdownify.py b/libs/community/langchain_community/document_transformers/markdownify.py index cb8108deeaa..91c580e591d 100644 --- a/libs/community/langchain_community/document_transformers/markdownify.py +++ b/libs/community/langchain_community/document_transformers/markdownify.py @@ -74,10 +74,3 @@ class MarkdownifyTransformer(BaseDocumentTransformer): ) return converted_documents - - async def atransform_documents( - self, - documents: Sequence[Document], - **kwargs: Any, - ) -> Sequence[Document]: - raise NotImplementedError diff --git a/libs/community/tests/unit_tests/document_transformers/test_markdownify.py b/libs/community/tests/unit_tests/document_transformers/test_markdownify.py index 32ae9dda691..1ce407289dd 100644 --- a/libs/community/tests/unit_tests/document_transformers/test_markdownify.py +++ b/libs/community/tests/unit_tests/document_transformers/test_markdownify.py @@ -50,7 +50,8 @@ def test_extract_html() -> None: documents = [Document(page_content=basic_html)] docs_transformed = markdownify.transform_documents(documents) assert docs_transformed[0].page_content == ( - "Simple Test Page # Test Header\n\n " + "Simple Test Page " + "# Test Header\n\n " "First paragraph.\n\n " "Second paragraph.\n\n " "[Example Link](https://example.com)" @@ -105,7 +106,8 @@ def test_convert_tags() -> None: assert docs_transformed[0].page_content == ( "Header " "**1st paragraph.** " - "2nd paragraph. Here is [link](http://example.com) " + "2nd paragraph. " + "Here is [link](http://example.com) " "Ignore at end" ) @@ -137,3 +139,139 @@ def test_strip_convert_conflict_error() -> None: ) documents = [Document(page_content=paragraphs_html)] markdownify.transform_documents(documents) + + +# Async variants: exact duplicates of the above functions, using atransform_documents() +@pytest.mark.requires("markdownify") +async def test_empty_html_async() -> None: + markdownify = MarkdownifyTransformer() + empty_html = "" + documents = [Document(page_content=empty_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == "" + + +@pytest.mark.requires("markdownify") +async def test_extract_paragraphs_async() -> None: + markdownify = MarkdownifyTransformer() + paragraphs_html = ( + "
First paragraph.
" + "Second paragraph.
First paragraph.
" + "Second paragraph.
" + ' Example Link' + "" + "" + ) + documents = [Document(page_content=basic_html)] + docs_transformed = await markdownify.atransform_documents(documents) + assert docs_transformed[0].page_content == ( + "Simple Test Page " + "# Test Header\n\n " + "First paragraph.\n\n " + "Second paragraph.\n\n " + "[Example Link](https://example.com)" + ) + + +@pytest.mark.requires("markdownify") +async def test_strip_tags_async() -> None: + markdownify = MarkdownifyTransformer(strip="strong") + paragraphs_html = ( + "" + "1st paragraph.
" + '2nd paragraph. Here is link
' + '1st paragraph.
" + '2nd paragraph. Here is link
' + '1st paragraph.
" + '2nd paragraph. Here is link
' + '