mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 14:49:29 +00:00
community: fallback on core async atransform_documents method for MarkdownifyTransformer
(#27866)
# Description Implements the `atransform_documents` method for `MarkdownifyTransformer` using the `asyncio` built-in library for concurrency. Note that this is mainly for API completeness when working with async frameworks rather than for performance, since the `markdownify` function is not I/O bound because it works with `Document` objects already in memory. # Issue Fixes #27865 # Dependencies No new dependencies added, but [`markdownify`](https://github.com/matthewwithanm/python-markdownify) is required since this PR updates the `markdownify` integration. # Tests and docs - Tests added - I did not modify the docstrings since they already described the basic functionality, and [the API docs also already included a description](https://python.langchain.com/api_reference/community/document_transformers/langchain_community.document_transformers.markdownify.MarkdownifyTransformer.html#langchain_community.document_transformers.markdownify.MarkdownifyTransformer.atransform_documents). If it would be helpful, I would be happy to update the docstrings and/or the API docs. # Lint and test - [x] format - [x] lint - [x] test I ran formatting with `make format`, linting with `make lint`, and confirmed that tests pass using `make test`. Note that some unit tests pass in CI but may fail when running `make test` locally. Those unit tests are: - `test_extract_html` (and `test_extract_html_async`) - `test_strip_tags` (and `test_strip_tags_async`) - `test_convert_tags` (and `test_convert_tags_async`) The reason for the difference is that the transformed output contains trailing spaces when the tests are run in the CI checks, and no trailing spaces when run with `make test`. I ensured that the tests pass in CI, but they may fail with `make test` due to the addition of trailing spaces. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
af2e0a7ede
commit
12111cb922
@ -74,10 +74,3 @@ class MarkdownifyTransformer(BaseDocumentTransformer):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return converted_documents
|
return converted_documents
|
||||||
|
|
||||||
async def atransform_documents(
|
|
||||||
self,
|
|
||||||
documents: Sequence[Document],
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> Sequence[Document]:
|
|
||||||
raise NotImplementedError
|
|
||||||
|
@ -50,7 +50,8 @@ def test_extract_html() -> None:
|
|||||||
documents = [Document(page_content=basic_html)]
|
documents = [Document(page_content=basic_html)]
|
||||||
docs_transformed = markdownify.transform_documents(documents)
|
docs_transformed = markdownify.transform_documents(documents)
|
||||||
assert docs_transformed[0].page_content == (
|
assert docs_transformed[0].page_content == (
|
||||||
"Simple Test Page # Test Header\n\n "
|
"Simple Test Page "
|
||||||
|
"# Test Header\n\n "
|
||||||
"First paragraph.\n\n "
|
"First paragraph.\n\n "
|
||||||
"Second paragraph.\n\n "
|
"Second paragraph.\n\n "
|
||||||
"[Example Link](https://example.com)"
|
"[Example Link](https://example.com)"
|
||||||
@ -105,7 +106,8 @@ def test_convert_tags() -> None:
|
|||||||
assert docs_transformed[0].page_content == (
|
assert docs_transformed[0].page_content == (
|
||||||
"Header "
|
"Header "
|
||||||
"**1st paragraph.** "
|
"**1st paragraph.** "
|
||||||
"2nd paragraph. Here is [link](http://example.com) "
|
"2nd paragraph. "
|
||||||
|
"Here is [link](http://example.com) "
|
||||||
"Ignore at end"
|
"Ignore at end"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -137,3 +139,139 @@ def test_strip_convert_conflict_error() -> None:
|
|||||||
)
|
)
|
||||||
documents = [Document(page_content=paragraphs_html)]
|
documents = [Document(page_content=paragraphs_html)]
|
||||||
markdownify.transform_documents(documents)
|
markdownify.transform_documents(documents)
|
||||||
|
|
||||||
|
|
||||||
|
# Async variants: exact duplicates of the above functions, using atransform_documents()
|
||||||
|
@pytest.mark.requires("markdownify")
|
||||||
|
async def test_empty_html_async() -> None:
|
||||||
|
markdownify = MarkdownifyTransformer()
|
||||||
|
empty_html = "<html></html>"
|
||||||
|
documents = [Document(page_content=empty_html)]
|
||||||
|
docs_transformed = await markdownify.atransform_documents(documents)
|
||||||
|
assert docs_transformed[0].page_content == ""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("markdownify")
|
||||||
|
async def test_extract_paragraphs_async() -> None:
|
||||||
|
markdownify = MarkdownifyTransformer()
|
||||||
|
paragraphs_html = (
|
||||||
|
"<html><h1>Header</h1><p>First paragraph.</p>"
|
||||||
|
"<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
|
||||||
|
)
|
||||||
|
documents = [Document(page_content=paragraphs_html)]
|
||||||
|
docs_transformed = await markdownify.atransform_documents(documents)
|
||||||
|
assert docs_transformed[0].page_content == (
|
||||||
|
"# Header\n\n" "First paragraph.\n\n" "Second paragraph.\n\n" "# Ignore at end"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("markdownify")
|
||||||
|
async def test_extract_html_async() -> None:
|
||||||
|
markdownify = MarkdownifyTransformer(skip="title")
|
||||||
|
basic_html = (
|
||||||
|
"<!DOCTYPE html>"
|
||||||
|
'<html lang="en">'
|
||||||
|
"<head>"
|
||||||
|
' <meta charset="UTF-8">'
|
||||||
|
" <title>Simple Test Page</title>"
|
||||||
|
"</head>"
|
||||||
|
"<body>"
|
||||||
|
" <h1>Test Header</h1>"
|
||||||
|
" <p>First paragraph.</p>"
|
||||||
|
" <p>Second paragraph.</p>"
|
||||||
|
' <a href="https://example.com">Example Link</a>'
|
||||||
|
"</body>"
|
||||||
|
"</html>"
|
||||||
|
)
|
||||||
|
documents = [Document(page_content=basic_html)]
|
||||||
|
docs_transformed = await markdownify.atransform_documents(documents)
|
||||||
|
assert docs_transformed[0].page_content == (
|
||||||
|
"Simple Test Page "
|
||||||
|
"# Test Header\n\n "
|
||||||
|
"First paragraph.\n\n "
|
||||||
|
"Second paragraph.\n\n "
|
||||||
|
"[Example Link](https://example.com)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("markdownify")
|
||||||
|
async def test_strip_tags_async() -> None:
|
||||||
|
markdownify = MarkdownifyTransformer(strip="strong")
|
||||||
|
paragraphs_html = (
|
||||||
|
"<html>"
|
||||||
|
"<h1>Header</h1>"
|
||||||
|
" <p><strong>1st paragraph.</strong></p>"
|
||||||
|
' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
|
||||||
|
' <img src="image.jpg" alt="Sample Image">'
|
||||||
|
"<h1>Ignore at end</h1></html>"
|
||||||
|
)
|
||||||
|
documents = [Document(page_content=paragraphs_html)]
|
||||||
|
docs_transformed = await markdownify.atransform_documents(documents)
|
||||||
|
assert docs_transformed[0].page_content == (
|
||||||
|
"# Header\n\n "
|
||||||
|
"1st paragraph.\n\n "
|
||||||
|
"2nd paragraph. Here is [link](http://example.com)\n\n "
|
||||||
|
""
|
||||||
|
"# Ignore at end"
|
||||||
|
)
|
||||||
|
|
||||||
|
markdownify = MarkdownifyTransformer(strip=["strong", "a", "img"])
|
||||||
|
documents = [Document(page_content=paragraphs_html)]
|
||||||
|
docs_transformed = await markdownify.atransform_documents(documents)
|
||||||
|
assert docs_transformed[0].page_content == (
|
||||||
|
"# Header\n\n "
|
||||||
|
"1st paragraph.\n\n "
|
||||||
|
"2nd paragraph. Here is link\n\n "
|
||||||
|
"# Ignore at end"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("markdownify")
|
||||||
|
async def test_convert_tags_async() -> None:
|
||||||
|
markdownify = MarkdownifyTransformer(convert=["strong", "a"])
|
||||||
|
paragraphs_html = (
|
||||||
|
"<html>"
|
||||||
|
"<h1>Header</h1>"
|
||||||
|
" <p><strong>1st paragraph.</strong></p>"
|
||||||
|
' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
|
||||||
|
' <img src="image.jpg" alt="Sample Image">'
|
||||||
|
"<h1>Ignore at end</h1></html>"
|
||||||
|
)
|
||||||
|
documents = [Document(page_content=paragraphs_html)]
|
||||||
|
docs_transformed = await markdownify.atransform_documents(documents)
|
||||||
|
assert docs_transformed[0].page_content == (
|
||||||
|
"Header "
|
||||||
|
"**1st paragraph.** "
|
||||||
|
"2nd paragraph. "
|
||||||
|
"Here is [link](http://example.com) "
|
||||||
|
"Ignore at end"
|
||||||
|
)
|
||||||
|
|
||||||
|
markdownify = MarkdownifyTransformer(convert="p")
|
||||||
|
documents = [Document(page_content=paragraphs_html)]
|
||||||
|
docs_transformed = await markdownify.atransform_documents(documents)
|
||||||
|
assert docs_transformed[0].page_content == (
|
||||||
|
"Header "
|
||||||
|
"1st paragraph.\n\n "
|
||||||
|
"2nd paragraph. Here is link\n\n "
|
||||||
|
"Ignore at end"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("markdownify")
|
||||||
|
async def test_strip_convert_conflict_error_async() -> None:
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError,
|
||||||
|
match="You may specify either tags to strip or tags to convert, but not both.",
|
||||||
|
):
|
||||||
|
markdownify = MarkdownifyTransformer(strip="h1", convert=["strong", "a"])
|
||||||
|
paragraphs_html = (
|
||||||
|
"<html>"
|
||||||
|
"<h1>Header</h1>"
|
||||||
|
" <p><strong>1st paragraph.</strong></p>"
|
||||||
|
' <p>2nd paragraph. Here is <a href="http://example.com">link</a></p>'
|
||||||
|
' <img src="image.jpg" alt="Sample Image">'
|
||||||
|
"<h1>Ignore at end</h1></html>"
|
||||||
|
)
|
||||||
|
documents = [Document(page_content=paragraphs_html)]
|
||||||
|
await markdownify.atransform_documents(documents)
|
||||||
|
Loading…
Reference in New Issue
Block a user