mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 13:54:48 +00:00
# Description Implements the `atransform_documents` method for `MarkdownifyTransformer` using the `asyncio` built-in library for concurrency. Note that this is mainly for API completeness when working with async frameworks rather than for performance, since the `markdownify` function is not I/O bound because it works with `Document` objects already in memory. # Issue Fixes #27865 # Dependencies No new dependencies added, but [`markdownify`](https://github.com/matthewwithanm/python-markdownify) is required since this PR updates the `markdownify` integration. # Tests and docs - Tests added - I did not modify the docstrings since they already described the basic functionality, and [the API docs also already included a description](https://python.langchain.com/api_reference/community/document_transformers/langchain_community.document_transformers.markdownify.MarkdownifyTransformer.html#langchain_community.document_transformers.markdownify.MarkdownifyTransformer.atransform_documents). If it would be helpful, I would be happy to update the docstrings and/or the API docs. # Lint and test - [x] format - [x] lint - [x] test I ran formatting with `make format`, linting with `make lint`, and confirmed that tests pass using `make test`. Note that some unit tests pass in CI but may fail when running `make test`. Those unit tests are: - `test_extract_html` (and `test_extract_html_async`) - `test_strip_tags` (and `test_strip_tags_async`) - `test_convert_tags` (and `test_convert_tags_async`) The reason for the difference is that there are trailing spaces when the tests are run in the CI checks, and no trailing spaces when run with `make test`. I ensured that the tests pass in CI, but they may fail with `make test` due to the addition of trailing spaces. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
77 lines
2.9 KiB
Python
77 lines
2.9 KiB
Python
import re
|
|
from typing import Any, List, Optional, Sequence, Union
|
|
|
|
from langchain_core.documents import BaseDocumentTransformer, Document
|
|
|
|
|
|
class MarkdownifyTransformer(BaseDocumentTransformer):
    """Converts HTML documents to Markdown format with customizable options for handling
    links, images, other tags and heading styles using the markdownify library.

    Arguments:
        strip: A list of tags to strip. This option can't be used with the convert option.
        convert: A list of tags to convert. This option can't be used with the strip option.
        autolinks: A boolean indicating whether the "automatic link" style should be used when an ``<a>`` tag's contents match its href. Defaults to True.
        heading_style: Defines how headings should be converted. Accepted values are ATX, ATX_CLOSED, SETEXT, and UNDERLINED (which is an alias for SETEXT). Defaults to ATX.
        kwargs: Additional options to pass to markdownify.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import MarkdownifyTransformer
            markdownify = MarkdownifyTransformer()
            docs_transform = markdownify.transform_documents(docs)

    More configuration options can be found at the markdownify GitHub page:
    https://github.com/matthewwithanm/python-markdownify
    """  # noqa: E501

    def __init__(
        self,
        strip: Optional[Union[str, List[str]]] = None,
        convert: Optional[Union[str, List[str]]] = None,
        autolinks: bool = True,
        heading_style: str = "ATX",
        **kwargs: Any,
    ) -> None:
        """Store the conversion options used by ``transform_documents``.

        Args:
            strip: Tag name or list of tag names to strip. A single string is
                wrapped in a one-element list.
            convert: Tag name or list of tag names to convert. A single string
                is wrapped in a one-element list.
            autolinks: Forwarded to markdownify as its ``autolinks`` option.
            heading_style: Forwarded to markdownify as its ``heading_style``
                option.
            **kwargs: Any further options, forwarded verbatim to markdownify
                on every conversion.
        """
        # Accept a bare tag name for convenience; downstream code always sees
        # either ``None`` or a list.
        self.strip = [strip] if isinstance(strip, str) else strip
        self.convert = [convert] if isinstance(convert, str) else convert
        self.autolinks = autolinks
        self.heading_style = heading_style
        self.additional_options = kwargs

    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Convert the HTML ``page_content`` of each document to Markdown.

        Args:
            documents: Documents whose ``page_content`` holds HTML.
            **kwargs: Accepted for interface compatibility; not used by this
                implementation.

        Returns:
            A list with one new ``Document`` per input, in the same order. Each
            carries the converted Markdown and the *same* ``metadata`` object
            as its source document (the metadata is not copied).

        Raises:
            ImportError: If the ``markdownify`` package is not installed.
        """
        try:
            # Imported lazily so the dependency is only required when a
            # conversion is actually performed.
            from markdownify import markdownify
        except ImportError as e:
            # Single-line message (no embedded line break/indentation) and an
            # explicit cause so the original import failure stays visible.
            raise ImportError(
                "markdownify package not found, please "
                "install it with `pip install markdownify`"
            ) from e

        converted_documents = []
        for doc in documents:
            markdown_content = (
                markdownify(
                    html=doc.page_content,
                    strip=self.strip,
                    convert=self.convert,
                    autolinks=self.autolinks,
                    heading_style=self.heading_style,
                    **self.additional_options,
                )
                # Turn non-breaking spaces into regular spaces, then trim
                # leading/trailing whitespace.
                .replace("\xa0", " ")
                .strip()
            )

            # Collapse any run of blank / whitespace-only lines into exactly
            # one blank line.
            cleaned_markdown = re.sub(r"\n\s*\n", "\n\n", markdown_content)

            converted_documents.append(
                Document(cleaned_markdown, metadata=doc.metadata)
            )

        return converted_documents
|