diff --git a/libs/langchain/langchain/document_transformers/html2text.py b/libs/langchain/langchain/document_transformers/html2text.py
index ede87af9c7c..3c123950d3c 100644
--- a/libs/langchain/langchain/document_transformers/html2text.py
+++ b/libs/langchain/langchain/document_transformers/html2text.py
@@ -5,13 +5,22 @@ from langchain.schema import BaseDocumentTransformer, Document
 
 class Html2TextTransformer(BaseDocumentTransformer):
     """Replace occurrences of a particular search pattern with a replacement string
+
+    Arguments:
+        ignore_links: Whether links should be ignored; defaults to True.
+        ignore_images: Whether images should be ignored; defaults to True.
+
     Example:
         .. code-block:: python
             from langchain.document_transformers import Html2TextTransformer
-            html2text=Html2TextTransformer()
-            docs_transform=html2text.transform_documents(docs)
+            html2text = Html2TextTransformer()
+            docs_transform = html2text.transform_documents(docs)
     """
 
+    def __init__(self, ignore_links: bool = True, ignore_images: bool = True) -> None:
+        self.ignore_links = ignore_links
+        self.ignore_images = ignore_images
+
     def transform_documents(
         self,
         documents: Sequence[Document],
@@ -25,10 +34,11 @@ class Html2TextTransformer(BaseDocumentTransformer):
                 install it with `pip install html2text`"""
             )
 
-        # Create an html2text.HTML2Text object and override some properties
+        # Create a html2text.HTML2Text object and override some properties
         h = html2text.HTML2Text()
-        h.ignore_links = True
-        h.ignore_images = True
+        h.ignore_links = self.ignore_links
+        h.ignore_images = self.ignore_images
+
         for d in documents:
             d.page_content = h.handle(d.page_content)
         return documents