From e0a4af8d8b024895865aba9e02bd9b8791ecc33a Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Sun, 31 Aug 2025 20:46:11 +0200 Subject: [PATCH] docs(text-splitters): fix some docstrings (#32767) --- .../langchain_text_splitters/html.py | 122 +++++++++--------- .../langchain_text_splitters/json.py | 24 ++-- .../langchain_text_splitters/jsx.py | 15 ++- .../langchain_text_splitters/markdown.py | 44 +++---- 4 files changed, 99 insertions(+), 106 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 55090cdd234..4d78fbcc35a 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -53,23 +53,8 @@ class HTMLHeaderTextSplitter: gracefully handles multiple levels of nested headers, creating a rich, hierarchical representation of the content. - Args: - headers_to_split_on (List[Tuple[str, str]]): A list of (header_tag, - header_name) pairs representing the headers that define splitting - boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")] - will split content by

<h1> and <h2>

tags, assigning their textual - content to the Document metadata. - return_each_element (bool): If True, every HTML element encountered - (including headers, paragraphs, etc.) is returned as a separate - Document. If False, content under the same header hierarchy is - aggregated into fewer Documents. - - Returns: - List[Document]: A list of Document objects. Each Document contains - `page_content` holding the extracted text and `metadata` that maps - the header hierarchy to their corresponding titles. - Example: + .. code-block:: python from langchain_text_splitters.html_header_text_splitter import ( @@ -123,10 +108,15 @@ class HTMLHeaderTextSplitter: """Initialize with headers to split on. Args: - headers_to_split_on: A list of tuples where - each tuple contains a header tag and its corresponding value. - return_each_element: Whether to return each HTML - element as a separate Document. Defaults to False. + headers_to_split_on: A list of (header_tag, + header_name) pairs representing the headers that define splitting + boundaries. For example, [("h1", "Header 1"), ("h2", "Header 2")] + will split content by

<h1> and <h2>

tags, assigning their textual + content to the Document metadata. + return_each_element: If True, every HTML element encountered + (including headers, paragraphs, etc.) is returned as a separate + Document. If False, content under the same header hierarchy is + aggregated into fewer Documents. """ # Sort headers by their numeric level so that h1 < h2 < h3... self.headers_to_split_on = sorted( @@ -143,7 +133,9 @@ class HTMLHeaderTextSplitter: text: The HTML text to split. Returns: - A list of split Document objects. + A list of split Document objects. Each Document contains + `page_content` holding the extracted text and `metadata` that maps + the header hierarchy to their corresponding titles. """ return self.split_text_from_file(StringIO(text)) @@ -158,7 +150,9 @@ class HTMLHeaderTextSplitter: **kwargs: Additional keyword arguments for the request. Returns: - A list of split Document objects. + A list of split Document objects. Each Document contains + `page_content` holding the extracted text and `metadata` that maps + the header hierarchy to their corresponding titles. Raises: requests.RequestException: If the HTTP request fails. @@ -179,7 +173,9 @@ class HTMLHeaderTextSplitter: file: A file path or a file-like object containing HTML content. Returns: - A list of split Document objects. + A list of split Document objects. Each Document contains + `page_content` holding the extracted text and `metadata` that maps + the header hierarchy to their corresponding titles. """ if isinstance(file, str): with open(file, encoding="utf-8") as f: @@ -384,10 +380,11 @@ class HTMLSectionSplitter: Returns: List[Dict[str, Optional[str]]]: A list of dictionaries representing sections. - Each dictionary contains: - - 'header': The header text or a default title for the first section. - - 'content': The content under the header. - - 'tag_name': The name of the header tag (e.g., "h1", "h2"). 
+ Each dictionary contains: + + * 'header': The header text or a default title for the first section. + * 'content': The content under the header. + * 'tag_name': The name of the header tag (e.g., "h1", "h2"). """ try: from bs4 import BeautifulSoup @@ -508,40 +505,6 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer): .. versionadded: 0.3.5 - Args: - headers_to_split_on (List[Tuple[str, str]]): HTML headers (e.g., "h1", "h2") - that define content sections. - max_chunk_size (int): Maximum size for each chunk, with allowance for - exceeding this limit to preserve semantics. - chunk_overlap (int): Number of characters to overlap between chunks to ensure - contextual continuity. - separators (List[str]): Delimiters used by RecursiveCharacterTextSplitter for - further splitting. - elements_to_preserve (List[str]): HTML tags (e.g., ,
,