add HTMLHeaderTextSplitter (#11039)
Description: Similar in concept to the `MarkdownHeaderTextSplitter`, the `HTMLHeaderTextSplitter` is a "structure-aware" chunker that splits text at the element level and adds metadata for each header "relevant" to any given chunk. It can return chunks element by element or combine elements with the same metadata, with the objectives of (a) keeping related text grouped (more or less) semantically and (b) preserving context-rich information encoded in document structures. It can be used with other text splitters as part of a chunking pipeline.

Dependency: lxml python package

Maintainer: @hwchase17
Twitter handle: @MartinZirulnik

---------

Co-authored-by: PresidioVantage <github@presidiovantage.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
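A minimal usage sketch (not part of the diff below; it assumes the class is exported as `langchain.text_splitter.HTMLHeaderTextSplitter`, and the sample HTML is illustrative):

    from langchain.text_splitter import HTMLHeaderTextSplitter  # assumed export path

    # map header tags to (arbitrary) metadata keys, as described above
    splitter = HTMLHeaderTextSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
    )

    html = (
        "<html><body>"
        "<h1>Doc title</h1><p>Intro text.</p>"
        "<h2>Section A</h2><p>Text inside section A.</p>"
        "</body></html>"
    )

    # each chunk carries the headers "relevant" to it as metadata, e.g.
    # {"Header 1": "Doc title", "Header 2": "Section A"}
    for doc in splitter.split_text(html):
        print(doc.metadata, "|", doc.page_content)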
langchain/document_transformers/xsl/html_chunks_with_headers.xslt
@@ -0,0 +1,199 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!-- HTML PRE CHUNK:
    This performs a best-effort preliminary "chunking" of text in an HTML file,
    matching each chunk with a "headers" metadata value based on header tags in proximity.

    recursively visits every element (template mode=list).
    for every element with tag name of interest (only):
        1. serializes a div (and metadata marking the element's xpath).
        2. calculates all text-content for the given element, including descendant elements which are *not* themselves tags of interest.
        3. if any such text-content was found, serializes a "headers" (span.headers) along with this text (span.chunk).

    to calculate the "headers" of an element:
        1. recursively gets the *nearest* prior-siblings for headings of *each* level
        2. recursively repeats step 1 for each ancestor (regardless of tag)
    n.b. this recursion is only performed (beginning with) elements which are
    both (1) tags-of-interest and (2) have their own text-content.
-->
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns="http://www.w3.org/1999/xhtml">

    <xsl:param name="tags">div|p|blockquote|ol|ul</xsl:param>

    <xsl:template match="/">
        <html>
            <head>
                <style>
                    div {
                        border: solid;
                        margin-top: .5em;
                        padding-left: .5em;
                    }
                    h1, h2, h3, h4, h5, h6 {
                        margin: 0;
                    }
                    .xpath {
                        color: blue;
                    }
                    .chunk {
                        margin: .5em 1em;
                    }
                </style>
            </head>
            <body>
                <!-- create "filtered tree" with only tags of interest -->
                <xsl:apply-templates select="*" />
            </body>
        </html>
    </xsl:template>

    <xsl:template match="*">
        <xsl:choose>
            <!-- tags of interest get serialized into the filtered tree (and recurse down child elements) -->
            <xsl:when test="contains(
                    concat('|', $tags, '|'),
                    concat('|', local-name(), '|'))">

                <xsl:variable name="xpath">
                    <xsl:apply-templates mode="xpath" select="." />
                </xsl:variable>
                <xsl:variable name="txt">
                    <!-- recurse down child text-nodes and elements -->
                    <xsl:apply-templates mode="text" />
                </xsl:variable>
                <xsl:variable name="txt-norm" select="normalize-space($txt)" />

                <div title="{$xpath}">
                    <small class="xpath">
                        <xsl:value-of select="$xpath" />
                    </small>

                    <xsl:if test="$txt-norm">
                        <xsl:variable name="headers">
                            <xsl:apply-templates mode="headingsWithAncestors" select="." />
                        </xsl:variable>

                        <xsl:if test="normalize-space($headers)">
                            <span class="headers">
                                <xsl:copy-of select="$headers" />
                            </span>
                        </xsl:if>

                        <p class="chunk">
                            <xsl:value-of select="$txt-norm" />
                        </p>
                    </xsl:if>

                    <xsl:apply-templates select="*" />
                </div>
            </xsl:when>

            <!-- all other tags get "skipped" and recurse down child elements -->
            <xsl:otherwise>
                <xsl:apply-templates select="*" />
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <!-- text mode:
        prints text nodes;
        for elements, recurses down child nodes (text and elements) *except* certain exceptions:
            tags of interest (handled in their own list-mode match),
            non-content text (e.g. script|style)
    -->

    <!-- ignore non-content text -->
    <xsl:template mode="text" match="script|style" />

    <!-- for all other elements *except tags of interest*, recurse on child-nodes (text and elements) -->
    <xsl:template mode="text" match="*">
        <xsl:choose>
            <!-- ignore tags of interest -->
            <xsl:when test="contains(
                    concat('|', $tags, '|'),
                    concat('|', local-name(), '|'))" />

            <xsl:otherwise>
                <xsl:apply-templates mode="text" />
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <!-- xpath mode:
        return an xpath which matches this element uniquely
    -->
    <xsl:template mode="xpath" match="*">
        <!-- recurse up parents -->
        <xsl:apply-templates mode="xpath" select="parent::*" />

        <xsl:value-of select="name()" />
        <xsl:text>[</xsl:text>
        <xsl:value-of select="1+count(preceding-sibling::*)" />
        <xsl:text>]/</xsl:text>
    </xsl:template>

    <!-- headingsWithAncestors mode:
        recurses up parents (ALL ancestors)
    -->
    <xsl:template mode="headingsWithAncestors" match="*">
        <!-- recurse -->
        <xsl:apply-templates mode="headingsWithAncestors" select="parent::*" />

        <xsl:apply-templates mode="headingsWithPriorSiblings" select=".">
            <xsl:with-param name="maxHead" select="6" />
        </xsl:apply-templates>
    </xsl:template>

    <!-- headingsWithPriorSiblings mode:
        recurses up preceding-siblings
    -->
    <xsl:template mode="headingsWithPriorSiblings" match="*">
        <xsl:param name="maxHead" />
        <xsl:variable name="headLevel" select="number(substring(local-name(), 2))" />

        <xsl:choose>
            <xsl:when test="'h' = substring(local-name(), 1, 1) and $maxHead >= $headLevel">

                <!-- recurse up to prior sibling; max level one less than current -->
                <xsl:apply-templates mode="headingsWithPriorSiblings" select="preceding-sibling::*[1]">
                    <xsl:with-param name="maxHead" select="$headLevel - 1" />
                </xsl:apply-templates>

                <xsl:apply-templates mode="heading" select="." />
            </xsl:when>

            <!-- special case for 'header' tag, serialize child-headers -->
            <xsl:when test="self::header">
                <xsl:apply-templates mode="heading" select="h1|h2|h3|h4|h5|h6" />
                <!--
                    we choose not to recurse further up prior-siblings in this case,
                    but n.b. the 'headingsWithAncestors' template above will still continue recursion.
                -->
            </xsl:when>

            <xsl:otherwise>
                <!-- recurse up to prior sibling; no other work on this element -->
                <xsl:apply-templates mode="headingsWithPriorSiblings" select="preceding-sibling::*[1]">
                    <xsl:with-param name="maxHead" select="$maxHead" />
                </xsl:apply-templates>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <xsl:template mode="heading" match="h1|h2|h3|h4|h5|h6">
        <xsl:copy>
            <xsl:value-of select="normalize-space(.)" />
        </xsl:copy>
    </xsl:template>

</xsl:stylesheet>
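For context, a sketch (not part of this commit) of applying the stylesheet directly with lxml, mirroring the steps `split_text_from_file` takes in the Python diff below; the sample HTML and the relative stylesheet path are illustrative:

    from io import StringIO
    from lxml import etree

    html = StringIO(
        "<html><body>"
        "<h1>Intro</h1><p>Some text under the intro heading.</p>"
        "<h2>Details</h2><p>More text under the subheading.</p>"
        "</body></html>"
    )

    # parse HTML into an element tree, then run the XSLT transform
    tree = etree.parse(html, etree.HTMLParser())
    transform = etree.XSLT(etree.parse("html_chunks_with_headers.xslt"))  # illustrative path
    result = transform(tree)

    # the output is the "filtered tree": one div per tag of interest, titled with
    # its xpath and holding span.headers (nearby h1-h6) plus p.chunk (text content)
    print(str(result))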
langchain/text_splitter.py
@@ -8,7 +8,7 @@
    BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
        RecursiveCharacterTextSplitter --> <name>TextSplitter

-Note: **MarkdownHeaderTextSplitter** does not derive from TextSplitter.
+Note: **MarkdownHeaderTextSplitter** and **HTMLHeaderTextSplitter** do not derive from TextSplitter.


**Main helpers:**
@@ -23,10 +23,12 @@ from __future__ import annotations

import copy
import logging
import pathlib
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from io import BytesIO, StringIO
from typing import (
    AbstractSet,
    Any,
@@ -46,6 +48,8 @@ from typing import (
    cast,
)

import requests

from langchain.docstore.document import Document
from langchain.schema import BaseDocumentTransformer
@@ -463,6 +467,159 @@ class MarkdownHeaderTextSplitter:
        ]


class ElementType(TypedDict):
    """Element type as typed dict."""

    url: str
    xpath: str
    content: str
    metadata: Dict[str, str]

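# Illustrative only (not part of this commit): a populated ElementType might
# look like the following, with the xpath string produced by the stylesheet's
# "xpath" template above:
#   {"url": "https://example.com/page.html",
#    "xpath": "html[1]/body[1]/p[2]/",
#    "content": "More text under the subheading.",
#    "metadata": {"Header 2": "Details"}}
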
class HTMLHeaderTextSplitter:
    """
    Splitting HTML files based on specified headers.
    Requires lxml package.
    """

    def __init__(
        self,
        headers_to_split_on: List[Tuple[str, str]],
        return_each_element: bool = False,
    ):
        """Create a new HTMLHeaderTextSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2")].
            return_each_element: Return each element w/ associated headers.
        """
        # Output element-by-element or aggregated into chunks w/ common headers
        self.return_each_element = return_each_element
        self.headers_to_split_on = sorted(headers_to_split_on)

    def aggregate_elements_to_chunks(
        self, elements: List[ElementType]
    ) -> List[Document]:
        """Combine elements with common metadata into chunks

        Args:
            elements: HTML element content with associated identifying info and metadata
        """
        aggregated_chunks: List[ElementType] = []

        for element in elements:
            if (
                aggregated_chunks
                and aggregated_chunks[-1]["metadata"] == element["metadata"]
            ):
                # If the last element in the aggregated list
                # has the same metadata as the current element,
                # append the current content to the last element's content
                aggregated_chunks[-1]["content"] += " \n" + element["content"]
            else:
                # Otherwise, append the current element to the aggregated list
                aggregated_chunks.append(element)

        return [
            Document(page_content=chunk["content"], metadata=chunk["metadata"])
            for chunk in aggregated_chunks
        ]

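    # Illustrative only (not part of this commit): two consecutive elements with
    # identical metadata collapse into one Document. For example,
    #   [{"content": "First paragraph.", "metadata": {"Header 1": "Intro"}, ...},
    #    {"content": "Second paragraph.", "metadata": {"Header 1": "Intro"}, ...}]
    # aggregates to
    #   [Document(page_content="First paragraph. \nSecond paragraph.",
    #             metadata={"Header 1": "Intro"})]
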
    def split_text_from_url(self, url: str) -> List[Document]:
        """Split HTML from web URL

        Args:
            url: web URL
        """
        r = requests.get(url)
        return self.split_text_from_file(BytesIO(r.content))

    def split_text(self, text: str) -> List[Document]:
        """Split HTML text string

        Args:
            text: HTML text
        """
        return self.split_text_from_file(StringIO(text))

    def split_text_from_file(self, file: Any) -> List[Document]:
        """Split HTML file

        Args:
            file: HTML file
        """
        try:
            from lxml import etree
        except ImportError as e:
            raise ImportError(
                "Unable to import lxml, please install with `pip install lxml`."
            ) from e
        # use lxml library to parse html document and return xml ElementTree
        parser = etree.HTMLParser()
        tree = etree.parse(file, parser)

        # document transformation for "structure-aware" chunking is handled with xsl.
        # see comments in html_chunks_with_headers.xslt for more detailed information.
        xslt_path = (
            pathlib.Path(__file__).parent
            / "document_transformers/xsl/html_chunks_with_headers.xslt"
        )
        xslt_tree = etree.parse(xslt_path)
        transform = etree.XSLT(xslt_tree)
        result = transform(tree)
        result_dom = etree.fromstring(str(result))

        # create filter and mapping for header metadata
        header_filter = [header[0] for header in self.headers_to_split_on]
        header_mapping = dict(self.headers_to_split_on)

        # map xhtml namespace prefix
        ns_map = {"h": "http://www.w3.org/1999/xhtml"}

        # build list of elements from DOM
        elements = []
        for element in result_dom.findall("*//*", ns_map):
            if element.findall("*[@class='headers']") or element.findall(
                "*[@class='chunk']"
            ):
                elements.append(
                    ElementType(
                        url=file,
                        xpath="".join(
                            [
                                node.text
                                for node in element.findall("*[@class='xpath']", ns_map)
                            ]
                        ),
                        content="".join(
                            [
                                node.text
                                for node in element.findall("*[@class='chunk']", ns_map)
                            ]
                        ),
                        metadata={
                            # Add text of specified headers to metadata using header
                            # mapping.
                            header_mapping[node.tag]: node.text
                            for node in filter(
                                lambda x: x.tag in header_filter,
                                element.findall("*[@class='headers']/*", ns_map),
                            )
                        },
                    )
                )

        if not self.return_each_element:
            return self.aggregate_elements_to_chunks(elements)
        else:
            return [
                Document(page_content=chunk["content"], metadata=chunk["metadata"])
                for chunk in elements
            ]


# should be in newer Python versions (3.10+)
# @dataclass(frozen=True, kw_only=True, slots=True)
@dataclass(frozen=True)
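A usage note (again a sketch, not part of the diff): `split_text_from_url` fetches a page and runs it through the same pipeline, and `return_each_element=True` skips the aggregation step so each element becomes its own Document. The URL below is illustrative:

    from langchain.text_splitter import HTMLHeaderTextSplitter  # assumed export path

    splitter = HTMLHeaderTextSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
        return_each_element=True,  # one Document per element, no merging
    )
    docs = splitter.split_text_from_url("https://example.com/some-page.html")

    # the resulting Documents can be fed to another splitter (e.g. a
    # character-based one) as part of a chunking pipeline
    for doc in docs:
        print(doc.metadata, "|", doc.page_content[:40])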