security: Remove xslt_path and harden XML parsers in HTMLSectionSplitter: package: langchain-text-splitters (#31819)

## Summary
- Removes the `xslt_path` parameter from HTMLSectionSplitter to
eliminate XXE attack vector
- Hardens XML/HTML parsers with secure configurations to prevent XXE
attacks
- Adds comprehensive security tests to ensure the vulnerability is fixed

  ## Context
This PR addresses a critical XXE vulnerability discovered in the
HTMLSectionSplitter component. The vulnerability allowed attackers to:
- Read sensitive local files (SSH keys, passwords, configuration files)
  - Perform Server-Side Request Forgery (SSRF) attacks
  - Exfiltrate data to attacker-controlled servers

  ## Changes Made
1. **Removed `xslt_path` parameter** - This eliminates the primary
attack vector where users could supply malicious XSLT files
2. **Hardened XML parsers** - Added security configurations to prevent
XXE attacks even with the default XSLT:
     - `no_network=True` - Blocks network access
- `resolve_entities=False` - Prevents entity expansion -
`load_dtd=False` - Disables DTD processing -
`XSLTAccessControl.DENY_ALL` - Blocks all file/network I/O in XSLT
transformations

3. **Added security tests** - New test file `test_html_security.py` with
comprehensive tests for various XXE attack vectors
4. **Updated existing tests** - Modified tests that were using the
removed `xslt_path` parameter

  ## Test Plan
  - [x] All existing tests pass
  - [x] New security tests verify XXE attacks are blocked
  - [x] Code passes linting and formatting checks
  - [x] Tested with both old and new versions of lxml


Twitter handle: @_colemurray
This commit is contained in:
Cole Murray 2025-07-02 12:24:08 -07:00 committed by GitHub
parent 815d11ed6a
commit 43eef43550
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 146 additions and 47 deletions

View File

@ -309,7 +309,6 @@ class HTMLSectionSplitter:
def __init__( def __init__(
self, self,
headers_to_split_on: List[Tuple[str, str]], headers_to_split_on: List[Tuple[str, str]],
xslt_path: Optional[str] = None,
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
"""Create a new HTMLSectionSplitter. """Create a new HTMLSectionSplitter.
@ -318,20 +317,13 @@ class HTMLSectionSplitter:
headers_to_split_on: list of tuples of headers we want to track mapped to headers_to_split_on: list of tuples of headers we want to track mapped to
(arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4, (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"]. h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"].
xslt_path: path to xslt file for document transformation.
Uses a default if not passed.
Needed for html contents that using different format and layouts.
**kwargs (Any): Additional optional arguments for customizations. **kwargs (Any): Additional optional arguments for customizations.
""" """
self.headers_to_split_on = dict(headers_to_split_on) self.headers_to_split_on = dict(headers_to_split_on)
self.xslt_path = (
if xslt_path is None: pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
self.xslt_path = ( ).absolute()
pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
).absolute()
else:
self.xslt_path = pathlib.Path(xslt_path).absolute()
self.kwargs = kwargs self.kwargs = kwargs
def split_documents(self, documents: Iterable[Document]) -> List[Document]: def split_documents(self, documents: Iterable[Document]) -> List[Document]:
@ -457,11 +449,20 @@ class HTMLSectionSplitter:
"Unable to import lxml, please install with `pip install lxml`." "Unable to import lxml, please install with `pip install lxml`."
) from e ) from e
# use lxml library to parse html document and return xml ElementTree # use lxml library to parse html document and return xml ElementTree
parser = etree.HTMLParser() # Create secure parsers to prevent XXE attacks
tree = etree.parse(StringIO(html_content), parser) html_parser = etree.HTMLParser(no_network=True)
xslt_parser = etree.XMLParser(
resolve_entities=False, no_network=True, load_dtd=False
)
xslt_tree = etree.parse(self.xslt_path) # Apply XSLT access control to prevent file/network access
transform = etree.XSLT(xslt_tree) # DENY_ALL is a predefined access control that blocks all file/network access
# Type ignore needed due to incomplete lxml type stubs
ac = etree.XSLTAccessControl.DENY_ALL # type: ignore[attr-defined]
tree = etree.parse(StringIO(html_content), html_parser)
xslt_tree = etree.parse(self.xslt_path, xslt_parser)
transform = etree.XSLT(xslt_tree, access_control=ac)
result = transform(tree) result = transform(tree)
return str(result) return str(result)

View File

@ -0,0 +1,130 @@
"""Security tests for HTML splitters to prevent XXE attacks."""
import pytest
from langchain_text_splitters.html import HTMLSectionSplitter
@pytest.mark.requires("lxml", "bs4")
class TestHTMLSectionSplitterSecurity:
"""Security tests for HTMLSectionSplitter to ensure XXE prevention."""
def test_xxe_entity_attack_blocked(self) -> None:
"""Test that external entity attacks are blocked."""
# Create HTML content to process
html_content = """<html><body><p>Test content</p></body></html>"""
# Since xslt_path parameter is removed, this attack vector is eliminated
# The splitter should use only the default XSLT
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
# Process the HTML - should not contain any external entity content
result = splitter.split_text(html_content)
# Verify that no external entity content is present
all_content = " ".join([doc.page_content for doc in result])
assert "root:" not in all_content # /etc/passwd content
assert "XXE Attack Result" not in all_content
def test_xxe_document_function_blocked(self) -> None:
"""Test that XSLT document() function attacks are blocked."""
# Even if someone modifies the default XSLT internally,
# the secure parser configuration should block document() attacks
html_content = (
"""<html><body><h1>Test Header</h1><p>Test content</p></body></html>"""
)
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
# Process the HTML safely
result = splitter.split_text(html_content)
# Should process normally without any security issues
assert len(result) > 0
assert any("Test content" in doc.page_content for doc in result)
def test_secure_parser_configuration(self) -> None:
"""Test that parsers are configured with security settings."""
# This test verifies our security hardening is in place
html_content = """<html><body><h1>Test</h1></body></html>"""
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
# The convert_possible_tags_to_header method should use secure parsers
result = splitter.convert_possible_tags_to_header(html_content)
# Result should be valid transformed HTML
assert result is not None
assert isinstance(result, str)
def test_no_network_access(self) -> None:
"""Test that network access is blocked in parsers."""
# Create HTML that might trigger network access
html_with_external_ref = """<?xml version="1.0"?>
<!DOCTYPE html [
<!ENTITY external SYSTEM "http://attacker.com/xxe">
]>
<html>
<body>
<h1>Test</h1>
<p>&external;</p>
</body>
</html>"""
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
# Process the HTML - should not make network requests
result = splitter.split_text(html_with_external_ref)
# Verify no external content is included
all_content = " ".join([doc.page_content for doc in result])
assert "attacker.com" not in all_content
def test_dtd_processing_disabled(self) -> None:
"""Test that DTD processing is disabled."""
# HTML with DTD that attempts to define entities
html_with_dtd = """<!DOCTYPE html [
<!ELEMENT html (body)>
<!ELEMENT body (h1, p)>
<!ELEMENT h1 (#PCDATA)>
<!ELEMENT p (#PCDATA)>
<!ENTITY test "This is a test entity">
]>
<html>
<body>
<h1>Header</h1>
<p>&test;</p>
</body>
</html>"""
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
# Process the HTML - entities should not be resolved
result = splitter.split_text(html_with_dtd)
# The entity should not be expanded
all_content = " ".join([doc.page_content for doc in result])
assert "This is a test entity" not in all_content
def test_safe_default_xslt_usage(self) -> None:
"""Test that the default XSLT file is used safely."""
# Test with HTML that has font-size styling (what the default XSLT handles)
html_with_font_size = """<html>
<body>
<span style="font-size: 24px;">Large Header</span>
<p>Content under large text</p>
<span style="font-size: 18px;">Small Header</span>
<p>Content under small text</p>
</body>
</html>"""
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
# Process the HTML using the default XSLT
result = splitter.split_text(html_with_font_size)
# Should successfully process the content
assert len(result) > 0
# Large font text should be converted to header
assert any("Large Header" in str(doc.metadata.values()) for doc in result)

View File

@ -3,7 +3,6 @@
import random import random
import re import re
import string import string
from pathlib import Path
from typing import Any, Callable, List, Tuple from typing import Any, Callable, List, Tuple
import pytest import pytest
@ -2865,37 +2864,6 @@ def test_happy_path_splitting_based_on_header_with_whitespace_chars() -> None:
assert docs[2].metadata["Header 2"] == "Baz" assert docs[2].metadata["Header 2"] == "Baz"
@pytest.mark.requires("bs4")
@pytest.mark.requires("lxml")
def test_section_splitter_accepts_a_relative_path() -> None:
html_string = """<html><body><p>Foo</p></body></html>"""
test_file = Path("tests/test_data/test_splitter.xslt")
assert test_file.is_file()
sec_splitter = HTMLSectionSplitter(
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
xslt_path=test_file.as_posix(),
)
sec_splitter.split_text(html_string)
@pytest.mark.requires("bs4")
@pytest.mark.requires("lxml")
def test_section_splitter_accepts_an_absolute_path() -> None:
html_string = """<html><body><p>Foo</p></body></html>"""
test_file = Path("tests/test_data/test_splitter.xslt").absolute()
assert test_file.is_absolute()
assert test_file.is_file()
sec_splitter = HTMLSectionSplitter(
headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
xslt_path=test_file.as_posix(),
)
sec_splitter.split_text(html_string)
@pytest.mark.requires("bs4") @pytest.mark.requires("bs4")
@pytest.mark.requires("lxml") @pytest.mark.requires("lxml")
def test_happy_path_splitting_with_duplicate_header_tag() -> None: def test_happy_path_splitting_with_duplicate_header_tag() -> None: