"""Security tests for HTML splitters to prevent XXE attacks.""" import pytest from langchain_text_splitters.html import HTMLSectionSplitter @pytest.mark.requires("lxml", "bs4") class TestHTMLSectionSplitterSecurity: """Security tests for HTMLSectionSplitter to ensure XXE prevention.""" def test_xxe_entity_attack_blocked(self) -> None: """Test that external entity attacks are blocked.""" # Create HTML content to process html_content = """

Test content

""" # Since xslt_path parameter is removed, this attack vector is eliminated # The splitter should use only the default XSLT splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")]) # Process the HTML - should not contain any external entity content result = splitter.split_text(html_content) # Verify that no external entity content is present all_content = " ".join([doc.page_content for doc in result]) assert "root:" not in all_content # /etc/passwd content assert "XXE Attack Result" not in all_content def test_xxe_document_function_blocked(self) -> None: """Test that XSLT document() function attacks are blocked.""" # Even if someone modifies the default XSLT internally, # the secure parser configuration should block document() attacks html_content = ( """

Test Header

Test content

""" ) splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")]) # Process the HTML safely result = splitter.split_text(html_content) # Should process normally without any security issues assert len(result) > 0 assert any("Test content" in doc.page_content for doc in result) def test_secure_parser_configuration(self) -> None: """Test that parsers are configured with security settings.""" # This test verifies our security hardening is in place html_content = """

Test

""" splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")]) # The convert_possible_tags_to_header method should use secure parsers result = splitter.convert_possible_tags_to_header(html_content) # Result should be valid transformed HTML assert result is not None assert isinstance(result, str) def test_no_network_access(self) -> None: """Test that network access is blocked in parsers.""" # Create HTML that might trigger network access html_with_external_ref = """ ]>

Test

&external;

""" splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")]) # Process the HTML - should not make network requests result = splitter.split_text(html_with_external_ref) # Verify no external content is included all_content = " ".join([doc.page_content for doc in result]) assert "attacker.com" not in all_content def test_dtd_processing_disabled(self) -> None: """Test that DTD processing is disabled.""" # HTML with DTD that attempts to define entities html_with_dtd = """ ]>

Header

&test;

""" splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")]) # Process the HTML - entities should not be resolved result = splitter.split_text(html_with_dtd) # The entity should not be expanded all_content = " ".join([doc.page_content for doc in result]) assert "This is a test entity" not in all_content def test_safe_default_xslt_usage(self) -> None: """Test that the default XSLT file is used safely.""" # Test with HTML that has font-size styling (what the default XSLT handles) html_with_font_size = """ Large Header

Content under large text

Small Header

Content under small text

""" splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")]) # Process the HTML using the default XSLT result = splitter.split_text(html_with_font_size) # Should successfully process the content assert len(result) > 0 # Large font text should be converted to header assert any("Large Header" in str(doc.metadata.values()) for doc in result)