mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-15 22:44:36 +00:00
chore(text-splitters): select ALL rules with exclusions (#32325)
Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
committed by
GitHub
parent
20401df25d
commit
0c3e8ccd0e
@@ -1,6 +1,6 @@
|
||||
"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""
|
||||
|
||||
from typing import Any
|
||||
import re
|
||||
|
||||
import nltk
|
||||
import pytest
|
||||
@@ -15,11 +15,8 @@ def setup_module() -> None:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def spacy() -> Any:
|
||||
try:
|
||||
import spacy
|
||||
except ImportError:
|
||||
pytest.skip("Spacy not installed.")
|
||||
def spacy() -> None:
|
||||
spacy = pytest.importorskip("spacy")
|
||||
|
||||
# Check if en_core_web_sm model is available
|
||||
try:
|
||||
@@ -32,18 +29,27 @@ def spacy() -> Any:
|
||||
"en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
|
||||
)
|
||||
|
||||
return spacy
|
||||
|
||||
|
||||
def test_nltk_text_splitting_args() -> None:
|
||||
"""Test invalid arguments."""
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=re.escape(
|
||||
"Got a larger chunk overlap (4) than chunk size (2), should be smaller."
|
||||
),
|
||||
):
|
||||
NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
|
||||
|
||||
|
||||
def test_spacy_text_splitting_args(spacy: Any) -> None:
|
||||
@pytest.mark.usefixtures("spacy")
|
||||
def test_spacy_text_splitting_args() -> None:
|
||||
"""Test invalid arguments."""
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=re.escape(
|
||||
"Got a larger chunk overlap (4) than chunk size (2), should be smaller."
|
||||
),
|
||||
):
|
||||
SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
|
||||
|
||||
|
||||
@@ -57,8 +63,9 @@ def test_nltk_text_splitter() -> None:
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("spacy")
|
||||
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
|
||||
def test_spacy_text_splitter(pipeline: str, spacy: Any) -> None:
|
||||
def test_spacy_text_splitter(pipeline: str) -> None:
|
||||
"""Test splitting by sentence using Spacy."""
|
||||
text = "This is sentence one. And this is sentence two."
|
||||
separator = "|||"
|
||||
@@ -68,8 +75,9 @@ def test_spacy_text_splitter(pipeline: str, spacy: Any) -> None:
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("spacy")
|
||||
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
|
||||
def test_spacy_text_splitter_strip_whitespace(pipeline: str, spacy: Any) -> None:
|
||||
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
|
||||
"""Test splitting by sentence using Spacy."""
|
||||
text = "This is sentence one. And this is sentence two."
|
||||
separator = "|||"
|
||||
@@ -83,7 +91,9 @@ def test_spacy_text_splitter_strip_whitespace(pipeline: str, spacy: Any) -> None
|
||||
|
||||
def test_nltk_text_splitter_args() -> None:
|
||||
"""Test invalid arguments for NLTKTextSplitter."""
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(
|
||||
ValueError, match="When use_span_tokenize is True, separator should be ''"
|
||||
):
|
||||
NLTKTextSplitter(
|
||||
chunk_size=80,
|
||||
chunk_overlap=0,
|
||||
|
@@ -1,8 +1,7 @@
|
||||
"""Test text splitters that require an integration."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from transformers import GPT2TokenizerFast
|
||||
|
||||
from langchain_text_splitters import (
|
||||
TokenTextSplitter,
|
||||
@@ -13,25 +12,17 @@ from langchain_text_splitters.sentence_transformers import (
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sentence_transformers() -> Any:
|
||||
try:
|
||||
import sentence_transformers
|
||||
except ImportError:
|
||||
pytest.skip("SentenceTransformers not installed.")
|
||||
return sentence_transformers
|
||||
|
||||
|
||||
def test_huggingface_type_check() -> None:
|
||||
"""Test that type checks are done properly on input."""
|
||||
with pytest.raises(ValueError):
|
||||
CharacterTextSplitter.from_huggingface_tokenizer("foo")
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="Tokenizer received was not an instance of PreTrainedTokenizerBase",
|
||||
):
|
||||
CharacterTextSplitter.from_huggingface_tokenizer("foo") # type: ignore[arg-type]
|
||||
|
||||
|
||||
def test_huggingface_tokenizer() -> None:
|
||||
"""Test text splitter that uses a HuggingFace tokenizer."""
|
||||
from transformers import GPT2TokenizerFast
|
||||
|
||||
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
||||
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
|
||||
tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
|
||||
@@ -63,7 +54,8 @@ def test_token_text_splitter_from_tiktoken() -> None:
|
||||
assert expected_tokenizer == actual_tokenizer
|
||||
|
||||
|
||||
def test_sentence_transformers_count_tokens(sentence_transformers: Any) -> None:
|
||||
@pytest.mark.requires("sentence_transformers")
|
||||
def test_sentence_transformers_count_tokens() -> None:
|
||||
splitter = SentenceTransformersTokenTextSplitter(
|
||||
model_name="sentence-transformers/paraphrase-albert-small-v2"
|
||||
)
|
||||
@@ -78,7 +70,8 @@ def test_sentence_transformers_count_tokens(sentence_transformers: Any) -> None:
|
||||
assert expected_token_count == token_count
|
||||
|
||||
|
||||
def test_sentence_transformers_split_text(sentence_transformers: Any) -> None:
|
||||
@pytest.mark.requires("sentence_transformers")
|
||||
def test_sentence_transformers_split_text() -> None:
|
||||
splitter = SentenceTransformersTokenTextSplitter(
|
||||
model_name="sentence-transformers/paraphrase-albert-small-v2"
|
||||
)
|
||||
@@ -88,7 +81,8 @@ def test_sentence_transformers_split_text(sentence_transformers: Any) -> None:
|
||||
assert expected_text_chunks == text_chunks
|
||||
|
||||
|
||||
def test_sentence_transformers_multiple_tokens(sentence_transformers: Any) -> None:
|
||||
@pytest.mark.requires("sentence_transformers")
|
||||
def test_sentence_transformers_multiple_tokens() -> None:
|
||||
splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
|
||||
text = "Lorem "
|
||||
|
||||
|
@@ -4,10 +4,9 @@ from collections.abc import Sequence
|
||||
from importlib import util
|
||||
|
||||
import pytest
|
||||
from pytest import Config, Function, Parser
|
||||
|
||||
|
||||
def pytest_addoption(parser: Parser) -> None:
|
||||
def pytest_addoption(parser: pytest.Parser) -> None:
|
||||
"""Add custom command line options to pytest."""
|
||||
parser.addoption(
|
||||
"--only-extended",
|
||||
@@ -21,7 +20,9 @@ def pytest_addoption(parser: Parser) -> None:
|
||||
)
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) -> None:
|
||||
def pytest_collection_modifyitems(
|
||||
config: pytest.Config, items: Sequence[pytest.Function]
|
||||
) -> None:
|
||||
"""Add implementations for handling custom markers.
|
||||
|
||||
At the moment, this adds support for a custom `requires` marker.
|
||||
@@ -64,7 +65,7 @@ def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) ->
|
||||
if pkg not in required_pkgs_info:
|
||||
try:
|
||||
installed = util.find_spec(pkg) is not None
|
||||
except Exception:
|
||||
except (ImportError, ValueError):
|
||||
installed = False
|
||||
required_pkgs_info[pkg] = installed
|
||||
|
||||
|
@@ -1,11 +1,14 @@
|
||||
"""Test text splitting functionality."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
from typing import Any, Callable
|
||||
|
||||
import pytest
|
||||
from bs4 import Tag
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_text_splitters import (
|
||||
@@ -103,7 +106,9 @@ def test_character_text_splitter_longer_words() -> None:
|
||||
def test_character_text_splitter_keep_separator_regex(
|
||||
*, separator: str, is_separator_regex: bool
|
||||
) -> None:
|
||||
"""Test splitting by characters while keeping the separator
|
||||
"""Test CharacterTextSplitter keep separator regex.
|
||||
|
||||
Test splitting by characters while keeping the separator
|
||||
that is a regex special character.
|
||||
"""
|
||||
text = "foo.bar.baz.123"
|
||||
@@ -125,7 +130,9 @@ def test_character_text_splitter_keep_separator_regex(
|
||||
def test_character_text_splitter_keep_separator_regex_start(
|
||||
*, separator: str, is_separator_regex: bool
|
||||
) -> None:
|
||||
"""Test splitting by characters while keeping the separator
|
||||
"""Test CharacterTextSplitter keep separator regex and put at start.
|
||||
|
||||
Test splitting by characters while keeping the separator
|
||||
that is a regex special character and placing it at the start of each chunk.
|
||||
"""
|
||||
text = "foo.bar.baz.123"
|
||||
@@ -147,7 +154,9 @@ def test_character_text_splitter_keep_separator_regex_start(
|
||||
def test_character_text_splitter_keep_separator_regex_end(
|
||||
*, separator: str, is_separator_regex: bool
|
||||
) -> None:
|
||||
"""Test splitting by characters while keeping the separator
|
||||
"""Test CharacterTextSplitter keep separator regex and put at end.
|
||||
|
||||
Test splitting by characters while keeping the separator
|
||||
that is a regex special character and placing it at the end of each chunk.
|
||||
"""
|
||||
text = "foo.bar.baz.123"
|
||||
@@ -169,8 +178,11 @@ def test_character_text_splitter_keep_separator_regex_end(
|
||||
def test_character_text_splitter_discard_separator_regex(
|
||||
*, separator: str, is_separator_regex: bool
|
||||
) -> None:
|
||||
"""Test splitting by characters discarding the separator
|
||||
that is a regex special character."""
|
||||
"""Test CharacterTextSplitter discard separator regex.
|
||||
|
||||
Test splitting by characters discarding the separator
|
||||
that is a regex special character.
|
||||
"""
|
||||
text = "foo.bar.baz.123"
|
||||
splitter = CharacterTextSplitter(
|
||||
separator=separator,
|
||||
@@ -210,12 +222,17 @@ def test_recursive_character_text_splitter_keep_separators() -> None:
|
||||
|
||||
def test_character_text_splitting_args() -> None:
|
||||
"""Test invalid arguments."""
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=re.escape(
|
||||
"Got a larger chunk overlap (4) than chunk size (2), should be smaller."
|
||||
),
|
||||
):
|
||||
CharacterTextSplitter(chunk_size=2, chunk_overlap=4)
|
||||
for invalid_size in (0, -1):
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(ValueError, match="chunk_size must be > 0, got"):
|
||||
CharacterTextSplitter(chunk_size=invalid_size)
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.raises(ValueError, match="chunk_overlap must be >= 0, got -1"):
|
||||
CharacterTextSplitter(chunk_size=2, chunk_overlap=-1)
|
||||
|
||||
|
||||
@@ -1164,7 +1181,6 @@ def test_html_code_splitter() -> None:
|
||||
|
||||
def test_md_header_text_splitter_1() -> None:
|
||||
"""Test markdown splitter by header: Case 1."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bar\n\n"
|
||||
@@ -1235,7 +1251,6 @@ def test_md_header_text_splitter_2() -> None:
|
||||
|
||||
def test_md_header_text_splitter_3() -> None:
|
||||
"""Test markdown splitter by header: Case 3."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bar\n\n"
|
||||
@@ -1290,7 +1305,6 @@ def test_md_header_text_splitter_3() -> None:
|
||||
|
||||
def test_md_header_text_splitter_preserve_headers_1() -> None:
|
||||
"""Test markdown splitter by header: Preserve Headers."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bat\n\n"
|
||||
@@ -1324,7 +1338,6 @@ def test_md_header_text_splitter_preserve_headers_1() -> None:
|
||||
|
||||
def test_md_header_text_splitter_preserve_headers_2() -> None:
|
||||
"""Test markdown splitter by header: Preserve Headers."""
|
||||
|
||||
markdown_document = (
|
||||
"# Foo\n\n"
|
||||
" ## Bar\n\n"
|
||||
@@ -1372,7 +1385,6 @@ def test_md_header_text_splitter_preserve_headers_2() -> None:
|
||||
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
|
||||
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
|
||||
"""Test markdown splitter by header: Fenced code block."""
|
||||
|
||||
markdown_document = (
|
||||
f"# This is a Header\n\n{fence}\nfoo()\n# Not a header\nbar()\n{fence}"
|
||||
)
|
||||
@@ -1402,7 +1414,6 @@ def test_md_header_text_splitter_fenced_code_block_interleaved(
|
||||
fence: str, other_fence: str
|
||||
) -> None:
|
||||
"""Test markdown splitter by header: Interleaved fenced code block."""
|
||||
|
||||
markdown_document = (
|
||||
"# This is a Header\n\n"
|
||||
f"{fence}\n"
|
||||
@@ -1438,7 +1449,6 @@ def test_md_header_text_splitter_fenced_code_block_interleaved(
|
||||
@pytest.mark.parametrize("characters", ["\ufeff"])
|
||||
def test_md_header_text_splitter_with_invisible_characters(characters: str) -> None:
|
||||
"""Test markdown splitter by header: Fenced code block."""
|
||||
|
||||
markdown_document = f"{characters}# Foo\n\nfoo()\n{characters}## Bar\n\nbar()"
|
||||
|
||||
headers_to_split_on = [
|
||||
@@ -1609,7 +1619,6 @@ EXPERIMENTAL_MARKDOWN_DOCUMENT = (
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter() -> None:
|
||||
"""Test experimental markdown syntax splitter."""
|
||||
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()
|
||||
output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
|
||||
|
||||
@@ -1663,7 +1672,6 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_header_configuration() -> None:
|
||||
"""Test experimental markdown syntax splitter."""
|
||||
|
||||
headers_to_split_on = [("#", "Encabezamiento 1")]
|
||||
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(
|
||||
@@ -1709,7 +1717,6 @@ def test_experimental_markdown_syntax_text_splitter_header_configuration() -> No
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
|
||||
"""Test experimental markdown syntax splitter."""
|
||||
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
|
||||
output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
|
||||
|
||||
@@ -1768,7 +1775,6 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
|
||||
"""Test experimental markdown syntax splitter."""
|
||||
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
|
||||
output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
|
||||
|
||||
@@ -1876,8 +1882,11 @@ EXPERIMENTAL_MARKDOWN_DOCUMENTS = [
|
||||
|
||||
|
||||
def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None:
|
||||
"""Test experimental markdown syntax splitter split
|
||||
on default called consecutively on two files."""
|
||||
"""Test ExperimentalMarkdownSyntaxTextSplitter on multiple files.
|
||||
|
||||
Test experimental markdown syntax splitter split on default called consecutively
|
||||
on two files.
|
||||
"""
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()
|
||||
output = []
|
||||
for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
|
||||
@@ -1958,8 +1967,11 @@ def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None:
|
||||
def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files() -> (
|
||||
None
|
||||
):
|
||||
"""Test experimental markdown syntax splitter split
|
||||
on each line called consecutively on two files."""
|
||||
"""Test ExperimentalMarkdownSyntaxTextSplitter split lines on multiple files.
|
||||
|
||||
Test experimental markdown syntax splitter split on each line called consecutively
|
||||
on two files.
|
||||
"""
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
|
||||
output = []
|
||||
for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
|
||||
@@ -2083,9 +2095,10 @@ def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files()
|
||||
def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files() -> (
|
||||
None
|
||||
):
|
||||
"""Test experimental markdown splitter
|
||||
by header called consecutively on two files"""
|
||||
"""Test ExperimentalMarkdownSyntaxTextSplitter with header on multiple files.
|
||||
|
||||
Test experimental markdown splitter by header called consecutively on two files.
|
||||
"""
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
|
||||
output = []
|
||||
for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
|
||||
@@ -2171,9 +2184,11 @@ def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files()
|
||||
def test_experimental_markdown_syntax_text_splitter_header_config_on_multi_files() -> (
|
||||
None
|
||||
):
|
||||
"""Test experimental markdown splitter
|
||||
by header configuration called consecutively on two files"""
|
||||
"""Test ExperimentalMarkdownSyntaxTextSplitter header config on multiple files.
|
||||
|
||||
Test experimental markdown splitter by header configuration called consecutively
|
||||
on two files.
|
||||
"""
|
||||
headers_to_split_on = [("#", "Encabezamiento 1")]
|
||||
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(
|
||||
headers_to_split_on=headers_to_split_on
|
||||
@@ -2354,8 +2369,8 @@ def test_haskell_code_splitter() -> None:
|
||||
def html_header_splitter_splitter_factory() -> Callable[
|
||||
[list[tuple[str, str]]], HTMLHeaderTextSplitter
|
||||
]:
|
||||
"""
|
||||
Fixture to create an HTMLHeaderTextSplitter instance with given headers.
|
||||
"""Fixture to create an HTMLHeaderTextSplitter instance with given headers.
|
||||
|
||||
This factory allows dynamic creation of splitters with different headers.
|
||||
"""
|
||||
|
||||
@@ -2553,14 +2568,15 @@ def html_header_splitter_splitter_factory() -> Callable[
|
||||
)
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_header_text_splitter(
|
||||
html_header_splitter_splitter_factory: Any,
|
||||
html_header_splitter_splitter_factory: Callable[
|
||||
[list[tuple[str, str]]], HTMLHeaderTextSplitter
|
||||
],
|
||||
headers_to_split_on: list[tuple[str, str]],
|
||||
html_input: str,
|
||||
expected_documents: list[Document],
|
||||
test_case: str,
|
||||
) -> None:
|
||||
"""
|
||||
Test the HTML header text splitter.
|
||||
"""Test the HTML header text splitter.
|
||||
|
||||
Args:
|
||||
html_header_splitter_splitter_factory (Any): Factory function to create
|
||||
@@ -2574,10 +2590,7 @@ def test_html_header_text_splitter(
|
||||
AssertionError: If the number of documents or their content/metadata
|
||||
does not match the expected values.
|
||||
"""
|
||||
|
||||
splitter = html_header_splitter_splitter_factory(
|
||||
headers_to_split_on=headers_to_split_on
|
||||
)
|
||||
splitter = html_header_splitter_splitter_factory(headers_to_split_on)
|
||||
docs = splitter.split_text(html_input)
|
||||
|
||||
assert len(docs) == len(expected_documents), (
|
||||
@@ -2709,14 +2722,15 @@ def test_html_header_text_splitter(
|
||||
)
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_additional_html_header_text_splitter(
|
||||
html_header_splitter_splitter_factory: Any,
|
||||
html_header_splitter_splitter_factory: Callable[
|
||||
[list[tuple[str, str]]], HTMLHeaderTextSplitter
|
||||
],
|
||||
headers_to_split_on: list[tuple[str, str]],
|
||||
html_content: str,
|
||||
expected_output: list[Document],
|
||||
test_case: str,
|
||||
) -> None:
|
||||
"""
|
||||
Test the HTML header text splitter.
|
||||
"""Test the HTML header text splitter.
|
||||
|
||||
Args:
|
||||
html_header_splitter_splitter_factory (Any): Factory function to create
|
||||
@@ -2730,9 +2744,7 @@ def test_additional_html_header_text_splitter(
|
||||
AssertionError: If the number of documents or their content/metadata
|
||||
does not match the expected output.
|
||||
"""
|
||||
splitter = html_header_splitter_splitter_factory(
|
||||
headers_to_split_on=headers_to_split_on
|
||||
)
|
||||
splitter = html_header_splitter_splitter_factory(headers_to_split_on)
|
||||
docs = splitter.split_text(html_content)
|
||||
|
||||
assert len(docs) == len(expected_output), (
|
||||
@@ -2780,14 +2792,16 @@ def test_additional_html_header_text_splitter(
|
||||
)
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_no_headers_with_multiple_splitters(
|
||||
html_header_splitter_splitter_factory: Any,
|
||||
html_header_splitter_splitter_factory: Callable[
|
||||
[list[tuple[str, str]]], HTMLHeaderTextSplitter
|
||||
],
|
||||
headers_to_split_on: list[tuple[str, str]],
|
||||
html_content: str,
|
||||
expected_output: list[Document],
|
||||
test_case: str,
|
||||
) -> None:
|
||||
"""
|
||||
Test HTML content splitting without headers using multiple splitters.
|
||||
"""Test HTML content splitting without headers using multiple splitters.
|
||||
|
||||
Args:
|
||||
html_header_splitter_splitter_factory (Any): Factory to create the
|
||||
HTML header splitter.
|
||||
@@ -2796,13 +2810,12 @@ def test_html_no_headers_with_multiple_splitters(
|
||||
expected_output (List[Document]): Expected list of Document objects
|
||||
after splitting.
|
||||
test_case (str): Description of the test case.
|
||||
|
||||
Raises:
|
||||
AssertionError: If the number of documents or their content/metadata
|
||||
does not match the expected output.
|
||||
"""
|
||||
splitter = html_header_splitter_splitter_factory(
|
||||
headers_to_split_on=headers_to_split_on
|
||||
)
|
||||
splitter = html_header_splitter_splitter_factory(headers_to_split_on)
|
||||
docs = splitter.split_text(html_content)
|
||||
|
||||
assert len(docs) == len(expected_output), (
|
||||
@@ -3046,7 +3059,7 @@ def test_happy_path_splitting_with_duplicate_header_tag() -> None:
|
||||
|
||||
|
||||
def test_split_json() -> None:
|
||||
"""Test json text splitter"""
|
||||
"""Test json text splitter."""
|
||||
max_chunk = 800
|
||||
splitter = RecursiveJsonSplitter(max_chunk_size=max_chunk)
|
||||
|
||||
@@ -3068,7 +3081,7 @@ def test_split_json() -> None:
|
||||
|
||||
|
||||
def test_split_json_with_lists() -> None:
|
||||
"""Test json text splitter with list conversion"""
|
||||
"""Test json text splitter with list conversion."""
|
||||
max_chunk = 800
|
||||
splitter = RecursiveJsonSplitter(max_chunk_size=max_chunk)
|
||||
|
||||
@@ -3240,7 +3253,7 @@ def test_visualbasic6_code_splitter() -> None:
|
||||
]
|
||||
|
||||
|
||||
def custom_iframe_extractor(iframe_tag: Any) -> str:
|
||||
def custom_iframe_extractor(iframe_tag: Tag) -> str:
|
||||
iframe_src = iframe_tag.get("src", "")
|
||||
return f"[iframe:{iframe_src}]({iframe_src})"
|
||||
|
||||
@@ -3324,8 +3337,11 @@ def test_html_splitter_with_nested_elements() -> None:
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_splitter_with_preserved_elements() -> None:
|
||||
"""Test HTML splitting with preserved elements like <table>, <ul> with low chunk
|
||||
size."""
|
||||
"""Test HTML splitter with preserved elements.
|
||||
|
||||
Test HTML splitting with preserved elements like <table>, <ul> with low chunk
|
||||
size.
|
||||
"""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<table>
|
||||
@@ -3563,8 +3579,11 @@ def test_html_splitter_with_no_headers() -> None:
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_splitter_with_media_preservation() -> None:
|
||||
"""Test HTML splitting with media elements preserved and converted to Markdown-like
|
||||
links."""
|
||||
"""Test HTML splitter with media preservation.
|
||||
|
||||
Test HTML splitting with media elements preserved and converted to Markdown-like
|
||||
links.
|
||||
"""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<p>This is an image:</p>
|
||||
@@ -3600,7 +3619,7 @@ def test_html_splitter_with_media_preservation() -> None:
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_splitter_keep_separator_true() -> None:
|
||||
"""Test HTML splitting with keep_separator=True"""
|
||||
"""Test HTML splitting with keep_separator=True."""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<p>This is some text. This is some other text.</p>
|
||||
@@ -3629,7 +3648,7 @@ def test_html_splitter_keep_separator_true() -> None:
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_splitter_keep_separator_false() -> None:
|
||||
"""Test HTML splitting with keep_separator=False"""
|
||||
"""Test HTML splitting with keep_separator=False."""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<p>This is some text. This is some other text.</p>
|
||||
@@ -3658,7 +3677,7 @@ def test_html_splitter_keep_separator_false() -> None:
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_splitter_keep_separator_start() -> None:
|
||||
"""Test HTML splitting with keep_separator="start" """
|
||||
"""Test HTML splitting with keep_separator="start"."""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<p>This is some text. This is some other text.</p>
|
||||
@@ -3687,7 +3706,7 @@ def test_html_splitter_keep_separator_start() -> None:
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_splitter_keep_separator_end() -> None:
|
||||
"""Test HTML splitting with keep_separator="end" """
|
||||
"""Test HTML splitting with keep_separator="end"."""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<p>This is some text. This is some other text.</p>
|
||||
@@ -3716,7 +3735,7 @@ def test_html_splitter_keep_separator_end() -> None:
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_html_splitter_keep_separator_default() -> None:
|
||||
"""Test HTML splitting with keep_separator not set"""
|
||||
"""Test HTML splitting with keep_separator not set."""
|
||||
html_content = """
|
||||
<h1>Section 1</h1>
|
||||
<p>This is some text. This is some other text.</p>
|
||||
|
Reference in New Issue
Block a user