chore(text-splitters): select ALL rules with exclusions (#32325)

Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
Christophe Bornet
2025-09-08 16:46:09 +02:00
committed by GitHub
parent 20401df25d
commit 0c3e8ccd0e
19 changed files with 444 additions and 366 deletions

View File

@@ -1,6 +1,6 @@
"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""
from typing import Any
import re
import nltk
import pytest
@@ -15,11 +15,8 @@ def setup_module() -> None:
@pytest.fixture
def spacy() -> Any:
try:
import spacy
except ImportError:
pytest.skip("Spacy not installed.")
def spacy() -> None:
spacy = pytest.importorskip("spacy")
# Check if en_core_web_sm model is available
try:
@@ -32,18 +29,27 @@ def spacy() -> Any:
"en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
)
return spacy
def test_nltk_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match=re.escape(
"Got a larger chunk overlap (4) than chunk size (2), should be smaller."
),
):
NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
def test_spacy_text_splitting_args(spacy: Any) -> None:
@pytest.mark.usefixtures("spacy")
def test_spacy_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match=re.escape(
"Got a larger chunk overlap (4) than chunk size (2), should be smaller."
),
):
SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
@@ -57,8 +63,9 @@ def test_nltk_text_splitter() -> None:
assert output == expected_output
@pytest.mark.usefixtures("spacy")
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter(pipeline: str, spacy: Any) -> None:
def test_spacy_text_splitter(pipeline: str) -> None:
"""Test splitting by sentence using Spacy."""
text = "This is sentence one. And this is sentence two."
separator = "|||"
@@ -68,8 +75,9 @@ def test_spacy_text_splitter(pipeline: str, spacy: Any) -> None:
assert output == expected_output
@pytest.mark.usefixtures("spacy")
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter_strip_whitespace(pipeline: str, spacy: Any) -> None:
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
"""Test splitting by sentence using Spacy."""
text = "This is sentence one. And this is sentence two."
separator = "|||"
@@ -83,7 +91,9 @@ def test_spacy_text_splitter_strip_whitespace(pipeline: str, spacy: Any) -> None
def test_nltk_text_splitter_args() -> None:
"""Test invalid arguments for NLTKTextSplitter."""
with pytest.raises(ValueError):
with pytest.raises(
ValueError, match="When use_span_tokenize is True, separator should be ''"
):
NLTKTextSplitter(
chunk_size=80,
chunk_overlap=0,

View File

@@ -1,8 +1,7 @@
"""Test text splitters that require an integration."""
from typing import Any
import pytest
from transformers import GPT2TokenizerFast
from langchain_text_splitters import (
TokenTextSplitter,
@@ -13,25 +12,17 @@ from langchain_text_splitters.sentence_transformers import (
)
@pytest.fixture
def sentence_transformers() -> Any:
try:
import sentence_transformers
except ImportError:
pytest.skip("SentenceTransformers not installed.")
return sentence_transformers
def test_huggingface_type_check() -> None:
"""Test that type checks are done properly on input."""
with pytest.raises(ValueError):
CharacterTextSplitter.from_huggingface_tokenizer("foo")
with pytest.raises(
ValueError,
match="Tokenizer received was not an instance of PreTrainedTokenizerBase",
):
CharacterTextSplitter.from_huggingface_tokenizer("foo") # type: ignore[arg-type]
def test_huggingface_tokenizer() -> None:
"""Test text splitter that uses a HuggingFace tokenizer."""
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
@@ -63,7 +54,8 @@ def test_token_text_splitter_from_tiktoken() -> None:
assert expected_tokenizer == actual_tokenizer
def test_sentence_transformers_count_tokens(sentence_transformers: Any) -> None:
@pytest.mark.requires("sentence_transformers")
def test_sentence_transformers_count_tokens() -> None:
splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/paraphrase-albert-small-v2"
)
@@ -78,7 +70,8 @@ def test_sentence_transformers_count_tokens(sentence_transformers: Any) -> None:
assert expected_token_count == token_count
def test_sentence_transformers_split_text(sentence_transformers: Any) -> None:
@pytest.mark.requires("sentence_transformers")
def test_sentence_transformers_split_text() -> None:
splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/paraphrase-albert-small-v2"
)
@@ -88,7 +81,8 @@ def test_sentence_transformers_split_text(sentence_transformers: Any) -> None:
assert expected_text_chunks == text_chunks
def test_sentence_transformers_multiple_tokens(sentence_transformers: Any) -> None:
@pytest.mark.requires("sentence_transformers")
def test_sentence_transformers_multiple_tokens() -> None:
splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
text = "Lorem "

View File

@@ -4,10 +4,9 @@ from collections.abc import Sequence
from importlib import util
import pytest
from pytest import Config, Function, Parser
def pytest_addoption(parser: Parser) -> None:
def pytest_addoption(parser: pytest.Parser) -> None:
"""Add custom command line options to pytest."""
parser.addoption(
"--only-extended",
@@ -21,7 +20,9 @@ def pytest_addoption(parser: Parser) -> None:
)
def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) -> None:
def pytest_collection_modifyitems(
config: pytest.Config, items: Sequence[pytest.Function]
) -> None:
"""Add implementations for handling custom markers.
At the moment, this adds support for a custom `requires` marker.
@@ -64,7 +65,7 @@ def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) ->
if pkg not in required_pkgs_info:
try:
installed = util.find_spec(pkg) is not None
except Exception:
except (ImportError, ValueError):
installed = False
required_pkgs_info[pkg] = installed

View File

@@ -1,11 +1,14 @@
"""Test text splitting functionality."""
from __future__ import annotations
import random
import re
import string
from typing import Any, Callable
import pytest
from bs4 import Tag
from langchain_core.documents import Document
from langchain_text_splitters import (
@@ -103,7 +106,9 @@ def test_character_text_splitter_longer_words() -> None:
def test_character_text_splitter_keep_separator_regex(
*, separator: str, is_separator_regex: bool
) -> None:
"""Test splitting by characters while keeping the separator
"""Test CharacterTextSplitter keep separator regex.
Test splitting by characters while keeping the separator
that is a regex special character.
"""
text = "foo.bar.baz.123"
@@ -125,7 +130,9 @@ def test_character_text_splitter_keep_separator_regex(
def test_character_text_splitter_keep_separator_regex_start(
*, separator: str, is_separator_regex: bool
) -> None:
"""Test splitting by characters while keeping the separator
"""Test CharacterTextSplitter keep separator regex and put at start.
Test splitting by characters while keeping the separator
that is a regex special character and placing it at the start of each chunk.
"""
text = "foo.bar.baz.123"
@@ -147,7 +154,9 @@ def test_character_text_splitter_keep_separator_regex_start(
def test_character_text_splitter_keep_separator_regex_end(
*, separator: str, is_separator_regex: bool
) -> None:
"""Test splitting by characters while keeping the separator
"""Test CharacterTextSplitter keep separator regex and put at end.
Test splitting by characters while keeping the separator
that is a regex special character and placing it at the end of each chunk.
"""
text = "foo.bar.baz.123"
@@ -169,8 +178,11 @@ def test_character_text_splitter_keep_separator_regex_end(
def test_character_text_splitter_discard_separator_regex(
*, separator: str, is_separator_regex: bool
) -> None:
"""Test splitting by characters discarding the separator
that is a regex special character."""
"""Test CharacterTextSplitter discard separator regex.
Test splitting by characters discarding the separator
that is a regex special character.
"""
text = "foo.bar.baz.123"
splitter = CharacterTextSplitter(
separator=separator,
@@ -210,12 +222,17 @@ def test_recursive_character_text_splitter_keep_separators() -> None:
def test_character_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):
with pytest.raises(
ValueError,
match=re.escape(
"Got a larger chunk overlap (4) than chunk size (2), should be smaller."
),
):
CharacterTextSplitter(chunk_size=2, chunk_overlap=4)
for invalid_size in (0, -1):
with pytest.raises(ValueError):
with pytest.raises(ValueError, match="chunk_size must be > 0, got"):
CharacterTextSplitter(chunk_size=invalid_size)
with pytest.raises(ValueError):
with pytest.raises(ValueError, match="chunk_overlap must be >= 0, got -1"):
CharacterTextSplitter(chunk_size=2, chunk_overlap=-1)
@@ -1164,7 +1181,6 @@ def test_html_code_splitter() -> None:
def test_md_header_text_splitter_1() -> None:
"""Test markdown splitter by header: Case 1."""
markdown_document = (
"# Foo\n\n"
" ## Bar\n\n"
@@ -1235,7 +1251,6 @@ def test_md_header_text_splitter_2() -> None:
def test_md_header_text_splitter_3() -> None:
"""Test markdown splitter by header: Case 3."""
markdown_document = (
"# Foo\n\n"
" ## Bar\n\n"
@@ -1290,7 +1305,6 @@ def test_md_header_text_splitter_3() -> None:
def test_md_header_text_splitter_preserve_headers_1() -> None:
"""Test markdown splitter by header: Preserve Headers."""
markdown_document = (
"# Foo\n\n"
" ## Bat\n\n"
@@ -1324,7 +1338,6 @@ def test_md_header_text_splitter_preserve_headers_1() -> None:
def test_md_header_text_splitter_preserve_headers_2() -> None:
"""Test markdown splitter by header: Preserve Headers."""
markdown_document = (
"# Foo\n\n"
" ## Bar\n\n"
@@ -1372,7 +1385,6 @@ def test_md_header_text_splitter_preserve_headers_2() -> None:
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
"""Test markdown splitter by header: Fenced code block."""
markdown_document = (
f"# This is a Header\n\n{fence}\nfoo()\n# Not a header\nbar()\n{fence}"
)
@@ -1402,7 +1414,6 @@ def test_md_header_text_splitter_fenced_code_block_interleaved(
fence: str, other_fence: str
) -> None:
"""Test markdown splitter by header: Interleaved fenced code block."""
markdown_document = (
"# This is a Header\n\n"
f"{fence}\n"
@@ -1438,7 +1449,6 @@ def test_md_header_text_splitter_fenced_code_block_interleaved(
@pytest.mark.parametrize("characters", ["\ufeff"])
def test_md_header_text_splitter_with_invisible_characters(characters: str) -> None:
"""Test markdown splitter by header: Invisible characters."""
markdown_document = f"{characters}# Foo\n\nfoo()\n{characters}## Bar\n\nbar()"
headers_to_split_on = [
@@ -1609,7 +1619,6 @@ EXPERIMENTAL_MARKDOWN_DOCUMENT = (
def test_experimental_markdown_syntax_text_splitter() -> None:
"""Test experimental markdown syntax splitter."""
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()
output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
@@ -1663,7 +1672,6 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
def test_experimental_markdown_syntax_text_splitter_header_configuration() -> None:
"""Test experimental markdown syntax splitter."""
headers_to_split_on = [("#", "Encabezamiento 1")]
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(
@@ -1709,7 +1717,6 @@ def test_experimental_markdown_syntax_text_splitter_header_configuration() -> No
def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
"""Test experimental markdown syntax splitter."""
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
@@ -1768,7 +1775,6 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
"""Test experimental markdown syntax splitter."""
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
@@ -1876,8 +1882,11 @@ EXPERIMENTAL_MARKDOWN_DOCUMENTS = [
def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None:
"""Test experimental markdown syntax splitter split
on default called consecutively on two files."""
"""Test ExperimentalMarkdownSyntaxTextSplitter on multiple files.
Test experimental markdown syntax splitter split on default called consecutively
on two files.
"""
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()
output = []
for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
@@ -1958,8 +1967,11 @@ def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None:
def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files() -> (
None
):
"""Test experimental markdown syntax splitter split
on each line called consecutively on two files."""
"""Test ExperimentalMarkdownSyntaxTextSplitter split lines on multiple files.
Test experimental markdown syntax splitter split on each line called consecutively
on two files.
"""
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
output = []
for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
@@ -2083,9 +2095,10 @@ def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files()
def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files() -> (
None
):
"""Test experimental markdown splitter
by header called consecutively on two files"""
"""Test ExperimentalMarkdownSyntaxTextSplitter with header on multiple files.
Test experimental markdown splitter by header called consecutively on two files.
"""
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
output = []
for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
@@ -2171,9 +2184,11 @@ def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files()
def test_experimental_markdown_syntax_text_splitter_header_config_on_multi_files() -> (
None
):
"""Test experimental markdown splitter
by header configuration called consecutively on two files"""
"""Test ExperimentalMarkdownSyntaxTextSplitter header config on multiple files.
Test experimental markdown splitter by header configuration called consecutively
on two files.
"""
headers_to_split_on = [("#", "Encabezamiento 1")]
markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(
headers_to_split_on=headers_to_split_on
@@ -2354,8 +2369,8 @@ def test_haskell_code_splitter() -> None:
def html_header_splitter_splitter_factory() -> Callable[
[list[tuple[str, str]]], HTMLHeaderTextSplitter
]:
"""
Fixture to create an HTMLHeaderTextSplitter instance with given headers.
"""Fixture to create an HTMLHeaderTextSplitter instance with given headers.
This factory allows dynamic creation of splitters with different headers.
"""
@@ -2553,14 +2568,15 @@ def html_header_splitter_splitter_factory() -> Callable[
)
@pytest.mark.requires("bs4")
def test_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
html_header_splitter_splitter_factory: Callable[
[list[tuple[str, str]]], HTMLHeaderTextSplitter
],
headers_to_split_on: list[tuple[str, str]],
html_input: str,
expected_documents: list[Document],
test_case: str,
) -> None:
"""
Test the HTML header text splitter.
"""Test the HTML header text splitter.
Args:
html_header_splitter_splitter_factory (Callable): Factory function to create
@@ -2574,10 +2590,7 @@ def test_html_header_text_splitter(
AssertionError: If the number of documents or their content/metadata
does not match the expected values.
"""
splitter = html_header_splitter_splitter_factory(
headers_to_split_on=headers_to_split_on
)
splitter = html_header_splitter_splitter_factory(headers_to_split_on)
docs = splitter.split_text(html_input)
assert len(docs) == len(expected_documents), (
@@ -2709,14 +2722,15 @@ def test_html_header_text_splitter(
)
@pytest.mark.requires("bs4")
def test_additional_html_header_text_splitter(
html_header_splitter_splitter_factory: Any,
html_header_splitter_splitter_factory: Callable[
[list[tuple[str, str]]], HTMLHeaderTextSplitter
],
headers_to_split_on: list[tuple[str, str]],
html_content: str,
expected_output: list[Document],
test_case: str,
) -> None:
"""
Test the HTML header text splitter.
"""Test the HTML header text splitter.
Args:
html_header_splitter_splitter_factory (Callable): Factory function to create
@@ -2730,9 +2744,7 @@ def test_additional_html_header_text_splitter(
AssertionError: If the number of documents or their content/metadata
does not match the expected output.
"""
splitter = html_header_splitter_splitter_factory(
headers_to_split_on=headers_to_split_on
)
splitter = html_header_splitter_splitter_factory(headers_to_split_on)
docs = splitter.split_text(html_content)
assert len(docs) == len(expected_output), (
@@ -2780,14 +2792,16 @@ def test_additional_html_header_text_splitter(
)
@pytest.mark.requires("bs4")
def test_html_no_headers_with_multiple_splitters(
html_header_splitter_splitter_factory: Any,
html_header_splitter_splitter_factory: Callable[
[list[tuple[str, str]]], HTMLHeaderTextSplitter
],
headers_to_split_on: list[tuple[str, str]],
html_content: str,
expected_output: list[Document],
test_case: str,
) -> None:
"""
Test HTML content splitting without headers using multiple splitters.
"""Test HTML content splitting without headers using multiple splitters.
Args:
html_header_splitter_splitter_factory (Callable): Factory to create the
HTML header splitter.
@@ -2796,13 +2810,12 @@ def test_html_no_headers_with_multiple_splitters(
expected_output (List[Document]): Expected list of Document objects
after splitting.
test_case (str): Description of the test case.
Raises:
AssertionError: If the number of documents or their content/metadata
does not match the expected output.
"""
splitter = html_header_splitter_splitter_factory(
headers_to_split_on=headers_to_split_on
)
splitter = html_header_splitter_splitter_factory(headers_to_split_on)
docs = splitter.split_text(html_content)
assert len(docs) == len(expected_output), (
@@ -3046,7 +3059,7 @@ def test_happy_path_splitting_with_duplicate_header_tag() -> None:
def test_split_json() -> None:
"""Test json text splitter"""
"""Test json text splitter."""
max_chunk = 800
splitter = RecursiveJsonSplitter(max_chunk_size=max_chunk)
@@ -3068,7 +3081,7 @@ def test_split_json() -> None:
def test_split_json_with_lists() -> None:
"""Test json text splitter with list conversion"""
"""Test json text splitter with list conversion."""
max_chunk = 800
splitter = RecursiveJsonSplitter(max_chunk_size=max_chunk)
@@ -3240,7 +3253,7 @@ def test_visualbasic6_code_splitter() -> None:
]
def custom_iframe_extractor(iframe_tag: Any) -> str:
def custom_iframe_extractor(iframe_tag: Tag) -> str:
iframe_src = iframe_tag.get("src", "")
return f"[iframe:{iframe_src}]({iframe_src})"
@@ -3324,8 +3337,11 @@ def test_html_splitter_with_nested_elements() -> None:
@pytest.mark.requires("bs4")
def test_html_splitter_with_preserved_elements() -> None:
"""Test HTML splitting with preserved elements like <table>, <ul> with low chunk
size."""
"""Test HTML splitter with preserved elements.
Test HTML splitting with preserved elements like <table>, <ul> with low chunk
size.
"""
html_content = """
<h1>Section 1</h1>
<table>
@@ -3563,8 +3579,11 @@ def test_html_splitter_with_no_headers() -> None:
@pytest.mark.requires("bs4")
def test_html_splitter_with_media_preservation() -> None:
"""Test HTML splitting with media elements preserved and converted to Markdown-like
links."""
"""Test HTML splitter with media preservation.
Test HTML splitting with media elements preserved and converted to Markdown-like
links.
"""
html_content = """
<h1>Section 1</h1>
<p>This is an image:</p>
@@ -3600,7 +3619,7 @@ def test_html_splitter_with_media_preservation() -> None:
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_true() -> None:
"""Test HTML splitting with keep_separator=True"""
"""Test HTML splitting with keep_separator=True."""
html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
@@ -3629,7 +3648,7 @@ def test_html_splitter_keep_separator_true() -> None:
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_false() -> None:
"""Test HTML splitting with keep_separator=False"""
"""Test HTML splitting with keep_separator=False."""
html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
@@ -3658,7 +3677,7 @@ def test_html_splitter_keep_separator_false() -> None:
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_start() -> None:
"""Test HTML splitting with keep_separator="start" """
"""Test HTML splitting with keep_separator="start"."""
html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
@@ -3687,7 +3706,7 @@ def test_html_splitter_keep_separator_start() -> None:
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_end() -> None:
"""Test HTML splitting with keep_separator="end" """
"""Test HTML splitting with keep_separator="end"."""
html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>
@@ -3716,7 +3735,7 @@ def test_html_splitter_keep_separator_end() -> None:
@pytest.mark.requires("bs4")
def test_html_splitter_keep_separator_default() -> None:
"""Test HTML splitting with keep_separator not set"""
"""Test HTML splitting with keep_separator not set."""
html_content = """
<h1>Section 1</h1>
<p>This is some text. This is some other text.</p>