text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346)

Bagatur
2024-02-29 18:33:21 -08:00
committed by GitHub
parent 7891934173
commit 5efb5c099f
226 changed files with 6626 additions and 1980 deletions

View File

@@ -0,0 +1,7 @@
import pytest


@pytest.mark.compile
def test_placeholder() -> None:
    """Used for compiling integration tests without running any real tests."""
    pass
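
The placeholder gives the `compile` marker something to collect, so CI can verify that the package and its integration-test imports resolve without running any real tests. A minimal sketch of how such a marker might be registered, assuming a standard pytest setup (this `conftest.py` hook is illustrative, not the repo's actual configuration):

# conftest.py (illustrative): register the "compile" marker so pytest
# does not emit an unknown-marker warning when collecting these tests.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "compile: compile-only tests that just verify imports"
    )

Running only these tests is then a matter of `pytest -m compile`.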

View File

@@ -0,0 +1,38 @@
"""Test text splitting functionality using NLTK and Spacy based sentence splitters."""
import pytest
from langchain_text_splitters.nltk import NLTKTextSplitter
from langchain_text_splitters.spacy import SpacyTextSplitter
def test_nltk_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):
NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
def test_spacy_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):
SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
def test_nltk_text_splitter() -> None:
"""Test splitting by sentence using NLTK."""
text = "This is sentence one. And this is sentence two."
separator = "|||"
splitter = NLTKTextSplitter(separator=separator)
output = splitter.split_text(text)
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
assert output == expected_output
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter(pipeline: str) -> None:
"""Test splitting by sentence using Spacy."""
text = "This is sentence one. And this is sentence two."
separator = "|||"
splitter = SpacyTextSplitter(separator=separator, pipeline=pipeline)
output = splitter.split_text(text)
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
assert output == expected_output
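
Both splitters reject a chunk_overlap larger than chunk_size at construction time, and both join whole sentences with the configured separator. A minimal usage sketch, assuming nltk is installed and its "punkt" sentence tokenizer data has been downloaded (the parameter values are illustrative):

from langchain_text_splitters.nltk import NLTKTextSplitter

# One-time setup for NLTK's sentence tokenizer:
#   import nltk; nltk.download("punkt")
splitter = NLTKTextSplitter(chunk_size=100, chunk_overlap=0)
chunks = splitter.split_text("This is sentence one. And this is sentence two.")
# Sentences that fit within chunk_size are merged into a single chunk,
# joined by the separator (default "\n\n").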

View File

@@ -0,0 +1,109 @@
"""Test text splitters that require an integration."""
import pytest
from langchain_text_splitters import (
TokenTextSplitter,
)
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_text_splitters.sentence_transformers import (
SentenceTransformersTokenTextSplitter,
)
def test_huggingface_type_check() -> None:
"""Test that type checks are done properly on input."""
with pytest.raises(ValueError):
CharacterTextSplitter.from_huggingface_tokenizer("foo")
def test_huggingface_tokenizer() -> None:
"""Test text splitter that uses a HuggingFace tokenizer."""
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
tokenizer, separator=" ", chunk_size=1, chunk_overlap=0
)
output = text_splitter.split_text("foo bar")
assert output == ["foo", "bar"]
def test_token_text_splitter() -> None:
"""Test no overlap."""
splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=0)
output = splitter.split_text("abcdef" * 5) # 10 token string
expected_output = ["abcdefabcdefabc", "defabcdefabcdef"]
assert output == expected_output
def test_token_text_splitter_overlap() -> None:
"""Test with overlap."""
splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=1)
output = splitter.split_text("abcdef" * 5) # 10 token string
expected_output = ["abcdefabcdefabc", "abcdefabcdefabc", "abcdef"]
assert output == expected_output
def test_token_text_splitter_from_tiktoken() -> None:
splitter = TokenTextSplitter.from_tiktoken_encoder(model_name="gpt-3.5-turbo")
expected_tokenizer = "cl100k_base"
actual_tokenizer = splitter._tokenizer.name
assert expected_tokenizer == actual_tokenizer
def test_sentence_transformers_count_tokens() -> None:
splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/paraphrase-albert-small-v2"
)
text = "Lorem ipsum"
token_count = splitter.count_tokens(text=text)
expected_start_stop_token_count = 2
expected_text_token_count = 5
expected_token_count = expected_start_stop_token_count + expected_text_token_count
assert expected_token_count == token_count
def test_sentence_transformers_split_text() -> None:
splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/paraphrase-albert-small-v2"
)
text = "lorem ipsum"
text_chunks = splitter.split_text(text=text)
expected_text_chunks = [text]
assert expected_text_chunks == text_chunks
def test_sentence_transformers_multiple_tokens() -> None:
splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
text = "Lorem "
text_token_count_including_start_and_stop_tokens = splitter.count_tokens(text=text)
count_start_and_end_tokens = 2
token_multiplier = (
count_start_and_end_tokens
+ (splitter.maximum_tokens_per_chunk - count_start_and_end_tokens)
// (
text_token_count_including_start_and_stop_tokens
- count_start_and_end_tokens
)
+ 1
)
# `text_to_split` does not fit in a single chunk
text_to_embed = text * token_multiplier
text_chunks = splitter.split_text(text=text_to_embed)
expected_number_of_chunks = 2
assert expected_number_of_chunks == len(text_chunks)
actual = splitter.count_tokens(text=text_chunks[1]) - count_start_and_end_tokens
expected = (
token_multiplier * (text_token_count_including_start_and_stop_tokens - 2)
- splitter.maximum_tokens_per_chunk
)
assert expected == actual
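
The token_multiplier arithmetic in the last test picks a repetition count of `text` that is guaranteed to overflow a single chunk, forcing the split into exactly two chunks with the second chunk holding the overflow. A worked sketch of the same arithmetic with assumed values (the concrete numbers are illustrative; the real ones depend on the model's tokenizer and maximum sequence length):

maximum_tokens_per_chunk = 512  # model-dependent cap, assumed here
tokens_with_specials = 4        # assumed count_tokens("Lorem "), incl. start/stop
specials = 2                    # one start token and one stop token

per_repetition = tokens_with_specials - specials  # text tokens added per "Lorem "
capacity = maximum_tokens_per_chunk - specials    # room left for text in one chunk

# Smallest repetition count that overflows a single chunk:
token_multiplier = specials + capacity // per_repetition + 1

# Overflow carried into the second chunk, mirroring the test's `expected`:
overflow = token_multiplier * per_repetition - maximum_tokens_per_chunk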

View File

@@ -0,0 +1,87 @@
"""Configuration for unit tests."""
from importlib import util
from typing import Dict, Sequence
import pytest
from pytest import Config, Function, Parser
def pytest_addoption(parser: Parser) -> None:
"""Add custom command line options to pytest."""
parser.addoption(
"--only-extended",
action="store_true",
help="Only run extended tests. Does not allow skipping any extended tests.",
)
parser.addoption(
"--only-core",
action="store_true",
help="Only run core tests. Never runs any extended tests.",
)
def pytest_collection_modifyitems(config: Config, items: Sequence[Function]) -> None:
"""Add implementations for handling custom markers.
At the moment, this adds support for a custom `requires` marker.
The `requires` marker is used to denote tests that require one or more packages
to be installed to run. If the package is not installed, the test is skipped.
The `requires` marker syntax is:
.. code-block:: python
@pytest.mark.requires("package1", "package2")
def test_something():
...
"""
# Mapping from the name of a package to whether it is installed or not.
# Used to avoid repeated calls to `util.find_spec`
required_pkgs_info: Dict[str, bool] = {}
only_extended = config.getoption("--only-extended") or False
only_core = config.getoption("--only-core") or False
if only_extended and only_core:
raise ValueError("Cannot specify both `--only-extended` and `--only-core`.")
for item in items:
requires_marker = item.get_closest_marker("requires")
if requires_marker is not None:
if only_core:
item.add_marker(pytest.mark.skip(reason="Skipping not a core test."))
continue
# Iterate through the list of required packages
required_pkgs = requires_marker.args
for pkg in required_pkgs:
# If we haven't yet checked whether the pkg is installed
# let's check it and store the result.
if pkg not in required_pkgs_info:
try:
installed = util.find_spec(pkg) is not None
except Exception:
installed = False
required_pkgs_info[pkg] = installed
if not required_pkgs_info[pkg]:
if only_extended:
pytest.fail(
f"Package `{pkg}` is not installed but is required for "
f"extended tests. Please install the given package and "
f"try again.",
)
else:
# If the package is not installed, we immediately break
# and mark the test as skipped.
item.add_marker(
pytest.mark.skip(reason=f"Requires pkg: `{pkg}`")
)
break
else:
if only_extended:
item.add_marker(
pytest.mark.skip(reason="Skipping not an extended test.")
)
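
With this hook in place, a test declares its optional dependencies through the marker, and the --only-core / --only-extended flags partition the suite between lightweight and dependency-heavy runs. A hypothetical example test (the package name is illustrative):

import pytest


@pytest.mark.requires("transformers")
def test_needs_transformers() -> None:
    # Skipped automatically when `transformers` is not importable;
    # under `pytest --only-extended` the missing package fails the run instead.
    from transformers import GPT2TokenizerFast

    assert GPT2TokenizerFast is not None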

File diff suppressed because it is too large