From 5b3e29f8092570a457d7b817db5e3bed95f72341 Mon Sep 17 00:00:00 2001 From: Michael Li Date: Wed, 9 Jul 2025 02:22:33 +1000 Subject: [PATCH] text splitters: add chunk_size and chunk_overlap validations (#31916) Thank you for contributing to LangChain! - [x] **PR title**: "package: description" - Where "package" is whichever of langchain, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes. - Example: "core: add foobar LLM" - [x] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change - **Issue:** the issue # it fixes, if applicable - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. If no one reviews your PR within a few days, please @-mention one of baskaryan, eyurtsev, ccurme, vbarda, hwchase17. **Description:** `TextSplitter` now raises `ValueError` when `chunk_size` is not greater than 0 or when `chunk_overlap` is negative, in addition to the existing check that `chunk_overlap` does not exceed `chunk_size`; unit tests cover `chunk_size` values of 0 and -1 and a `chunk_overlap` of -1. 
--- libs/text-splitters/langchain_text_splitters/base.py | 6 ++++++ libs/text-splitters/tests/unit_tests/test_text_splitters.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index f9e4a92222a..8861f4c6585 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -47,6 +47,12 @@ class TextSplitter(BaseDocumentTransformer, ABC): strip_whitespace: If `True`, strips whitespace from the start and end of every document """ + if chunk_size <= 0: + msg = f"chunk_size must be > 0, got {chunk_size}" + raise ValueError(msg) + if chunk_overlap < 0: + msg = f"chunk_overlap must be >= 0, got {chunk_overlap}" + raise ValueError(msg) if chunk_overlap > chunk_size: msg = ( f"Got a larger chunk overlap ({chunk_overlap}) than chunk size " diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 6f03a2d59f0..1d3304d1b84 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -212,6 +212,11 @@ def test_character_text_splitting_args() -> None: """Test invalid arguments.""" with pytest.raises(ValueError): CharacterTextSplitter(chunk_size=2, chunk_overlap=4) + for invalid_size in (0, -1): + with pytest.raises(ValueError): + CharacterTextSplitter(chunk_size=invalid_size) + with pytest.raises(ValueError): + CharacterTextSplitter(chunk_size=2, chunk_overlap=-1) def test_merge_splits() -> None: