mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-11 15:35:09 +00:00
text splitters: add chunk_size and chunk_overlap validations (#31916)
Thank you for contributing to LangChain!

- [x] **PR title**: "package: description"
  - Where "package" is whichever of langchain, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes.
  - Example: "core: add foobar LLM"
- [x] **PR message**: ***Delete this entire checklist*** and replace with
  - **Description:** a description of the change
  - **Issue:** the issue # it fixes, if applicable
  - **Dependencies:** any dependencies required for this change
  - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out!
- [x] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use. It lives in the `docs/docs/integrations` directory.
- [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:

- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.

If no one reviews your PR within a few days, please @-mention one of baskaryan, eyurtsev, ccurme, vbarda, hwchase17.
This commit is contained in:
parent
0a17a62548
commit
5b3e29f809
@ -47,6 +47,12 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
||||
strip_whitespace: If `True`, strips whitespace from the start and end of
|
||||
every document
|
||||
"""
|
||||
if chunk_size <= 0:
|
||||
msg = f"chunk_size must be > 0, got {chunk_size}"
|
||||
raise ValueError(msg)
|
||||
if chunk_overlap < 0:
|
||||
msg = f"chunk_overlap must be >= 0, got {chunk_overlap}"
|
||||
raise ValueError(msg)
|
||||
if chunk_overlap > chunk_size:
|
||||
msg = (
|
||||
f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
|
||||
|
@ -212,6 +212,11 @@ def test_character_text_splitting_args() -> None:
|
||||
"""Test invalid arguments."""
|
||||
with pytest.raises(ValueError):
|
||||
CharacterTextSplitter(chunk_size=2, chunk_overlap=4)
|
||||
for invalid_size in (0, -1):
|
||||
with pytest.raises(ValueError):
|
||||
CharacterTextSplitter(chunk_size=invalid_size)
|
||||
with pytest.raises(ValueError):
|
||||
CharacterTextSplitter(chunk_size=2, chunk_overlap=-1)
|
||||
|
||||
|
||||
def test_merge_splits() -> None:
|
||||
|
Loading…
Reference in New Issue
Block a user