text-splitters: Inconsistent results with NLTKTextSplitter's add_start_index=True (#27782)
This PR closes #27781

# Problem

The current implementation of `NLTKTextSplitter` uses `sent_tokenize`. However, `sent_tokenize` drops the characters between two tokenized sentences, so the chunks it produces cannot be reliably mapped back to their positions in the original text. This causes errors when `add_start_index=True` is used, as described in issue #27781. In particular, two inputs that differ only in the whitespace between sentences tokenize to the same output:

```python
from nltk.tokenize import sent_tokenize

output1 = sent_tokenize("Innovation drives our success.  Collaboration fosters creative solutions. Efficiency enhances data management.", language="english")
print(output1)
output2 = sent_tokenize("Innovation drives our success. Collaboration fosters creative solutions. Efficiency enhances data management.", language="english")
print(output2)
>>> ['Innovation drives our success.', 'Collaboration fosters creative solutions.', 'Efficiency enhances data management.']
>>> ['Innovation drives our success.', 'Collaboration fosters creative solutions.', 'Efficiency enhances data management.']
```

# Solution

With the new `use_span_tokenize` parameter, we can still use NLTK to create sentences (via `span_tokenize`, which returns character offsets into the original text) while re-attaching the extra characters between sentences, so the chunks can still be mapped back to the original text (see the sketch below).

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Erick Friis <erickfriis@gmail.com>
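For illustration only (this is not code from the PR), a minimal sketch of why `span_tokenize` solves the mapping problem, assuming the Punkt model has been fetched via `nltk.download("punkt")`:

```python
import nltk

# Load the English Punkt sentence tokenizer (requires nltk.download("punkt")).
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

text = "Innovation drives our success.  Collaboration fosters creative solutions."

# Unlike sent_tokenize, span_tokenize yields (start, end) character offsets
# into the original text, so the whitespace between sentences is never lost.
for start, end in tokenizer.span_tokenize(text):
    print(start, end, repr(text[start:end]))
# 0 30 'Innovation drives our success.'
# 32 74 'Collaboration fosters creative solutions.'
```

Because the offsets index into the original string, a caller can recover exactly what lies between consecutive sentences, which is what `add_start_index=True` needs.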
```diff
@@ -1,5 +1,7 @@
 """Test text splitters that require an integration."""
 
+from typing import Any
+
 import pytest
 
 from langchain_text_splitters import (
@@ -11,6 +13,15 @@ from langchain_text_splitters.sentence_transformers import (
 )
 
 
+@pytest.fixture()
+def sentence_transformers() -> Any:
+    try:
+        import sentence_transformers
+    except ImportError:
+        pytest.skip("SentenceTransformers not installed.")
+    return sentence_transformers
+
+
 def test_huggingface_type_check() -> None:
     """Test that type checks are done properly on input."""
     with pytest.raises(ValueError):
@@ -52,7 +63,7 @@ def test_token_text_splitter_from_tiktoken() -> None:
     assert expected_tokenizer == actual_tokenizer
 
 
-def test_sentence_transformers_count_tokens() -> None:
+def test_sentence_transformers_count_tokens(sentence_transformers: Any) -> None:
     splitter = SentenceTransformersTokenTextSplitter(
         model_name="sentence-transformers/paraphrase-albert-small-v2"
     )
@@ -67,7 +78,7 @@ def test_sentence_transformers_count_tokens() -> None:
     assert expected_token_count == token_count
 
 
-def test_sentence_transformers_split_text() -> None:
+def test_sentence_transformers_split_text(sentence_transformers: Any) -> None:
     splitter = SentenceTransformersTokenTextSplitter(
         model_name="sentence-transformers/paraphrase-albert-small-v2"
     )
@@ -77,7 +88,7 @@ def test_sentence_transformers_split_text() -> None:
     assert expected_text_chunks == text_chunks
 
 
-def test_sentence_transformers_multiple_tokens() -> None:
+def test_sentence_transformers_multiple_tokens(sentence_transformers: Any) -> None:
     splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
     text = "Lorem "
```
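A hedged usage sketch of the new flag (it assumes, per this PR, that `NLTKTextSplitter` accepts `use_span_tokenize=True` and then expects an empty `separator`; the `chunk_size`/`chunk_overlap` values are illustrative only):

```python
from langchain_text_splitters import NLTKTextSplitter

# With use_span_tokenize=True the splitter keeps character offsets, so
# add_start_index=True can record where each chunk begins in the source text.
splitter = NLTKTextSplitter(
    separator="",            # assumed requirement when use_span_tokenize=True
    use_span_tokenize=True,
    add_start_index=True,
    chunk_size=60,           # illustrative value, small enough to force splits
    chunk_overlap=0,
)

text = (
    "Innovation drives our success.  "
    "Collaboration fosters creative solutions. "
    "Efficiency enhances data management."
)
for doc in splitter.create_documents([text]):
    print(doc.metadata["start_index"], repr(doc.page_content))
```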