Mirror of https://github.com/hwchase17/langchain.git (synced 2025-09-16 23:13:31 +00:00)
refactor: extract token text splitter function (#5179)
# Token text splitter for sentence transformers

The current `TokenTextSplitter` only works with OpenAI models via the `tiktoken` package, which is not clear from the name `TokenTextSplitter`. This (first) PR adds a token-based text splitter for sentence-transformer models. In the future I think we should work towards injecting a tokenizer into the `TokenTextSplitter` to make it more flexible.

Could perhaps be reviewed by @dev2049

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
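For orientation, here is a minimal usage sketch of the new splitter, mirroring the calls exercised by the tests in the diff below. The `model_name` and `chunk_overlap` arguments and the `count_tokens`/`split_text` methods come from those tests; the `sentence-transformers` package is assumed to be installed.

```python
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# Token-based splitter backed by a sentence-transformers tokenizer, so chunk
# sizes are measured in the model's own tokens rather than tiktoken tokens.
splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    chunk_overlap=0,
)

text = "Lorem ipsum"

# Count includes the model's start/stop tokens (2 text tokens + 2 special tokens here).
print(splitter.count_tokens(text=text))  # -> 4

# Short input fits into a single chunk.
print(splitter.split_text(text=text))  # -> ["Lorem ipsum"]
```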
```diff
@@ -2,7 +2,11 @@
 
 import pytest
 
-from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
+from langchain.text_splitter import (
+    CharacterTextSplitter,
+    SentenceTransformersTokenTextSplitter,
+    TokenTextSplitter,
+)
 
 
 def test_huggingface_type_check() -> None:
@@ -44,3 +48,45 @@ def test_token_text_splitter_from_tiktoken() -> None:
     expected_tokenizer = "cl100k_base"
     actual_tokenizer = splitter._tokenizer.name
     assert expected_tokenizer == actual_tokenizer
+
+
+def test_sentence_transformers_count_tokens() -> None:
+    splitter = SentenceTransformersTokenTextSplitter(
+        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+    )
+    text = "Lorem ipsum"
+
+    token_count = splitter.count_tokens(text=text)
+
+    expected_start_stop_token_count = 2
+    expected_text_token_count = 2
+    expected_token_count = expected_start_stop_token_count + expected_text_token_count
+
+    assert expected_token_count == token_count
+
+
+def test_sentence_transformers_split_text() -> None:
+    splitter = SentenceTransformersTokenTextSplitter(
+        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+    )
+    text = "Lorem ipsum"
+    text_chunks = splitter.split_text(text=text)
+    expected_text_chunks = [text]
+    assert expected_text_chunks == text_chunks
+
+
+def test_sentence_transformers_multiple_tokens() -> None:
+    splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)
+    text = "Lorem "
+
+    count_start_and_end_tokens = 2
+    text_token_count = splitter.count_tokens(text=text) - count_start_and_end_tokens
+    token_multiplier = splitter.maximum_tokens_per_chunk // text_token_count + 1
+    text_chunks = splitter.split_text(text=text * token_multiplier)
+
+    expected_number_of_chunks = 2
+
+    assert expected_number_of_chunks == len(text_chunks)
+    actual = splitter.count_tokens(text=text_chunks[1]) - count_start_and_end_tokens
+    expected = token_multiplier * text_token_count - splitter.maximum_tokens_per_chunk
+    assert expected == actual
```
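The tokenizer-injection idea mentioned in the description is not implemented in this PR. Purely as an illustration of that direction, a splitter parameterized over arbitrary encode/decode callables might look roughly like the sketch below; the class name `InjectableTokenTextSplitter` and its signature are hypothetical and not part of langchain.

```python
from typing import Callable, List


class InjectableTokenTextSplitter:
    """Hypothetical sketch: split text on token boundaries using any tokenizer
    that provides encode/decode callables (tiktoken, HuggingFace, ...)."""

    def __init__(
        self,
        encode: Callable[[str], List[int]],
        decode: Callable[[List[int]], str],
        chunk_size: int = 400,
        chunk_overlap: int = 50,
    ) -> None:
        self._encode = encode
        self._decode = decode
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    def split_text(self, text: str) -> List[str]:
        # Encode once, then slide a window of chunk_size tokens with overlap.
        token_ids = self._encode(text)
        chunks: List[str] = []
        start = 0
        while start < len(token_ids):
            end = start + self._chunk_size
            chunks.append(self._decode(token_ids[start:end]))
            if end >= len(token_ids):
                break
            start = end - self._chunk_overlap
        return chunks


# Toy whitespace "tokenizer" just to demonstrate the interface; any encode/decode
# pair (e.g. from tiktoken or a HuggingFace tokenizer) could be injected instead.
words = sorted(set("lorem ipsum dolor sit amet".split()))
to_id = {w: i for i, w in enumerate(words)}
to_word = {i: w for w, i in to_id.items()}

splitter = InjectableTokenTextSplitter(
    encode=lambda s: [to_id[w] for w in s.split()],
    decode=lambda ids: " ".join(to_word[i] for i in ids),
    chunk_size=3,
    chunk_overlap=1,
)
print(splitter.split_text("lorem ipsum dolor sit amet"))
```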