smart text splitter (#530)

Adds a smart text splitter that iteratively tries different separators
until one of them works.
This commit is contained in:
Harrison Chase
2023-01-08 15:11:10 -08:00
committed by GitHub
parent 8dfad874a2
commit 1192cc0767
3 changed files with 172 additions and 21 deletions

View File

@@ -2,7 +2,10 @@
import pytest
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
)
def test_character_text_splitter() -> None:
@@ -23,6 +26,15 @@ def test_character_text_splitter_long() -> None:
assert output == expected_output
def test_character_text_splitter_short_words_first() -> None:
    """Verify character-count splitting when the input starts with short words."""
    expected_chunks = ["a a", "foo", "bar", "baz"]
    chunker = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)
    # The two leading one-letter words are expected to share a single chunk,
    # while each three-letter word is expected to end up in a chunk of its own.
    assert chunker.split_text("a a foo bar baz") == expected_chunks
def test_character_text_splitter_longer_words() -> None:
"""Test splitting by characters when splits not found easily."""
text = "foo bar baz 123"
@@ -62,3 +74,33 @@ def test_create_documents_with_metadata() -> None:
Document(page_content="baz", metadata={"source": "2"}),
]
assert docs == expected_docs
def test_iterative_text_splitter() -> None:
    """Verify the chunks RecursiveCharacterTextSplitter yields for a mixed text."""
    # The sample mixes blank-line breaks, single newlines and spaces, and it
    # contains one word ("splittingggg") that is longer than the chunk size.
    sample = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
This is a weird text to write, but gotta test the splittingggg some how.
Bye!\n\n-H."""
    expected_chunks = [
        "Hi.",
        "I'm",
        "Harrison.",
        "How? Are?",
        "You?",
        "Okay then f",
        "f f f f.",
        "This is a",
        "a weird",
        "text to",
        "write, but",
        "gotta test",
        "the",
        "splitting",
        "gggg",
        "some how.",
        "Bye!\n\n-H.",
    ]
    chunker = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=1)
    assert chunker.split_text(sample) == expected_chunks