Harrison/text splitter (#5417)

Adds support for keeping separators around when using the recursive text
splitter.
This commit is contained in:
Harrison Chase
2023-05-29 16:56:31 -07:00
committed by GitHub
parent cf5803e44c
commit 72f99ff953
3 changed files with 99 additions and 59 deletions

View File

@@ -4,9 +4,23 @@ import pytest
from langchain.docstore.document import Document
from langchain.text_splitter import (
CharacterTextSplitter,
PythonCodeTextSplitter,
RecursiveCharacterTextSplitter,
)
# Sample Python source passed to PythonCodeTextSplitter.split_text in
# test_python_text_splitter.
FAKE_PYTHON_TEXT = """
class Foo:
def bar():
def foo():
def testing_func():
def bar():
"""
def test_character_text_splitter() -> None:
"""Test splitting by character count."""
@@ -135,15 +149,16 @@ Bye!\n\n-H."""
"Okay then",
"f f f f.",
"This is a",
"a weird",
"weird",
"text to",
"write, but",
"gotta test",
"the",
"splittingg",
"ggg",
"write,",
"but gotta",
"test the",
"splitting",
"gggg",
"some how.",
"Bye!\n\n-H.",
"Bye!",
"-H.",
]
assert output == expected_output
@@ -168,3 +183,14 @@ def test_split_documents() -> None:
Document(page_content="z", metadata={"source": "1"}),
]
assert splitter.split_documents(docs) == expected_output
def test_python_text_splitter() -> None:
    """Test that PythonCodeTextSplitter splits FAKE_PYTHON_TEXT as expected.

    The splitter is built with ``chunk_size=30`` and ``chunk_overlap=0``; the
    result of ``split_text`` must equal the four expected chunks, in order.
    """
    python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)
    actual_chunks = python_splitter.split_text(FAKE_PYTHON_TEXT)
    # Expected chunks, listed in the order the splitter should emit them.
    expected_splits = [
        """class Foo:\n\n def bar():""",
        """def foo():""",
        """def testing_func():""",
        """def bar():""",
    ]
    assert actual_chunks == expected_splits