mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 06:53:16 +00:00
Harrison/text splitter (#5417)
Adds support for keeping separators when using the recursive text splitter.
This commit is contained in:
@@ -4,9 +4,23 @@ import pytest
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.text_splitter import (
|
||||
CharacterTextSplitter,
|
||||
PythonCodeTextSplitter,
|
||||
RecursiveCharacterTextSplitter,
|
||||
)
|
||||
|
||||
# Sample Python source text; test_python_text_splitter passes it to
# PythonCodeTextSplitter.split_text as the input to be split.
# NOTE(review): the lines holding only `|` characters and the missing
# indentation look like residue of the rendered diff view this text was
# captured from -- confirm the exact literal against the upstream file.
FAKE_PYTHON_TEXT = """
|
||||
class Foo:
|
||||
|
||||
def bar():
|
||||
|
||||
|
||||
def foo():
|
||||
|
||||
def testing_func():
|
||||
|
||||
def bar():
|
||||
"""
|
||||
|
||||
|
||||
def test_character_text_splitter() -> None:
|
||||
"""Test splitting by character count."""
|
||||
@@ -135,15 +149,16 @@ Bye!\n\n-H."""
|
||||
"Okay then",
|
||||
"f f f f.",
|
||||
"This is a",
|
||||
"a weird",
|
||||
"weird",
|
||||
"text to",
|
||||
"write, but",
|
||||
"gotta test",
|
||||
"the",
|
||||
"splittingg",
|
||||
"ggg",
|
||||
"write,",
|
||||
"but gotta",
|
||||
"test the",
|
||||
"splitting",
|
||||
"gggg",
|
||||
"some how.",
|
||||
"Bye!\n\n-H.",
|
||||
"Bye!",
|
||||
"-H.",
|
||||
]
|
||||
assert output == expected_output
|
||||
|
||||
@@ -168,3 +183,14 @@ def test_split_documents() -> None:
|
||||
Document(page_content="z", metadata={"source": "1"}),
|
||||
]
|
||||
assert splitter.split_documents(docs) == expected_output
|
||||
|
||||
|
||||
def test_python_text_splitter() -> None:
    """Check PythonCodeTextSplitter output on the sample Python text.

    Splits FAKE_PYTHON_TEXT with chunk_size=30 and chunk_overlap=0 and
    compares the resulting chunks, in order, with the four expected ones.
    """
    code_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)

    # NOTE(review): the whitespace inside the first expected chunk is
    # reproduced exactly as it appears in this view; confirm it against
    # the upstream file, since indentation may have been collapsed.
    expected_chunks = [
        "class Foo:\n\n def bar():",
        "def foo():",
        "def testing_func():",
        "def bar():",
    ]

    actual_chunks = code_splitter.split_text(FAKE_PYTHON_TEXT)
    assert actual_chunks == expected_chunks
|
||||
|
Reference in New Issue
Block a user