Harrison/text splitter (#5417)

adds support for keeping separators around when using recursive text
splitter
This commit is contained in:
Harrison Chase
2023-05-29 16:56:31 -07:00
committed by GitHub
parent cf5803e44c
commit 72f99ff953
3 changed files with 99 additions and 59 deletions

View File

@@ -42,17 +42,17 @@
" \n",
"def foo():\n",
"\n",
"def testing_func():\n",
"def testing_func_with_long_name():\n",
"\n",
"def bar():\n",
"\"\"\"\n",
"python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)"
"python_splitter = PythonCodeTextSplitter(chunk_size=40, chunk_overlap=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6cdc55f3",
"id": "8cc33770",
"metadata": {},
"outputs": [],
"source": [
@@ -62,15 +62,16 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "8cc33770",
"id": "f5f70775",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Foo:\\n\\n def bar():', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]"
"[Document(page_content='class Foo:\\n\\n def bar():', metadata={}),\n",
" Document(page_content='def foo():', metadata={}),\n",
" Document(page_content='def testing_func_with_long_name():', metadata={}),\n",
" Document(page_content='def bar():', metadata={})]"
]
},
"execution_count": 4,
@@ -82,33 +83,10 @@
"docs"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "de625e08-c440-489d-beed-020b6c53bf69",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['Foo:\\n\\n def bar():', 'foo():\\n\\ndef testing_func():', 'bar():']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"python_splitter.split_text(python_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55aadd84-75ca-48ae-9b84-b39c368488ed",
"id": "6e096d42",
"metadata": {},
"outputs": [],
"source": []
@@ -130,7 +108,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.9.1"
},
"vscode": {
"interpreter": {