MD header text splitter returns Documents (#6571)

Return a list of `Document` objects (instead of plain dicts) from `MarkdownHeaderTextSplitter.split_text` to simplify UX.

Update the test and the example notebooks accordingly.
This commit is contained in:
Lance Martin
2023-06-22 09:25:38 -07:00
committed by GitHub
parent 3436da65a4
commit 30f7288082
4 changed files with 116 additions and 190 deletions

View File

@@ -50,8 +50,8 @@
},
{
"cell_type": "code",
"execution_count": 4,
"id": "19c044f0",
"execution_count": 2,
"id": "ceb3c1fb",
"metadata": {},
"outputs": [],
"source": [
@@ -65,13 +65,16 @@
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
]
"data": {
"text/plain": [
"[Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
" Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
" Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
@@ -85,8 +88,28 @@
"\n",
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
"for split in md_header_splits:\n",
" print(split)"
"md_header_splits"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "aac1738c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"langchain.schema.Document"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(md_header_splits[0])"
]
},
{
@@ -99,10 +122,25 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"id": "480e0e3a",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
"\n",
@@ -117,60 +155,13 @@
"\n",
"# Char-level splits\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"chunk_size = 10\n",
"chunk_overlap = 0\n",
"chunk_size = 250\n",
"chunk_overlap = 30\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
"\n",
"# Split within each header group\n",
"all_splits=[]\n",
"all_metadatas=[] \n",
"for header_group in md_header_splits:\n",
" _splits = text_splitter.split_text(header_group['content'])\n",
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
" all_splits += _splits\n",
" all_metadatas += _metadatas"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3f5d775e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Markdown[9'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_splits[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "33ab0d5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Header 1': 'Intro', 'Header 2': 'History'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_metadatas[0]"
"# Split\n",
"splits = text_splitter.split_documents(md_header_splits)\n",
"splits"
]
}
],