MD header text splitter returns Documents (#6571)

Return `Document` objects from the Markdown header text splitter to simplify UX. Also update the test and the example notebooks to match.
2025-09-06 21:43:44 +00:00 · 2023-06-22 09:25:38 -07:00
parent 3436da65a4
commit 30f7288082
4 changed files with 116 additions and 190 deletions
--- a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb
+++ b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb
@@ -50,8 +50,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "id": "19c044f0",
+   "execution_count": 2,
+   "id": "ceb3c1fb",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -65,13 +65,16 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'content': 'Hi this is Jim  \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
-      "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
-      "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
-     ]
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Hi this is Jim  \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
+       " Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
+       " Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
    }
   ],
   "source": [
@@ -85,8 +88,28 @@
    "\n",
    "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
    "md_header_splits = markdown_splitter.split_text(markdown_document)\n",
-    "for split in md_header_splits:\n",
-    "    print(split)"
+    "md_header_splits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "aac1738c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "langchain.schema.Document"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(md_header_splits[0])"
   ]
  },
  {
@@ -99,10 +122,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
   "id": "480e0e3a",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
+       " Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
+       " Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for  \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.  \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
+       " Document(page_content='#### Standardization  \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
+       " Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "markdown_document = \"# Intro \\n\\n    ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
    "\n",
@@ -117,60 +155,13 @@
    "\n",
    "# Char-level splits\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
-    "chunk_size = 10\n",
-    "chunk_overlap = 0\n",
+    "chunk_size = 250\n",
+    "chunk_overlap = 30\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
    "\n",
-    "# Split within each header group\n",
-    "all_splits=[]\n",
-    "all_metadatas=[]    \n",
-    "for header_group in md_header_splits:\n",
-    "    _splits = text_splitter.split_text(header_group['content'])\n",
-    "    _metadatas = [header_group['metadata'] for _ in _splits]\n",
-    "    all_splits += _splits\n",
-    "    all_metadatas += _metadatas"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "3f5d775e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Markdown[9'"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_splits[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "33ab0d5c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'Header 1': 'Intro', 'Header 2': 'History'}"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_metadatas[0]"
+    "# Split\n",
+    "splits = text_splitter.split_documents(md_header_splits)\n",
+    "splits"
   ]
  }
 ],