mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-01 19:03:25 +00:00
Add tests and update notebook for MarkdownHeaderTextSplitter (#6069)
Add test and update notebook for `MarkdownHeaderTextSplitter`.
This commit is contained in:
parent
8fdf88b8e3
commit
ee3d0513ad
@ -7,48 +7,30 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# MarkdownHeaderTextSplitter\n",
|
"# MarkdownHeaderTextSplitter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"The objective is to split a markdown file by a specified set of headers.\n",
|
"This splits a markdown file by a specified set of headers. For example, if we want to split this markdown:\n",
|
||||||
" \n",
|
|
||||||
"**Given this example:**\n",
|
|
||||||
"\n",
|
|
||||||
"# Foo\n",
|
|
||||||
"\n",
|
|
||||||
"## Bar\n",
|
|
||||||
"\n",
|
|
||||||
"Hi this is Jim \n",
|
|
||||||
"Hi this is Joe\n",
|
|
||||||
"\n",
|
|
||||||
"## Baz\n",
|
|
||||||
"\n",
|
|
||||||
"Hi this is Molly\n",
|
|
||||||
" \n",
|
|
||||||
"**Written as:**\n",
|
|
||||||
"\n",
|
|
||||||
"```\n",
|
"```\n",
|
||||||
"md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n",
|
"md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n",
|
||||||
"```\n",
|
"```\n",
|
||||||
"\n",
|
"\n",
|
||||||
"**If we want to split on specified headers:**\n",
|
"Headers to split on:\n",
|
||||||
"```\n",
|
"```\n",
|
||||||
"[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n",
|
"[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n",
|
||||||
"```\n",
|
"```\n",
|
||||||
"\n",
|
"\n",
|
||||||
"**Then we expect:** \n",
|
"Expected output:\n",
|
||||||
"```\n",
|
"```\n",
|
||||||
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n",
|
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n",
|
||||||
"```\n",
|
"```\n",
|
||||||
"\n",
|
"\n",
|
||||||
"**Options:**\n",
|
"Optionally, this also includes `return_each_line` in case a user wants to perform other types of aggregation. \n",
|
||||||
"\n",
|
"\n",
|
||||||
"This also includes `return_each_line` in case a user wants to perform other types of aggregation. \n",
|
"If `return_each_line=True`, each line and associated header metadata are simply returned. "
|
||||||
"\n",
|
|
||||||
"If `return_each_line=True`, each line and associated header metadata are returned. "
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 2,
|
||||||
"id": "19c044f0",
|
"id": "19c044f0",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -56,117 +38,10 @@
|
|||||||
"from langchain.text_splitter import MarkdownHeaderTextSplitter"
|
"from langchain.text_splitter import MarkdownHeaderTextSplitter"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "ec8d8053",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"`Test case 1`"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 9,
|
||||||
"id": "5cd0a66c",
|
"id": "2ae3649b",
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
|
||||||
"{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
|
||||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# Doc\n",
|
|
||||||
"markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n",
|
|
||||||
" \n",
|
|
||||||
"# Test case 1\n",
|
|
||||||
"headers_to_split_on = [\n",
|
|
||||||
" (\"#\", \"Header 1\"),\n",
|
|
||||||
" (\"##\", \"Header 2\"),\n",
|
|
||||||
"]\n",
|
|
||||||
"\n",
|
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n",
|
|
||||||
"\n",
|
|
||||||
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
|
||||||
"for chunk in chunked_docs:\n",
|
|
||||||
" print(chunk)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "67d25a1c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
|
||||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
|
||||||
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
|
||||||
"for chunk in chunked_docs:\n",
|
|
||||||
" print(chunk)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "f1f74dfa",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"`Test case 2`"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"id": "2183c96a",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"{'content': 'Text under H3.', 'metadata': {'Header 1': 'H1', 'Header 2': 'H2', 'Header 3': 'H3'}}\n",
|
|
||||||
"{'content': 'Text under H2_2.', 'metadata': {'Header 1': 'H1_2', 'Header 2': 'H2_2'}}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"headers_to_split_on = [\n",
|
|
||||||
" (\"#\", \"Header 1\"),\n",
|
|
||||||
" (\"##\", \"Header 2\"),\n",
|
|
||||||
" (\"###\", \"Header 3\"),\n",
|
|
||||||
"]\n",
|
|
||||||
"markdown_document = '# H1\\n\\n## H2\\n\\n### H3\\n\\nText under H3.\\n\\n# H1_2\\n\\n## H2_2\\n\\nText under H2_2.'\n",
|
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
|
||||||
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
|
||||||
"for chunk in chunked_docs:\n",
|
|
||||||
" print(chunk)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "add24254",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"`Test case 3`"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "c3f4690f",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -187,88 +62,24 @@
|
|||||||
" (\"##\", \"Header 2\"),\n",
|
" (\"##\", \"Header 2\"),\n",
|
||||||
" (\"###\", \"Header 3\"),\n",
|
" (\"###\", \"Header 3\"),\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
"\n",
|
||||||
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||||
"for chunk in chunked_docs:\n",
|
"splits = markdown_splitter.split_text(markdown_document)\n",
|
||||||
" print(chunk)"
|
"for split in splits:\n",
|
||||||
]
|
" print(split)"
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "20907fb7",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
|
||||||
"{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
|
||||||
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
|
|
||||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n",
|
|
||||||
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
|
||||||
"for chunk in chunked_docs:\n",
|
|
||||||
" print(chunk)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "9c448431",
|
"id": "2a32026a",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"`Test case 4`"
|
"Here's an example on a larger file with `return_each_line=True` passed, allowing each line to be examined."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 10,
|
||||||
"id": "9858ea51",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
|
||||||
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
|
|
||||||
"{'content': 'Hi this is John', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo', 'Header 4': 'Bim'}}\n",
|
|
||||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n #### Bim \\n\\n Hi this is John \\n\\n ## Baz\\n\\n Hi this is Molly'\n",
|
|
||||||
" \n",
|
|
||||||
"headers_to_split_on = [\n",
|
|
||||||
" (\"#\", \"Header 1\"),\n",
|
|
||||||
" (\"##\", \"Header 2\"),\n",
|
|
||||||
" (\"###\", \"Header 3\"),\n",
|
|
||||||
" (\"####\", \"Header 4\"),\n",
|
|
||||||
"]\n",
|
|
||||||
" \n",
|
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
|
||||||
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
|
||||||
"for chunk in chunked_docs:\n",
|
|
||||||
" print(chunk)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "bba6eb9e",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"`Test case 5`"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"id": "8af8f9a2",
|
"id": "8af8f9a2",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -276,8 +87,10 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\nMarkdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n",
|
"{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n",
|
||||||
"{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n",
|
"{'content': 'Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n",
|
||||||
|
"{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n",
|
||||||
|
"{'content': 'additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n",
|
||||||
"{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n",
|
"{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n",
|
||||||
"{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n"
|
"{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n"
|
||||||
]
|
]
|
||||||
@ -293,11 +106,19 @@
|
|||||||
" (\"####\", \"Header 4\"),\n",
|
" (\"####\", \"Header 4\"),\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
" \n",
|
" \n",
|
||||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n",
|
||||||
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
"splits = markdown_splitter.split_text(markdown_document)\n",
|
||||||
"for chunk in chunked_docs:\n",
|
"for line in splits:\n",
|
||||||
" print(chunk)"
|
" print(line)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "987183f2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -7,6 +7,7 @@ from langchain.docstore.document import Document
|
|||||||
from langchain.text_splitter import (
|
from langchain.text_splitter import (
|
||||||
CharacterTextSplitter,
|
CharacterTextSplitter,
|
||||||
Language,
|
Language,
|
||||||
|
MarkdownHeaderTextSplitter,
|
||||||
PythonCodeTextSplitter,
|
PythonCodeTextSplitter,
|
||||||
RecursiveCharacterTextSplitter,
|
RecursiveCharacterTextSplitter,
|
||||||
)
|
)
|
||||||
@ -671,3 +672,129 @@ def test_html_code_splitter() -> None:
|
|||||||
"<p>Some text</p>",
|
"<p>Some text</p>",
|
||||||
"<p>Some more text</p>\n </div>",
|
"<p>Some more text</p>\n </div>",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_header_text_splitter_1() -> None:
    """Check header-based markdown splitting with two header levels.

    The document holds one ``#`` header and two ``##`` headers. The
    expected result is one chunk per ``##`` section, each carrying the
    titles of both enclosing headers in its metadata.
    """
    # NOTE(review): recovered from a rendered diff; runs of spaces inside
    # string literals may have been collapsed -- confirm against the repo.
    # Segments are joined by blank lines to form the markdown document.
    doc_text = "\n\n".join(
        [
            "# Foo",
            " ## Bar",
            "Hi this is Jim",
            "Hi this is Joe",
            " ## Baz",
            " Hi this is Molly",
        ]
    )
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")],
    )

    expected = [
        {
            "content": "Hi this is Jim \nHi this is Joe",
            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
        },
        {
            "content": "Hi this is Molly",
            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
        },
    ]
    assert splitter.split_text(doc_text) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_header_text_splitter_2() -> None:
    """Check header-based markdown splitting with three header levels.

    A ``###`` section is nested under the first ``##`` section. The
    expected result has a separate chunk for the nested section, and the
    following ``##`` section carries no ``Header 3`` entry.
    """
    # NOTE(review): recovered from a rendered diff; runs of spaces inside
    # string literals may have been collapsed -- confirm against the repo.
    # Segments are joined by blank lines to form the markdown document.
    doc_text = "\n\n".join(
        [
            "# Foo",
            " ## Bar",
            "Hi this is Jim",
            "Hi this is Joe",
            " ### Boo ",
            " Hi this is Lance ",
            " ## Baz",
            " Hi this is Molly",
        ]
    )
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ],
    )

    expected = [
        {
            "content": "Hi this is Jim \nHi this is Joe",
            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
        },
        {
            "content": "Hi this is Lance",
            "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
        },
        {
            "content": "Hi this is Molly",
            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
        },
    ]
    assert splitter.split_text(doc_text) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_md_header_text_splitter_3() -> None:
    """Check header-based markdown splitting with four header levels.

    A ``####`` section is nested under a ``###`` section, which is nested
    under the first ``##`` section. The expected result has one chunk per
    section, with metadata listing every enclosing header, and the final
    ``##`` section carries only ``Header 1`` and ``Header 2``.
    """
    # NOTE(review): recovered from a rendered diff; runs of spaces inside
    # string literals may have been collapsed -- confirm against the repo.
    # Segments are joined by blank lines to form the markdown document.
    doc_text = "\n\n".join(
        [
            "# Foo",
            " ## Bar",
            "Hi this is Jim",
            "Hi this is Joe",
            " ### Boo ",
            " Hi this is Lance ",
            " #### Bim ",
            " Hi this is John ",
            " ## Baz",
            " Hi this is Molly",
        ]
    )
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
        ],
    )

    expected = [
        {
            "content": "Hi this is Jim \nHi this is Joe",
            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
        },
        {
            "content": "Hi this is Lance",
            "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
        },
        {
            "content": "Hi this is John",
            "metadata": {
                "Header 1": "Foo",
                "Header 2": "Bar",
                "Header 3": "Boo",
                "Header 4": "Bim",
            },
        },
        {
            "content": "Hi this is Molly",
            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
        },
    ]
    assert splitter.split_text(doc_text) == expected
|
||||||
|
Loading…
Reference in New Issue
Block a user