mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-07 14:03:26 +00:00
MD header text splitter returns Documents (#6571)
Return `Documents` from MD header text splitter to simplify UX. Updates the test as well as example notebooks.
This commit is contained in:
@@ -50,8 +50,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "19c044f0",
|
||||
"execution_count": 2,
|
||||
"id": "ceb3c1fb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -65,13 +65,16 @@
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
|
||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
||||
]
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
|
||||
" Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
|
||||
" Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -85,8 +88,28 @@
|
||||
"\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"for split in md_header_splits:\n",
|
||||
" print(split)"
|
||||
"md_header_splits"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "aac1738c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"langchain.schema.Document"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"type(md_header_splits[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -99,10 +122,25 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 8,
|
||||
"id": "480e0e3a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
|
||||
"\n",
|
||||
@@ -117,60 +155,13 @@
|
||||
"\n",
|
||||
"# Char-level splits\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"chunk_size = 10\n",
|
||||
"chunk_overlap = 0\n",
|
||||
"chunk_size = 250\n",
|
||||
"chunk_overlap = 30\n",
|
||||
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
||||
"\n",
|
||||
"# Split within each header group\n",
|
||||
"all_splits=[]\n",
|
||||
"all_metadatas=[] \n",
|
||||
"for header_group in md_header_splits:\n",
|
||||
" _splits = text_splitter.split_text(header_group['content'])\n",
|
||||
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
|
||||
" all_splits += _splits\n",
|
||||
" all_metadatas += _metadatas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "3f5d775e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Markdown[9'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_splits[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "33ab0d5c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'Header 1': 'Intro', 'Header 2': 'History'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_metadatas[0]"
|
||||
"# Split\n",
|
||||
"splits = text_splitter.split_documents(md_header_splits)\n",
|
||||
"splits"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@@ -9,11 +9,11 @@
|
||||
"\n",
|
||||
"Text splitting for vector storage often uses sentences or other delimiters [to keep related text together](https://www.pinecone.io/learn/chunking-strategies/). \n",
|
||||
"\n",
|
||||
"But many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting. \n",
|
||||
"But many documents (such as `Markdown` files) have structure (headers) that can be explicitly used in splitting. \n",
|
||||
"\n",
|
||||
"We added a new text splitter for Markdown files that lets a user split based specified headers. \n",
|
||||
"The `MarkdownHeaderTextSplitter` lets a user split `Markdown` files files based on specified headers. \n",
|
||||
"\n",
|
||||
"This results in chunks that retain the header(s) that it came from (e.g., Introduction) in the chunk metadata.\n",
|
||||
"This results in chunks that retain the header(s) that it came from in the metadata.\n",
|
||||
"\n",
|
||||
"This works nicely w/ `SelfQueryRetriever`.\n",
|
||||
"\n",
|
||||
@@ -30,19 +30,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "cda52c2c",
|
||||
"execution_count": null,
|
||||
"id": "2e587f65",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/31treehaus/miniconda3/envs/langchain-new/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.4) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load Notion page as a markdownfile file\n",
|
||||
"from langchain.document_loaders import NotionDirectoryLoader\n",
|
||||
@@ -54,22 +45,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "730b84f2",
|
||||
"execution_count": null,
|
||||
"id": "1cd3fd7e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'content': 'We previously introduced [auto-evaluator](https://blog.langchain.dev/auto-evaluator-opportunities/), an open-source tool for grading LLM question-answer chains. Here, we extend auto-evaluator with a [lightweight Streamlit app](https://github.com/langchain-ai/auto-evaluator/tree/main/streamlit) that can connect to any existing Pinecone index. We add the ability to test metadata filtering using `SelfQueryRetriever` as well as some other approaches that we’ve found to be useful, as discussed below. \\n[ret_trim.mov](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/ret_trim.mov)',\n",
|
||||
" 'metadata': {'Section': 'Evaluation'}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's create groups based on the section headers in our page\n",
|
||||
"from langchain.text_splitter import MarkdownHeaderTextSplitter\n",
|
||||
@@ -77,8 +56,7 @@
|
||||
" (\"###\", \"Section\"),\n",
|
||||
"]\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"md_header_splits = markdown_splitter.split_text(md_file)\n",
|
||||
"md_header_splits[3]"
|
||||
"md_header_splits = markdown_splitter.split_text(md_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -86,7 +64,7 @@
|
||||
"id": "4f73a609",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, we split the text in each header group and keep the group as metadata."
|
||||
"Now, perform text splitting on the header grouped documents. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -101,57 +79,7 @@
|
||||
"chunk_size = 500\n",
|
||||
"chunk_overlap = 0\n",
|
||||
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
||||
" \n",
|
||||
"# Create splits within each header group and combine them\n",
|
||||
"all_splits=[]\n",
|
||||
"all_metadatas=[]\n",
|
||||
"for header_group in md_header_splits:\n",
|
||||
" _splits = text_splitter.split_text(header_group['content'])\n",
|
||||
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
|
||||
" all_splits += _splits\n",
|
||||
" all_metadatas += _metadatas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "7424f78b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'In these cases, semantic search will look for the concept `episode 53` in the chunks, but instead we simply want to filter the chunks for `episode 53` and then perform semantic search to extract those that best summarize the episode. Metadata filtering does this, so long as we 1) we have a metadata filter for episode number and 2) we can extract the value from the query (e.g., `54` or `252`) that we want to extract. The LangChain `SelfQueryRetriever` does the latter (see'"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_splits[6]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "08f5db3a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'Section': 'Motivation'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_metadatas[6]"
|
||||
"all_splits = text_splitter.split_documents(md_header_splits)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -183,7 +111,7 @@
|
||||
"source": [
|
||||
"# Build vectorstore and keep the metadata\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"vectorstore = Chroma.from_texts(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())"
|
||||
"vectorstore = Chroma.from_documents(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
Reference in New Issue
Block a user