Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-04 12:18:24 +00:00)

code splitter docs (#5480)

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>

This commit is contained in:
parent 470b2822a3
commit 5ce74b5958
@ -33,10 +33,8 @@ For an introduction to the default text splitter and generic functionality see:
Usage examples for the text splitters:

- `Character <./text_splitters/examples/character_text_splitter.html>`_
- `LaTeX <./text_splitters/examples/latex.html>`_
- `Markdown <./text_splitters/examples/markdown.html>`_
- `Code (including HTML, Markdown, LaTeX, Python, etc.) <./text_splitters/examples/code_splitter.html>`_
- `NLTK <./text_splitters/examples/nltk.html>`_
- `Python code <./text_splitters/examples/python.html>`_
- `Recursive Character <./text_splitters/examples/recursive_text_splitter.html>`_
- `spaCy <./text_splitters/examples/spacy.html>`_
- `tiktoken (OpenAI) <./text_splitters/examples/tiktoken_splitter.html>`_
@ -49,10 +47,8 @@ Usage examples for the text splitters:
   :hidden:

   ./text_splitters/examples/character_text_splitter.ipynb
   ./text_splitters/examples/latex.ipynb
   ./text_splitters/examples/markdown.ipynb
   ./text_splitters/examples/code_splitter.ipynb
   ./text_splitters/examples/nltk.ipynb
   ./text_splitters/examples/python.ipynb
   ./text_splitters/examples/recursive_text_splitter.ipynb
   ./text_splitters/examples/spacy.ipynb
   ./text_splitters/examples/tiktoken_splitter.ipynb
@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -12,64 +11,94 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import (\n",
"    CodeTextSplitter,\n",
"    RecursiveCharacterTextSplitter,\n",
"    Language,\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Choose a language to use"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"python_splitter = CodeTextSplitter(\n",
"    language=Language.PYTHON, chunk_size=16, chunk_overlap=0\n",
")\n",
"js_splitter = CodeTextSplitter(\n",
"    language=Language.JS, chunk_size=16, chunk_overlap=0\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Split the code"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='def', metadata={}),\n",
" Document(page_content='hello_world():', metadata={}),\n",
" Document(page_content='print(\"Hello,', metadata={}),\n",
" Document(page_content='World!\")', metadata={}),\n",
" Document(page_content='# Call the', metadata={}),\n",
" Document(page_content='function', metadata={}),\n",
" Document(page_content='hello_world()', metadata={})]"
"['cpp',\n",
" 'go',\n",
" 'java',\n",
" 'js',\n",
" 'php',\n",
" 'proto',\n",
" 'python',\n",
" 'rst',\n",
" 'ruby',\n",
" 'rust',\n",
" 'scala',\n",
" 'swift',\n",
" 'markdown',\n",
" 'latex',\n",
" 'html']"
]
},
"execution_count": 8,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Full list of support languages\n",
"[e.value for e in Language]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['\\nclass ', '\\ndef ', '\\n\\tdef ', '\\n\\n', '\\n', ' ', '']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can also see the separators used for a given language\n",
"RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Python\n",
"\n",
"Here's an example of splitting Python code"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='def hello_world():\\n    print(\"Hello, World!\")', metadata={}),\n",
" Document(page_content='# Call the function\\nhello_world()', metadata={})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -82,31 +111,34 @@
"# Call the function\n",
"hello_world()\n",
"\"\"\"\n",
"\n",
"python_splitter = RecursiveCharacterTextSplitter.from_language(\n",
"    language=Language.PYTHON, chunk_size=50, chunk_overlap=0\n",
")\n",
"python_docs = python_splitter.create_documents([PYTHON_CODE])\n",
"python_docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JS\n",
"Here's an example using the JS text splitter"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='function', metadata={}),\n",
" Document(page_content='helloWorld() {', metadata={}),\n",
" Document(page_content='console.log(\"He', metadata={}),\n",
" Document(page_content='llo,', metadata={}),\n",
" Document(page_content='World!\");', metadata={}),\n",
" Document(page_content='}', metadata={}),\n",
" Document(page_content='// Call the', metadata={}),\n",
" Document(page_content='function', metadata={}),\n",
" Document(page_content='helloWorld();', metadata={})]"
"[Document(page_content='function helloWorld() {\\n  console.log(\"Hello, World!\");\\n}', metadata={}),\n",
" Document(page_content='// Call the function\\nhelloWorld();', metadata={})]"
]
},
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -121,10 +153,234 @@
"helloWorld();\n",
"\"\"\"\n",
"\n",
"js_splitter = RecursiveCharacterTextSplitter.from_language(\n",
"    language=Language.JS, chunk_size=60, chunk_overlap=0\n",
")\n",
"js_docs = js_splitter.create_documents([JS_CODE])\n",
"js_docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Markdown\n",
"\n",
"Here's an example using the Markdown text splitter."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"markdown_text = \"\"\"\n",
"# 🦜️🔗 LangChain\n",
"\n",
"⚡ Building applications with LLMs through composability ⚡\n",
"\n",
"## Quick Install\n",
"\n",
"```bash\n",
"# Hopefully this code block isn't split\n",
"pip install langchain\n",
"```\n",
"\n",
"As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
"\"\"\"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='# 🦜️🔗 LangChain', metadata={}),\n",
" Document(page_content='⚡ Building applications with LLMs through composability ⚡', metadata={}),\n",
" Document(page_content='## Quick Install', metadata={}),\n",
" Document(page_content=\"```bash\\n# Hopefully this code block isn't split\", metadata={}),\n",
" Document(page_content='pip install langchain', metadata={}),\n",
" Document(page_content='```', metadata={}),\n",
" Document(page_content='As an open source project in a rapidly developing field, we', metadata={}),\n",
" Document(page_content='are extremely open to contributions.', metadata={})]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"md_splitter = RecursiveCharacterTextSplitter.from_language(\n",
"    language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0\n",
")\n",
"md_docs = md_splitter.create_documents([markdown_text])\n",
"md_docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## LaTeX\n",
"\n",
"Here's an example using LaTeX text"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"latex_text = \"\"\"\n",
"\\documentclass{article}\n",
"\n",
"\\begin{document}\n",
"\n",
"\\maketitle\n",
"\n",
"\\section{Introduction}\n",
"Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.\n",
"\n",
"\\subsection{History of LLMs}\n",
"The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.\n",
"\n",
"\\subsection{Applications of LLMs}\n",
"LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\n",
"\n",
"\\end{document}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle', metadata={}),\n",
" Document(page_content='\\\\section{Introduction}', metadata={}),\n",
" Document(page_content='Large language models (LLMs) are a type of machine learning', metadata={}),\n",
" Document(page_content='model that can be trained on vast amounts of text data to', metadata={}),\n",
" Document(page_content='generate human-like language. In recent years, LLMs have', metadata={}),\n",
" Document(page_content='made significant advances in a variety of natural language', metadata={}),\n",
" Document(page_content='processing tasks, including language translation, text', metadata={}),\n",
" Document(page_content='generation, and sentiment analysis.', metadata={}),\n",
" Document(page_content='\\\\subsection{History of LLMs}', metadata={}),\n",
" Document(page_content='The earliest LLMs were developed in the 1980s and 1990s,', metadata={}),\n",
" Document(page_content='but they were limited by the amount of data that could be', metadata={}),\n",
" Document(page_content='processed and the computational power available at the', metadata={}),\n",
" Document(page_content='time. In the past decade, however, advances in hardware and', metadata={}),\n",
" Document(page_content='software have made it possible to train LLMs on massive', metadata={}),\n",
" Document(page_content='datasets, leading to significant improvements in', metadata={}),\n",
" Document(page_content='performance.', metadata={}),\n",
" Document(page_content='\\\\subsection{Applications of LLMs}', metadata={}),\n",
" Document(page_content='LLMs have many applications in industry, including', metadata={}),\n",
" Document(page_content='chatbots, content creation, and virtual assistants. They', metadata={}),\n",
" Document(page_content='can also be used in academia for research in linguistics,', metadata={}),\n",
" Document(page_content='psychology, and computational linguistics.', metadata={}),\n",
" Document(page_content='\\\\end{document}', metadata={})]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latex_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0\n",
")\n",
"latex_docs = latex_splitter.create_documents([latex_text])\n",
"latex_docs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## HTML\n",
"\n",
"Here's an example using an HTML text splitter"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"html_text = \"\"\"\n",
"<!DOCTYPE html>\n",
"<html>\n",
"    <head>\n",
"        <title>🦜️🔗 LangChain</title>\n",
"        <style>\n",
"            body {\n",
"                font-family: Arial, sans-serif;\n",
"            }\n",
"            h1 {\n",
"                color: darkblue;\n",
"            }\n",
"        </style>\n",
"    </head>\n",
"    <body>\n",
"        <div>\n",
"            <h1>🦜️🔗 LangChain</h1>\n",
"            <p>⚡ Building applications with LLMs through composability ⚡</p>\n",
"        </div>\n",
"        <div>\n",
"            As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
"        </div>\n",
"    </body>\n",
"</html>\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='<!DOCTYPE html>\\n<html>\\n    <head>', metadata={}),\n",
" Document(page_content='<title>🦜️🔗 LangChain</title>\\n    <style>', metadata={}),\n",
" Document(page_content='body {', metadata={}),\n",
" Document(page_content='font-family: Arial, sans-serif;', metadata={}),\n",
" Document(page_content='}\\n    h1 {', metadata={}),\n",
" Document(page_content='color: darkblue;\\n    }', metadata={}),\n",
" Document(page_content='</style>\\n  </head>\\n  <body>\\n    <div>', metadata={}),\n",
" Document(page_content='<h1>🦜️🔗 LangChain</h1>', metadata={}),\n",
" Document(page_content='<p>⚡ Building applications with LLMs through', metadata={}),\n",
" Document(page_content='composability ⚡</p>', metadata={}),\n",
" Document(page_content='</div>\\n    <div>', metadata={}),\n",
" Document(page_content='As an open source project in a rapidly', metadata={}),\n",
" Document(page_content='developing field, we are extremely open to contributions.', metadata={}),\n",
" Document(page_content='</div>\\n  </body>\\n</html>', metadata={})]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"html_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0\n",
")\n",
"html_docs = html_splitter.create_documents([html_text])\n",
"html_docs"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -135,7 +391,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "langchain",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -149,9 +405,8 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
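
The updated notebook walks through Python, JS, Markdown, LaTeX, and HTML. The same two-step pattern (pick a Language, then call RecursiveCharacterTextSplitter.from_language) should carry over to any other member of the enum added in this commit; here is a minimal sketch for Go, an illustrative snippet that is not part of the commit itself:

from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

GO_CODE = """
package main

import "fmt"

func main() {
    fmt.Println("Hello, World!")
}
"""

# from_language preloads the splitter with Go-specific separators
# (function definitions, blocks, then plain lines and spaces).
go_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.GO, chunk_size=32, chunk_overlap=0
)
go_docs = go_splitter.create_documents([GO_CODE])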
@ -1,172 +0,0 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "80f6cd99",
"metadata": {},
"source": [
"# HTML\n",
"\n",
">[HTML](https://en.wikipedia.org/wiki/HMTL) s the standard markup language for documents designed to be displayed in a web browser.\n",
|
||||
"\n",
|
||||
"`HtmlTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with HTML-specific separators. See the source code to see the HTML syntax expected by default.\n",
"\n",
"1. How the text is split: by list of `HTML` specific separators\n",
"2. How the chunk size is measured: by number of characters"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "96d64839",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.text_splitter import HtmlTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "cfb0da17",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"html_text = \"\"\"\n",
"<!DOCTYPE html>\n",
"<html>\n",
"    <head>\n",
"        <title>🦜️🔗 LangChain</title>\n",
"        <style>\n",
"            body {\n",
"                font-family: Arial, sans-serif;\n",
"            }\n",
"            h1 {\n",
"                color: darkblue;\n",
"            }\n",
"        </style>\n",
"    </head>\n",
"    <body>\n",
"        <div>\n",
"            <h1>🦜️🔗 LangChain</h1>\n",
"            <p>⚡ Building applications with LLMs through composability ⚡</p>\n",
"        </div>\n",
"        <div>\n",
"            As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
"        </div>\n",
"    </body>\n",
"</html>\n",
"\"\"\"\n",
"\n",
"html_splitter = HtmlTextSplitter(chunk_size=175, chunk_overlap=20)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "d59a4fe8",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"docs = html_splitter.create_documents([html_text])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "cbb2e100",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='<!DOCTYPE html>\\n<html>', metadata={}),\n",
" Document(page_content='<title>🦜️🔗 LangChain</title>', metadata={}),\n",
" Document(page_content='body {\\n                font-family: Arial, sans-serif;\\n            }\\n            h1 {\\n                color: darkblue;\\n            }\\n        </style>\\n    </head>', metadata={}),\n",
" Document(page_content='/style>\\n    </head>', metadata={}),\n",
" Document(page_content='<div>\\n            <h1>🦜️🔗 LangChain</h1>\\n            <p>⚡ Building applications with LLMs through composability ⚡</p>\\n        </div>', metadata={}),\n",
" Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.\\n        </div>\\n    </body>\\n</html>', metadata={})]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "91b56e7e-b285-4ca4-a786-149544e0e3c6",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['<!DOCTYPE html>\\n<html>',\n",
" '<title>🦜️🔗 LangChain</title>',\n",
" 'body {\\n                font-family: Arial, sans-serif;\\n            }\\n            h1 {\\n                color: darkblue;\\n            }\\n        </style>\\n    </head>',\n",
" '/style>\\n    </head>',\n",
" '<div>\\n            <h1>🦜️🔗 LangChain</h1>\\n            <p>⚡ Building applications with LLMs through composability ⚡</p>\\n        </div>',\n",
" 'As an open source project in a rapidly developing field, we are extremely open to contributions.\\n        </div>\\n    </body>\\n</html>']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"html_splitter.split_text(html_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9bee7858-9175-4d99-bd30-68f2dece8601",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
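
Worth flagging for reviewers: this commit deletes html.ipynb and removes the HtmlTextSplitter class without adding a backwards-compatibility wrapper (unlike the Markdown, LaTeX, and Python splitters, which are kept as wrappers in the source diff further down). A minimal migration sketch, assuming the html_text defined in the deleted notebook above:

from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Replacement for the removed HtmlTextSplitter: the same HTML tag
# separators are now reached through the Language-based lookup.
html_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.HTML, chunk_size=175, chunk_overlap=20
)
docs = html_splitter.create_documents([html_text])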
@ -1,155 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3a2f572e",
"metadata": {},
"source": [
"# LaTeX\n",
"\n",
">[LaTeX](https://en.wikipedia.org/wiki/LaTeX) is widely used in academia for the communication and publication of scientific documents in many fields, including mathematics, computer science, engineering, physics, chemistry, economics, linguistics, quantitative psychology, philosophy, and political science.\n",
"\n",
"`LatexTextSplitter` splits text along `LaTeX` headings, headlines, enumerations and more. It's implemented as a subclass of `RecursiveCharacterSplitter` with LaTeX-specific separators. See the source code for more details.\n",
"\n",
"1. How the text is split: by list of `LaTeX` specific tags\n",
"2. How the chunk size is measured: by number of characters"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c2503917",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.text_splitter import LatexTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e46b753b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"latex_text = \"\"\"\n",
"\\documentclass{article}\n",
"\n",
"\\begin{document}\n",
"\n",
"\\maketitle\n",
"\n",
"\\section{Introduction}\n",
"Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.\n",
"\n",
"\\subsection{History of LLMs}\n",
"The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.\n",
"\n",
"\\subsection{Applications of LLMs}\n",
"LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\n",
"\n",
"\\end{document}\n",
"\"\"\"\n",
"latex_splitter = LatexTextSplitter(chunk_size=400, chunk_overlap=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "73b5bd33",
"metadata": {},
"outputs": [],
"source": [
"docs = latex_splitter.create_documents([latex_text])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e1c7fbd5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}', lookup_str='', metadata={}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "40e62829-9485-414e-9ea1-e1a8fc7c88cb",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle',\n",
" 'Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.',\n",
" 'History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.',\n",
" 'Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"latex_splitter.split_text(latex_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7deb8f25-a062-4956-9f90-513802069667",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@ -1,153 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "80f6cd99",
"metadata": {},
"source": [
"# Markdown\n",
"\n",
">[Markdown](https://en.wikipedia.org/wiki/Markdown) is a lightweight markup language for creating formatted text using a plain-text editor.\n",
"\n",
"`MarkdownTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Markdown-specific separators. See the source code to see the Markdown syntax expected by default.\n",
"\n",
"1. How the text is split: by list of `markdown` specific separators\n",
"2. How the chunk size is measured: by number of characters"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "96d64839",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.text_splitter import MarkdownTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cfb0da17",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"markdown_text = \"\"\"\n",
"# 🦜️🔗 LangChain\n",
"\n",
"⚡ Building applications with LLMs through composability ⚡\n",
"\n",
"## Quick Install\n",
"\n",
"```bash\n",
"# Hopefully this code block isn't split\n",
"pip install langchain\n",
"```\n",
"\n",
"As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
"\"\"\"\n",
"markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d59a4fe8",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"docs = markdown_splitter.create_documents([markdown_text])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "cbb2e100",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', metadata={}),\n",
" Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", metadata={}),\n",
" Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', metadata={})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "91b56e7e-b285-4ca4-a786-149544e0e3c6",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡',\n",
" \"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\",\n",
" 'As an open source project in a rapidly developing field, we are extremely open to contributions.']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"markdown_splitter.split_text(markdown_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9bee7858-9175-4d99-bd30-68f2dece8601",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@ -1,121 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "c350765d",
"metadata": {},
"source": [
"# Python Code\n",
"\n",
"`PythonCodeTextSplitter` splits text along python class and method definitions. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Python-specific separators. See the source code to see the Python syntax expected by default.\n",
"\n",
"1. How the text is split: by list of python specific separators\n",
"2. How the chunk size is measured: by number of characters"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1703463f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.text_splitter import PythonCodeTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f17a1854",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"python_text = \"\"\"\n",
"class Foo:\n",
"\n",
"    def bar():\n",
"    \n",
"    \n",
"def foo():\n",
"\n",
"def testing_func_with_long_name():\n",
"\n",
"def bar():\n",
"\"\"\"\n",
"python_splitter = PythonCodeTextSplitter(chunk_size=40, chunk_overlap=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8cc33770",
"metadata": {},
"outputs": [],
"source": [
"docs = python_splitter.create_documents([python_text])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f5f70775",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='class Foo:\\n\\n    def bar():', metadata={}),\n",
" Document(page_content='def foo():', metadata={}),\n",
" Document(page_content='def testing_func_with_long_name():', metadata={}),\n",
" Document(page_content='def bar():', metadata={})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e096d42",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
},
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
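
The Markdown, LaTeX, and Python notebooks above are deleted because their content is folded into the unified code_splitter notebook, while the corresponding classes are kept as thin wrappers (see the text_splitter.py diff below). A minimal sketch of the equivalence, under the new API:

from langchain.text_splitter import (
    Language,
    PythonCodeTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Both splitters end up with the same Python-specific separator list.
wrapper = PythonCodeTextSplitter(chunk_size=40, chunk_overlap=0)
generic = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON, chunk_size=40, chunk_overlap=0
)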
@ -293,6 +293,24 @@ class TokenTextSplitter(TextSplitter):
        return splits


class Language(str, Enum):
    CPP = "cpp"
    GO = "go"
    JAVA = "java"
    JS = "js"
    PHP = "php"
    PROTO = "proto"
    PYTHON = "python"
    RST = "rst"
    RUBY = "ruby"
    RUST = "rust"
    SCALA = "scala"
    SWIFT = "swift"
    MARKDOWN = "markdown"
    LATEX = "latex"
    HTML = "html"


class RecursiveCharacterTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at characters.

@ -350,205 +368,15 @@ class RecursiveCharacterTextSplitter(TextSplitter):
    def split_text(self, text: str) -> List[str]:
        return self._split_text(text, self._separators)

    @classmethod
    def from_language(
        cls, language: Language, **kwargs: Any
    ) -> RecursiveCharacterTextSplitter:
        separators = cls.get_separators_for_language(language)
        return cls(separators=separators, **kwargs)

class NLTKTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at sentences using NLTK."""

    def __init__(self, separator: str = "\n\n", **kwargs: Any):
        """Initialize the NLTK splitter."""
        super().__init__(**kwargs)
        try:
            from nltk.tokenize import sent_tokenize

            self._tokenizer = sent_tokenize
        except ImportError:
            raise ImportError(
                "NLTK is not installed, please install it with `pip install nltk`."
            )
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        # First we naively split the large input into a bunch of smaller ones.
        splits = self._tokenizer(text)
        return self._merge_splits(splits, self._separator)


class SpacyTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at sentences using Spacy."""

    def __init__(
        self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
    ):
        """Initialize the spacy text splitter."""
        super().__init__(**kwargs)
        try:
            import spacy
        except ImportError:
            raise ImportError(
                "Spacy is not installed, please install it with `pip install spacy`."
            )
        self._tokenizer = spacy.load(pipeline)
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        splits = (str(s) for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)


class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Markdown-formatted headings."""

    def __init__(self, **kwargs: Any):
        """Initialize a MarkdownTextSplitter."""
        separators = [
            # First, try to split along Markdown headings (starting with level 2)
            "\n## ",
            "\n### ",
            "\n#### ",
            "\n##### ",
            "\n###### ",
            # Note the alternative syntax for headings (below) is not handled here
            # Heading level 2
            # ---------------
            # End of code block
            "```\n\n",
            # Horizontal lines
            "\n\n***\n\n",
            "\n\n---\n\n",
            "\n\n___\n\n",
            # Note that this splitter doesn't handle horizontal lines defined
            # by *three or more* of ***, ---, or ___
            "\n\n",
            "\n",
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)


class LatexTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Latex-formatted layout elements."""

    def __init__(self, **kwargs: Any):
        """Initialize a LatexTextSplitter."""
        separators = [
            # First, try to split along Latex sections
            "\n\\chapter{",
            "\n\\section{",
            "\n\\subsection{",
            "\n\\subsubsection{",
            # Now split by environments
            "\n\\begin{enumerate}",
            "\n\\begin{itemize}",
            "\n\\begin{description}",
            "\n\\begin{list}",
            "\n\\begin{quote}",
            "\n\\begin{quotation}",
            "\n\\begin{verse}",
            "\n\\begin{verbatim}",
            ## Now split by math environments
            "\n\\begin{align}",
            "$$",
            "$",
            # Now split by the normal type of lines
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)


class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Python syntax."""

    def __init__(self, **kwargs: Any):
        """Initialize a PythonCodeTextSplitter."""
        separators = [
            # First, try to split along class definitions
            "\nclass ",
            "\ndef ",
            "\n\tdef ",
            # Now split by the normal type of lines
            "\n\n",
            "\n",
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)


class HtmlTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along HTML layout elements."""

    def __init__(self, **kwargs: Any):
        """Initialize a HtmlTextSplitter."""
        separators = [
            # First, try to split along HTML tags
            "<body>",
            "<div>",
            "<p>",
            "<br>",
            "<li>",
            "<h1>",
            "<h2>",
            "<h3>",
            "<h4>",
            "<h5>",
            "<h6>",
            "<span>",
            "<table>",
            "<tr>",
            "<td>",
            "<th>",
            "<ul>",
            "<ol>",
            "<header>",
            "<footer>",
            "<nav>",
            # Head
            "<head>",
            "<style>",
            "<script>",
            "<meta>",
            "<title>",
            "",
        ]
        super().__init__(separators=separators, **kwargs)


class Language(str, Enum):
    CPP = "cpp"
    GO = "go"
    JAVA = "java"
    JS = "js"
    PHP = "php"
    PROTO = "proto"
    PYTHON = "python"
    RST = "rst"
    RUBY = "ruby"
    RUST = "rust"
    SCALA = "scala"
    SWIFT = "swift"
    MARKDOWN = "markdown"
    LATEX = "latex"


class CodeTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, language: Language, **kwargs: Any):
        """
        A generic code text splitter supporting many programming languages.
        Example:
            splitter = CodeTextSplitter(
                language=Language.JAVA
            )
        Args:
            language: The programming language to use
        """
        separators = self._get_separators_for_language(language)
        super().__init__(separators=separators, **kwargs)

    def _get_separators_for_language(self, language: Language) -> List[str]:
    @staticmethod
    def get_separators_for_language(language: Language) -> List[str]:
        if language == Language.CPP:
            return [
                # Split along class definitions
@ -821,8 +649,114 @@ class CodeTextSplitter(RecursiveCharacterTextSplitter):
                " ",
                "",
            ]
        elif language == Language.HTML:
            return [
                # First, try to split along HTML tags
                "<body>",
                "<div>",
                "<p>",
                "<br>",
                "<li>",
                "<h1>",
                "<h2>",
                "<h3>",
                "<h4>",
                "<h5>",
                "<h6>",
                "<span>",
                "<table>",
                "<tr>",
                "<td>",
                "<th>",
                "<ul>",
                "<ol>",
                "<header>",
                "<footer>",
                "<nav>",
                # Head
                "<head>",
                "<style>",
                "<script>",
                "<meta>",
                "<title>",
                "",
            ]
        else:
            raise ValueError(
                f"Language {language} is not supported! "
                f"Please choose from {list(Language)}"
            )


class NLTKTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at sentences using NLTK."""

    def __init__(self, separator: str = "\n\n", **kwargs: Any):
        """Initialize the NLTK splitter."""
        super().__init__(**kwargs)
        try:
            from nltk.tokenize import sent_tokenize

            self._tokenizer = sent_tokenize
        except ImportError:
            raise ImportError(
                "NLTK is not installed, please install it with `pip install nltk`."
            )
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        # First we naively split the large input into a bunch of smaller ones.
        splits = self._tokenizer(text)
        return self._merge_splits(splits, self._separator)


class SpacyTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at sentences using Spacy."""

    def __init__(
        self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
    ):
        """Initialize the spacy text splitter."""
        super().__init__(**kwargs)
        try:
            import spacy
        except ImportError:
            raise ImportError(
                "Spacy is not installed, please install it with `pip install spacy`."
            )
        self._tokenizer = spacy.load(pipeline)
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        splits = (str(s) for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)


# For backwards compatibility
class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Python syntax."""

    def __init__(self, **kwargs: Any):
        """Initialize a PythonCodeTextSplitter."""
        separators = self.get_separators_for_language(Language.PYTHON)
        super().__init__(separators=separators, **kwargs)


class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Markdown-formatted headings."""

    def __init__(self, **kwargs: Any):
        """Initialize a MarkdownTextSplitter."""
        separators = self.get_separators_for_language(Language.MARKDOWN)
        super().__init__(separators=separators, **kwargs)


class LatexTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Latex-formatted layout elements."""

    def __init__(self, **kwargs: Any):
        """Initialize a LatexTextSplitter."""
        separators = self.get_separators_for_language(Language.LATEX)
        super().__init__(separators=separators, **kwargs)
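
A side effect of the refactor worth noting: get_separators_for_language moves from a private instance method on CodeTextSplitter to a public @staticmethod on RecursiveCharacterTextSplitter, so a language's separators can be inspected without constructing a splitter. A minimal sketch:

from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Look up the separators the splitter would use for Markdown input.
separators = RecursiveCharacterTextSplitter.get_separators_for_language(
    Language.MARKDOWN
)
print(separators)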
@ -4,7 +4,6 @@ import pytest
from langchain.docstore.document import Document
from langchain.text_splitter import (
    CharacterTextSplitter,
    CodeTextSplitter,
    Language,
    PythonCodeTextSplitter,
    RecursiveCharacterTextSplitter,
@ -202,8 +201,8 @@ CHUNK_SIZE = 16


def test_python_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.PYTHON, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.PYTHON, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
def hello_world():
@ -225,8 +224,8 @@ hello_world()


def test_golang_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.GO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.GO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
package main
@ -258,8 +257,8 @@ func main() {


def test_rst_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.RST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.RST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
Sample Document
@ -294,8 +293,8 @@ Lists


def test_proto_file_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.PROTO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.PROTO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
syntax = "proto3";
@ -328,8 +327,8 @@ message Person {


def test_javascript_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.JS, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.JS, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
function helloWorld() {
@ -354,8 +353,8 @@ helloWorld();


def test_java_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
public class HelloWorld {
@ -380,8 +379,8 @@ public class HelloWorld {


def test_cpp_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
#include <iostream>
@ -405,8 +404,8 @@ int main() {


def test_scala_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.SCALA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.SCALA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
object HelloWorld {
@ -430,8 +429,8 @@ object HelloWorld {


def test_ruby_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.RUBY, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.RUBY, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
def hello_world
@ -451,8 +450,8 @@ hello_world


def test_php_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.PHP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.PHP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
<?php
@ -478,8 +477,8 @@ hello_world();


def test_swift_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.SWIFT, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.SWIFT, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
func helloWorld() {
@ -500,8 +499,8 @@ helloWorld()


def test_rust_code_splitter() -> None:
    splitter = CodeTextSplitter(
        language=Language.RUST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.RUST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
fn main() {