mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 23:29:21 +00:00
docs: text splitters install (#18589)
This commit is contained in:
parent
dc81dba6cf
commit
080904689c
@ -4,6 +4,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "c95fcd15cd52c944",
|
"id": "c95fcd15cd52c944",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"outputs_hidden": false
|
"outputs_hidden": false
|
||||||
}
|
}
|
||||||
@ -17,6 +18,16 @@
|
|||||||
"#### 1) With an HTML string:"
|
"#### 1) With an HTML string:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2e55d44c-1fff-449a-bf52-0d6df488323f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain-text-splitters"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
@ -26,6 +37,7 @@
|
|||||||
"end_time": "2023-10-02T18:57:49.208965400Z",
|
"end_time": "2023-10-02T18:57:49.208965400Z",
|
||||||
"start_time": "2023-10-02T18:57:48.899756Z"
|
"start_time": "2023-10-02T18:57:48.899756Z"
|
||||||
},
|
},
|
||||||
|
"collapsed": false,
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"outputs_hidden": false
|
"outputs_hidden": false
|
||||||
}
|
}
|
||||||
@ -93,6 +105,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "e29b4aade2a0070c",
|
"id": "e29b4aade2a0070c",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"outputs_hidden": false
|
"outputs_hidden": false
|
||||||
}
|
}
|
||||||
@ -110,6 +123,7 @@
|
|||||||
"end_time": "2023-10-02T18:57:51.016141300Z",
|
"end_time": "2023-10-02T18:57:51.016141300Z",
|
||||||
"start_time": "2023-10-02T18:57:50.647495400Z"
|
"start_time": "2023-10-02T18:57:50.647495400Z"
|
||||||
},
|
},
|
||||||
|
"collapsed": false,
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"outputs_hidden": false
|
"outputs_hidden": false
|
||||||
}
|
}
|
||||||
@ -162,6 +176,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "ac0930371d79554a",
|
"id": "ac0930371d79554a",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"outputs_hidden": false
|
"outputs_hidden": false
|
||||||
}
|
}
|
||||||
@ -181,6 +196,7 @@
|
|||||||
"end_time": "2023-10-02T19:03:25.943524300Z",
|
"end_time": "2023-10-02T19:03:25.943524300Z",
|
||||||
"start_time": "2023-10-02T19:03:25.691641Z"
|
"start_time": "2023-10-02T19:03:25.691641Z"
|
||||||
},
|
},
|
||||||
|
"collapsed": false,
|
||||||
"jupyter": {
|
"jupyter": {
|
||||||
"outputs_hidden": false
|
"outputs_hidden": false
|
||||||
}
|
}
|
||||||
@ -227,7 +243,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.1"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -13,6 +13,16 @@
|
|||||||
"2. How the chunk size is measured: by number of characters."
|
"2. How the chunk size is measured: by number of characters."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bf8698ce-44b2-4944-b9a9-254344b537af",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain-text-splitters"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 3,
|
||||||
@ -138,7 +148,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.1"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -10,6 +10,16 @@
|
|||||||
"CodeTextSplitter allows you to split your code with multiple languages supported. Import enum `Language` and specify the language. \n"
|
"CodeTextSplitter allows you to split your code with multiple languages supported. Import enum `Language` and specify the language. \n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "9e4144de-d925-4d4c-91c3-685ef8baa57c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain-text-splitters"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
@ -579,7 +589,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.1"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -7,7 +7,6 @@ Once you've loaded documents, you'll often want to transform them to better suit
|
|||||||
is you may want to split a long document into smaller chunks that can fit into your model's context window. LangChain
|
is you may want to split a long document into smaller chunks that can fit into your model's context window. LangChain
|
||||||
has a number of built-in document transformers that make it easy to split, combine, filter, and otherwise manipulate documents.
|
has a number of built-in document transformers that make it easy to split, combine, filter, and otherwise manipulate documents.
|
||||||
|
|
||||||
|
|
||||||
When you want to deal with long pieces of text, it is necessary to split up that text into chunks.
|
When you want to deal with long pieces of text, it is necessary to split up that text into chunks.
|
||||||
As simple as this sounds, there is a lot of potential complexity here. Ideally, you want to keep the semantically related pieces of text together. What "semantically related" means could depend on the type of text.
|
As simple as this sounds, there is a lot of potential complexity here. Ideally, you want to keep the semantically related pieces of text together. What "semantically related" means could depend on the type of text.
|
||||||
This notebook showcases several ways to do that.
|
This notebook showcases several ways to do that.
|
||||||
@ -25,7 +24,7 @@ That means there are two different axes along which you can customize your text
|
|||||||
|
|
||||||
## Types of Text Splitters
|
## Types of Text Splitters
|
||||||
|
|
||||||
LangChain offers many different types of text splitters. Below is a table listing all of them, along with a few characteristics:
|
LangChain offers many different types of text splitters. These all live in the `langchain-text-splitters` package. Below is a table listing all of them, along with a few characteristics:
|
||||||
|
|
||||||
**Name**: Name of the text splitter
|
**Name**: Name of the text splitter
|
||||||
|
|
||||||
|
@ -38,6 +38,16 @@
|
|||||||
"Let's have a look at some examples below."
|
"Let's have a look at some examples below."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0cd11819-4d4e-4fc1-aa85-faf69d24db89",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain-text-splitters"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
@ -239,7 +249,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.1"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -13,6 +13,16 @@
|
|||||||
"2. How the chunk size is measured: by number of characters."
|
"2. How the chunk size is measured: by number of characters."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3f335e05-e5ae-44cc-899d-749aa9031a58",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain-text-splitters"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
@ -62,9 +72,7 @@
|
|||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 5,
|
||||||
"id": "f941aa56",
|
"id": "f941aa56",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"scrolled": false
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Recursively split json data - If you need to access/manipulate the smaller json chunks\n",
|
"# Recursively split json data - If you need to access/manipulate the smaller json chunks\n",
|
||||||
@ -217,7 +225,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.1"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -13,6 +13,16 @@
|
|||||||
"2. How the chunk size is measured: by number of characters."
|
"2. How the chunk size is measured: by number of characters."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "9c16167c-1e56-4e11-9b8b-60f93044498e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install -qU langchain-text-splitters"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
@ -119,7 +129,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.1"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -33,7 +33,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%pip install --upgrade --quiet tiktoken"
|
"%pip install --upgrade --quiet langchain-text-splitters tiktoken"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -620,7 +620,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.12"
|
"version": "3.9.1"
|
||||||
},
|
},
|
||||||
"vscode": {
|
"vscode": {
|
||||||
"interpreter": {
|
"interpreter": {
|
||||||
|
Loading…
Reference in New Issue
Block a user