mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 15:19:33 +00:00
docs: text splitters install (#18589)
This commit is contained in:
parent
dc81dba6cf
commit
080904689c
@ -4,6 +4,7 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "c95fcd15cd52c944",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
@ -17,6 +18,16 @@
|
||||
"#### 1) With an HTML string:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2e55d44c-1fff-449a-bf52-0d6df488323f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-text-splitters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@ -26,6 +37,7 @@
|
||||
"end_time": "2023-10-02T18:57:49.208965400Z",
|
||||
"start_time": "2023-10-02T18:57:48.899756Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
@ -93,6 +105,7 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "e29b4aade2a0070c",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
@ -110,6 +123,7 @@
|
||||
"end_time": "2023-10-02T18:57:51.016141300Z",
|
||||
"start_time": "2023-10-02T18:57:50.647495400Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
@ -162,6 +176,7 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "ac0930371d79554a",
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
@ -181,6 +196,7 @@
|
||||
"end_time": "2023-10-02T19:03:25.943524300Z",
|
||||
"start_time": "2023-10-02T19:03:25.691641Z"
|
||||
},
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
@ -227,7 +243,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -13,6 +13,16 @@
|
||||
"2. How the chunk size is measured: by number of characters."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bf8698ce-44b2-4944-b9a9-254344b537af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-text-splitters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
@ -138,7 +148,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -10,6 +10,16 @@
|
||||
"CodeTextSplitter allows you to split your code with multiple languages supported. Import enum `Language` and specify the language. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9e4144de-d925-4d4c-91c3-685ef8baa57c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-text-splitters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@ -579,7 +589,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -7,7 +7,6 @@ Once you've loaded documents, you'll often want to transform them to better suit
|
||||
is you may want to split a long document into smaller chunks that can fit into your model's context window. LangChain
|
||||
has a number of built-in document transformers that make it easy to split, combine, filter, and otherwise manipulate documents.
|
||||
|
||||
|
||||
When you want to deal with long pieces of text, it is necessary to split up that text into chunks.
|
||||
As simple as this sounds, there is a lot of potential complexity here. Ideally, you want to keep the semantically related pieces of text together. What "semantically related" means could depend on the type of text.
|
||||
This notebook showcases several ways to do that.
|
||||
@ -25,7 +24,7 @@ That means there are two different axes along which you can customize your text
|
||||
|
||||
## Types of Text Splitters
|
||||
|
||||
LangChain offers many different types of text splitters. Below is a table listing all of them, along with a few characteristics:
|
||||
LangChain offers many different types of text splitters. These all live in the `langchain-text-splitters` package. Below is a table listing all of them, along with a few characteristics:
|
||||
|
||||
**Name**: Name of the text splitter
|
||||
|
||||
|
@ -38,6 +38,16 @@
|
||||
"Let's have a look at some examples below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0cd11819-4d4e-4fc1-aa85-faf69d24db89",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-text-splitters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@ -239,7 +249,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -13,6 +13,16 @@
|
||||
"2. How the chunk size is measured: by number of characters."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3f335e05-e5ae-44cc-899d-749aa9031a58",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-text-splitters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@ -62,9 +72,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "f941aa56",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Recursively split json data - If you need to access/manipulate the smaller json chunks\n",
|
||||
@ -217,7 +225,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -13,6 +13,16 @@
|
||||
"2. How the chunk size is measured: by number of characters."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c16167c-1e56-4e11-9b8b-60f93044498e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-text-splitters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@ -119,7 +129,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.1"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -33,7 +33,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet tiktoken"
|
||||
"%pip install --upgrade --quiet langchain-text-splitters tiktoken"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -620,7 +620,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
"version": "3.9.1"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
Loading…
Reference in New Issue
Block a user