From 080904689ccdba60c6f50b0bffb2032d23fba0b2 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Tue, 5 Mar 2024 16:19:37 -0800 Subject: [PATCH] docs: text splitters install (#18589) --- .../HTML_header_metadata.ipynb | 18 +++++++++++++++++- .../character_text_splitter.ipynb | 12 +++++++++++- .../document_transformers/code_splitter.ipynb | 12 +++++++++++- .../document_transformers/index.mdx | 3 +-- .../markdown_header_metadata.ipynb | 12 +++++++++++- .../recursive_json_splitter.ipynb | 16 ++++++++++++---- .../recursive_text_splitter.ipynb | 12 +++++++++++- .../document_transformers/split_by_token.ipynb | 4 ++-- 8 files changed, 76 insertions(+), 13 deletions(-) diff --git a/docs/docs/modules/data_connection/document_transformers/HTML_header_metadata.ipynb b/docs/docs/modules/data_connection/document_transformers/HTML_header_metadata.ipynb index 85bf37b9cac..067d313cf05 100644 --- a/docs/docs/modules/data_connection/document_transformers/HTML_header_metadata.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/HTML_header_metadata.ipynb @@ -4,6 +4,7 @@ "cell_type": "markdown", "id": "c95fcd15cd52c944", "metadata": { + "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -17,6 +18,16 @@ "#### 1) With an HTML string:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e55d44c-1fff-449a-bf52-0d6df488323f", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain-text-splitters" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -26,6 +37,7 @@ "end_time": "2023-10-02T18:57:49.208965400Z", "start_time": "2023-10-02T18:57:48.899756Z" }, + "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -93,6 +105,7 @@ "cell_type": "markdown", "id": "e29b4aade2a0070c", "metadata": { + "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -110,6 +123,7 @@ "end_time": "2023-10-02T18:57:51.016141300Z", "start_time": "2023-10-02T18:57:50.647495400Z" }, + "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -162,6 +176,7 @@ "cell_type": "markdown", "id": "ac0930371d79554a", "metadata": { + "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -181,6 +196,7 @@ "end_time": "2023-10-02T19:03:25.943524300Z", "start_time": "2023-10-02T19:03:25.691641Z" }, + "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -227,7 +243,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/docs/modules/data_connection/document_transformers/character_text_splitter.ipynb b/docs/docs/modules/data_connection/document_transformers/character_text_splitter.ipynb index b4bebd2e01a..ef02299b5cc 100644 --- a/docs/docs/modules/data_connection/document_transformers/character_text_splitter.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/character_text_splitter.ipynb @@ -13,6 +13,16 @@ "2. How the chunk size is measured: by number of characters." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf8698ce-44b2-4944-b9a9-254344b537af", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain-text-splitters" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -138,7 +148,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb b/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb index 04729a409c8..1d91f2877a7 100644 --- a/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb @@ -10,6 +10,16 @@ "CodeTextSplitter allows you to split your code with multiple languages supported. Import enum `Language` and specify the language. \n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e4144de-d925-4d4c-91c3-685ef8baa57c", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain-text-splitters" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -579,7 +589,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/docs/modules/data_connection/document_transformers/index.mdx b/docs/docs/modules/data_connection/document_transformers/index.mdx index 7064212d17d..d0e5ee20636 100644 --- a/docs/docs/modules/data_connection/document_transformers/index.mdx +++ b/docs/docs/modules/data_connection/document_transformers/index.mdx @@ -7,7 +7,6 @@ Once you've loaded documents, you'll often want to transform them to better suit is you may want to split a long document into smaller chunks that can fit into your model's context window. LangChain has a number of built-in document transformers that make it easy to split, combine, filter, and otherwise manipulate documents. - When you want to deal with long pieces of text, it is necessary to split up that text into chunks. As simple as this sounds, there is a lot of potential complexity here. Ideally, you want to keep the semantically related pieces of text together. What "semantically related" means could depend on the type of text. This notebook showcases several ways to do that. @@ -25,7 +24,7 @@ That means there are two different axes along which you can customize your text ## Types of Text Splitters -LangChain offers many different types of text splitters. Below is a table listing all of them, along with a few characteristics: +LangChain offers many different types of text splitters. These all live in the `langchain-text-splitters` package. Below is a table listing all of them, along with a few characteristics: **Name**: Name of the text splitter diff --git a/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb b/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb index 6f4880379d3..de72ec5317f 100644 --- a/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb @@ -38,6 +38,16 @@ "Let's have a look at some examples below." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cd11819-4d4e-4fc1-aa85-faf69d24db89", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain-text-splitters" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -239,7 +249,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/docs/modules/data_connection/document_transformers/recursive_json_splitter.ipynb b/docs/docs/modules/data_connection/document_transformers/recursive_json_splitter.ipynb index 6148ddf7386..39cb3d274b2 100644 --- a/docs/docs/modules/data_connection/document_transformers/recursive_json_splitter.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/recursive_json_splitter.ipynb @@ -13,6 +13,16 @@ "2. How the chunk size is measured: by number of characters." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f335e05-e5ae-44cc-899d-749aa9031a58", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain-text-splitters" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -62,9 +72,7 @@ "cell_type": "code", "execution_count": 5, "id": "f941aa56", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "# Recursively split json data - If you need to access/manipulate the smaller json chunks\n", @@ -217,7 +225,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/docs/modules/data_connection/document_transformers/recursive_text_splitter.ipynb b/docs/docs/modules/data_connection/document_transformers/recursive_text_splitter.ipynb index f57b6bd3aae..1808db78d6a 100644 --- a/docs/docs/modules/data_connection/document_transformers/recursive_text_splitter.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/recursive_text_splitter.ipynb @@ -13,6 +13,16 @@ "2. How the chunk size is measured: by number of characters." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c16167c-1e56-4e11-9b8b-60f93044498e", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain-text-splitters" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -119,7 +129,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/docs/modules/data_connection/document_transformers/split_by_token.ipynb b/docs/docs/modules/data_connection/document_transformers/split_by_token.ipynb index 50a5d59ed6c..0d975c14bcc 100644 --- a/docs/docs/modules/data_connection/document_transformers/split_by_token.ipynb +++ b/docs/docs/modules/data_connection/document_transformers/split_by_token.ipynb @@ -33,7 +33,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade --quiet tiktoken" + "%pip install --upgrade --quiet langchain-text-splitters tiktoken" ] }, { @@ -620,7 +620,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.1" }, "vscode": { "interpreter": {