From 809a9f485f577499c67386697d3efaf167416cf6 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Fri, 16 Dec 2022 07:42:31 -0800 Subject: [PATCH] Harrison/new version (#362) --- docs/examples/integrations/textsplitter.ipynb | 60 ++++++++++++++++--- langchain/llms/openai.py | 4 ++ pyproject.toml | 2 +- 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/docs/examples/integrations/textsplitter.ipynb b/docs/examples/integrations/textsplitter.ipynb index 5f57d110c26..3103b4c6d26 100644 --- a/docs/examples/integrations/textsplitter.ipynb +++ b/docs/examples/integrations/textsplitter.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "e82c4685", "metadata": {}, "outputs": [], @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "id": "79ff6737", "metadata": {}, "outputs": [], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "id": "38547666", "metadata": {}, "outputs": [ @@ -67,7 +67,7 @@ "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. 
\\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \\n\\nGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. '" ] }, - "execution_count": 8, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "id": "a8ce51d5", "metadata": {}, "outputs": [ @@ -108,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "id": "ca5e72c0", "metadata": {}, "outputs": [], @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "id": "37cdfbeb", "metadata": {}, "outputs": [ @@ -143,6 +143,52 @@ "print(texts[0])" ] }, + { + "cell_type": "markdown", + "id": "7683b36a", + "metadata": {}, + "source": [ + "## tiktoken (OpenAI) Length Function\n", + "You can also use tiktoken, an open source tokenizer package from OpenAI, to estimate tokens used. It will probably be more accurate for their models." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "825f7c0a", + "metadata": {}, + "outputs": [], + "source": [ + "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)\n", + "texts = text_splitter.split_text(state_of_the_union)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ae35d165", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n", + "\n", + "Last year COVID-19 kept us apart. This year we are finally together again. \n", + "\n", + "Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 
\n", + "\n", + "With a duty to one another to the American people to the Constitution. \n", + "\n", + "And with an unwavering resolve that freedom will always triumph over tyranny. \n" + ] + } + ], + "source": [ + "print(texts[0])" + ] + }, { "cell_type": "markdown", "id": "ea2973ac", diff --git a/langchain/llms/openai.py b/langchain/llms/openai.py index c639db63e1e..2b02059514f 100644 --- a/langchain/llms/openai.py +++ b/langchain/llms/openai.py @@ -1,4 +1,5 @@ """Wrapper around OpenAI APIs.""" +import sys from typing import Any, Dict, List, Mapping, Optional from pydantic import BaseModel, Extra, Field, root_validator @@ -188,6 +189,9 @@ class OpenAI(LLM, BaseModel): def get_num_tokens(self, text: str) -> int: """Calculate num tokens with tiktoken package.""" + # tiktoken NOT supported for Python 3.8 or below + if sys.version_info[1] <= 8: + return super().get_num_tokens(text) try: import tiktoken except ImportError: diff --git a/pyproject.toml b/pyproject.toml index 885e78d7079..be4705dcf4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain" -version = "0.0.38" +version = "0.0.39" description = "Building applications with LLMs through composability" authors = [] license = "MIT"