From 61e876aad8353891a3d68ae32286e30d82ccb1ae Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 25 Jan 2024 15:16:04 -0800 Subject: [PATCH] openai[patch]: Explicitly support embedding dimensions (#16596) --- .../integrations/text_embedding/openai.ipynb | 300 ++++++++++-------- .../langchain_community/llms/openai.py | 1 + .../langchain_openai/chat_models/azure.py | 2 + .../langchain_openai/embeddings/azure.py | 14 +- .../langchain_openai/embeddings/base.py | 41 +-- .../openai/langchain_openai/llms/azure.py | 3 +- libs/partners/openai/poetry.lock | 8 +- libs/partners/openai/pyproject.toml | 2 +- .../integration_tests/embeddings/test_base.py | 13 +- 9 files changed, 208 insertions(+), 176 deletions(-) diff --git a/docs/docs/integrations/text_embedding/openai.ipynb b/docs/docs/integrations/text_embedding/openai.ipynb index d44de992a62..effb05cd99a 100644 --- a/docs/docs/integrations/text_embedding/openai.ipynb +++ b/docs/docs/integrations/text_embedding/openai.ipynb @@ -10,9 +10,42 @@ "Let's load the OpenAI Embedding class." 
] }, + { + "cell_type": "markdown", + "id": "40ff98ff-58e9-4716-8788-227a5c3f473d", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "First we install langchain-openai and set the required env vars" + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, + "id": "c66c4613-6c67-40ca-b3b1-c026750d1742", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain-openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62e3710e-55a0-44fb-ba51-2f1d520dfc38", + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 1, "id": "0be1af71", "metadata": {}, "outputs": [], @@ -22,17 +55,17 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 5, "id": "2c66e5da", "metadata": {}, "outputs": [], "source": [ - "embeddings = OpenAIEmbeddings()" + "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\")" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 6, "id": "01370375", "metadata": {}, "outputs": [], @@ -40,33 +73,50 @@ "text = \"This is a test document.\"" ] }, + { + "cell_type": "markdown", + "id": "f012c222-3fa9-470a-935c-758b2048d9af", + "metadata": {}, + "source": [ + "## Usage\n", + "### Embed query" + ] + }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 7, "id": "bfb6142c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: model not found. 
Using cl100k_base encoding.\n" + ] + } + ], "source": [ "query_result = embeddings.embed_query(text)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 8, "id": "91bc875d-829b-4c3d-8e6f-fc2dda30a3bd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[-0.003186025367556387,\n", - " 0.011071979803637493,\n", - " -0.004020420763285827,\n", - " -0.011658221276953042,\n", - " -0.0010534035786864363]" + "[-0.014380056377383358,\n", + " -0.027191711627651764,\n", + " -0.020042716111860304,\n", + " 0.057301379620345545,\n", + " -0.022267658631828974]" ] }, - "execution_count": 32, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -75,33 +125,49 @@ "query_result[:5]" ] }, + { + "cell_type": "markdown", + "id": "6b733391-1e23-438b-a6bc-0d77eed9426e", + "metadata": {}, + "source": [ + "## Embed documents" + ] + }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 9, "id": "0356c3b7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: model not found. 
Using cl100k_base encoding.\n" + ] + } + ], "source": [ "doc_result = embeddings.embed_documents([text])" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 10, "id": "a4b0d49e-0c73-44b6-aed5-5b426564e085", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[-0.003186025367556387,\n", - " 0.011071979803637493,\n", - " -0.004020420763285827,\n", - " -0.011658221276953042,\n", - " -0.0010534035786864363]" + "[-0.014380056377383358,\n", + " -0.027191711627651764,\n", + " -0.020042716111860304,\n", + " 0.057301379620345545,\n", + " -0.022267658631828974]" ] }, - "execution_count": 34, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -112,131 +178,87 @@ }, { "cell_type": "markdown", - "id": "bb61bbeb", + "id": "e7dc464a-6fa2-4cff-ab2e-49a0566d819b", "metadata": {}, "source": [ - "Let's load the OpenAI Embedding class with first generation models (e.g. text-search-ada-doc-001/text-search-ada-query-001). Note: These are not recommended models - see [here](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c0b072cc", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import OpenAIEmbeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "a56b70f5", - "metadata": {}, - "outputs": [], - "source": [ - "embeddings = OpenAIEmbeddings(model=\"text-embedding-ada-002\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "14aefb64", - "metadata": {}, - "outputs": [], - "source": [ - "text = \"This is a test document.\"" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "3c39ed33", - "metadata": {}, - "outputs": [], - "source": [ - "query_result = embeddings.embed_query(text)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "2ee7ce9f-d506-4810-8897-e44334412714", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"[0.004452846988523035,\n", - " 0.034550655976098514,\n", - " -0.015029939040690051,\n", - " 0.03827273883655212,\n", - " 0.005785414075152477]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query_result[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "e3221db6", - "metadata": {}, - "outputs": [], - "source": [ - "doc_result = embeddings.embed_documents([text])" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "a0865409-3a6d-468f-939f-abde17c7cac3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0.004452846988523035,\n", - " 0.034550655976098514,\n", - " -0.015029939040690051,\n", - " 0.03827273883655212,\n", - " 0.005785414075152477]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "doc_result[0][:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aaad49f8", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", + "## Specify dimensions\n", "\n", - "# if you are behind an explicit proxy, you can use the OPENAI_PROXY environment variable to pass through\n", - "os.environ[\"OPENAI_PROXY\"] = \"http://proxy.yourcompany.com:8080\"" + "With the `text-embedding-3` class of models, you can specify the size of the embeddings you want returned. 
For example, by default `text-embedding-3-large` returns embeddings of dimension 3072:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f7be1e7b-54c6-4893-b8ad-b872e6705735", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3072" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(doc_result[0])" + ] + }, + { + "cell_type": "markdown", + "id": "33287142-0835-4958-962f-385ae4447431", + "metadata": {}, + "source": [ + "But by passing in `dimensions=1024` we can reduce the size of our embeddings to 1024:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "854ee772-2de9-4a83-84e0-908033d98e4e", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_1024 = OpenAIEmbeddings(model=\"text-embedding-3-large\", dimensions=1024)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3b464396-8d94-478b-8329-849b56e1ae23", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: model not found. 
Using cl100k_base encoding.\n" + ] + }, + { + "data": { + "text/plain": [ + "1024" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(embeddings_1024.embed_documents([text])[0])" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "poetry-venv", "language": "python", - "name": "python3" + "name": "poetry-venv" }, "language_info": { "codemirror_mode": { @@ -248,7 +270,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.1" }, "vscode": { "interpreter": { diff --git a/libs/community/langchain_community/llms/openai.py b/libs/community/langchain_community/llms/openai.py index d9eb6b5d367..7f0aa075482 100644 --- a/libs/community/langchain_community/llms/openai.py +++ b/libs/community/langchain_community/llms/openai.py @@ -770,6 +770,7 @@ class AzureOpenAI(BaseOpenAI): .. code-block:: python from langchain_community.llms import AzureOpenAI + openai = AzureOpenAI(model_name="gpt-3.5-turbo-instruct") """ diff --git a/libs/partners/openai/langchain_openai/chat_models/azure.py b/libs/partners/openai/langchain_openai/chat_models/azure.py index 149a5a66c91..5d587205474 100644 --- a/libs/partners/openai/langchain_openai/chat_models/azure.py +++ b/libs/partners/openai/langchain_openai/chat_models/azure.py @@ -35,6 +35,8 @@ class AzureChatOpenAI(ChatOpenAI): .. 
code-block:: python + + from langchain_openai import AzureChatOpenAI + AzureChatOpenAI( azure_deployment="35-turbo-dev", openai_api_version="2023-05-15", diff --git a/libs/partners/openai/langchain_openai/embeddings/azure.py b/libs/partners/openai/langchain_openai/embeddings/azure.py index dd99cfaaf65..ca0221252be 100644 --- a/libs/partners/openai/langchain_openai/embeddings/azure.py +++ b/libs/partners/openai/langchain_openai/embeddings/azure.py @@ -12,7 +12,19 @@ from langchain_openai.embeddings.base import OpenAIEmbeddings class AzureOpenAIEmbeddings(OpenAIEmbeddings): - """`Azure OpenAI` Embeddings API.""" + """`Azure OpenAI` Embeddings API. + + To use, you should have the + environment variable ``AZURE_OPENAI_API_KEY`` set with your API key or pass it + as a named parameter to the constructor. + + Example: + .. code-block:: python + + from langchain_openai import AzureOpenAIEmbeddings + + openai = AzureOpenAIEmbeddings(model="text-embedding-3-large") + """ azure_endpoint: Union[str, None] = None """Your Azure endpoint, including the resource. diff --git a/libs/partners/openai/langchain_openai/embeddings/base.py b/libs/partners/openai/langchain_openai/embeddings/base.py index 5a4ead9d244..0e3c8e3eac5 100644 --- a/libs/partners/openai/langchain_openai/embeddings/base.py +++ b/libs/partners/openai/langchain_openai/embeddings/base.py @@ -38,41 +38,23 @@ class OpenAIEmbeddings(BaseModel, Embeddings): Example: .. code-block:: python - from langchain_community.embeddings import OpenAIEmbeddings - openai = OpenAIEmbeddings(openai_api_key="my-api-key") + from langchain_openai import OpenAIEmbeddings - In order to use the library with Microsoft Azure endpoints, you need to set - the OPENAI_API_TYPE, OPENAI_API_BASE, OPENAI_API_KEY and OPENAI_API_VERSION. - The OPENAI_API_TYPE must be set to 'azure' and the others correspond to - the properties of your endpoint. - In addition, the deployment name must be passed as the model parameter. 
+ openai = OpenAIEmbeddings(model="text-embedding-3-large") - Example: - .. code-block:: python - - import os - - os.environ["OPENAI_API_TYPE"] = "azure" - os.environ["OPENAI_API_BASE"] = "https:// Dict[str, Any]: - return {"model": self.model, **self.model_kwargs} + params: Dict = {"model": self.model, **self.model_kwargs} + if self.dimensions is not None: + params["dimensions"] = self.dimensions + return params # please refer to # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb diff --git a/libs/partners/openai/langchain_openai/llms/azure.py b/libs/partners/openai/langchain_openai/llms/azure.py index 43b3a8335dc..d719c609015 100644 --- a/libs/partners/openai/langchain_openai/llms/azure.py +++ b/libs/partners/openai/langchain_openai/llms/azure.py @@ -32,7 +32,8 @@ class AzureOpenAI(BaseOpenAI): Example: .. code-block:: python - from langchain_community.llms import AzureOpenAI + from langchain_openai import AzureOpenAI + openai = AzureOpenAI(model_name="gpt-3.5-turbo-instruct") """ diff --git a/libs/partners/openai/poetry.lock b/libs/partners/openai/poetry.lock index a3c1c5a71e7..ee9e59a1249 100644 --- a/libs/partners/openai/poetry.lock +++ b/libs/partners/openai/poetry.lock @@ -457,13 +457,13 @@ files = [ [[package]] name = "openai" -version = "1.6.1" +version = "1.10.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.6.1-py3-none-any.whl", hash = "sha256:bc9f774838d67ac29fb24cdeb2d58faf57de8b311085dcd1348f7aa02a96c7ee"}, - {file = "openai-1.6.1.tar.gz", hash = "sha256:d553ca9dbf9486b08e75b09e8671e4f638462aaadccfced632bf490fc3d75fa2"}, + {file = "openai-1.10.0-py3-none-any.whl", hash = "sha256:aa69e97d0223ace9835fbf9c997abe9ee95318f684fd2de6d02c870700c71ebc"}, + {file = "openai-1.10.0.tar.gz", hash = "sha256:208886cb501b930dc63f48d51db9c15e5380380f80516d07332adad67c9f1053"}, ] [package.dependencies] @@ -1147,4 +1147,4 @@ watchmedo = 
["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "9f4b19ea531b89f5c5390782b0b205512317db0c7ec3e81c1143f1b9a146fb42" +content-hash = "689f74ee7854ade754369fd7b42f70a60ec167ee68161825b2e128324afbd90b" diff --git a/libs/partners/openai/pyproject.toml b/libs/partners/openai/pyproject.toml index 90bf94dc339..28a10daeb3b 100644 --- a/libs/partners/openai/pyproject.toml +++ b/libs/partners/openai/pyproject.toml @@ -13,7 +13,7 @@ license = "MIT" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" langchain-core = ">=0.1.16,<0.2" -openai = "^1.6.1" +openai = "^1.10.0" numpy = "^1" tiktoken = "^0.5.2" diff --git a/libs/partners/openai/tests/integration_tests/embeddings/test_base.py b/libs/partners/openai/tests/integration_tests/embeddings/test_base.py index a68715ab81a..e63e77b5a9e 100644 --- a/libs/partners/openai/tests/integration_tests/embeddings/test_base.py +++ b/libs/partners/openai/tests/integration_tests/embeddings/test_base.py @@ -3,7 +3,7 @@ from langchain_openai.embeddings.base import OpenAIEmbeddings def test_langchain_openai_embedding_documents() -> None: - """Test cohere embeddings.""" + """Test openai embeddings.""" documents = ["foo bar"] embedding = OpenAIEmbeddings() output = embedding.embed_documents(documents) @@ -12,8 +12,17 @@ def test_langchain_openai_embedding_documents() -> None: def test_langchain_openai_embedding_query() -> None: - """Test cohere embeddings.""" + """Test openai embeddings.""" document = "foo bar" embedding = OpenAIEmbeddings() output = embedding.embed_query(document) assert len(output) > 0 + + +def test_langchain_openai_embeddings_dimensions() -> None: + """Test openai embeddings.""" + documents = ["foo bar"] + embedding = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=128) + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 128