From e00cc74926863ca2f478b2a6735388f84deeca7d Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev
Date: Fri, 26 Jul 2024 10:29:06 -0400
Subject: [PATCH] docs[minor]: Add how to guide for rate limiting a chat model
 (#24686)

Add how-to guide for rate limiting a chat model.
---
 .../how_to/chat_model_rate_limiting.ipynb | 146 ++++++++++++++++++
 docs/docs/how_to/index.mdx                |   1 +
 2 files changed, 147 insertions(+)
 create mode 100644 docs/docs/how_to/chat_model_rate_limiting.ipynb

diff --git a/docs/docs/how_to/chat_model_rate_limiting.ipynb b/docs/docs/how_to/chat_model_rate_limiting.ipynb
new file mode 100644
index 00000000000..d34084becbe
--- /dev/null
+++ b/docs/docs/how_to/chat_model_rate_limiting.ipynb
@@ -0,0 +1,146 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "dcf87b32",
+   "metadata": {},
+   "source": [
+    "# How to handle rate limits\n",
+    "\n",
+    ":::info Prerequisites\n",
+    "\n",
+    "This guide assumes familiarity with the following concepts:\n",
+    "- [Chat models](/docs/concepts/#chat-models)\n",
+    "- [LLMs](/docs/concepts/#llms)\n",
+    ":::\n",
+    "\n",
+    "\n",
+    "You may find yourself in a situation where you are getting rate limited by the model provider API because you're making too many requests.\n",
+    "\n",
+    "For example, this might happen if you are running many parallel queries to benchmark the chat model on a test dataset.\n",
+    "\n",
+    "If you are facing such a situation, you can use a rate limiter to help match the rate at which you're making requests to the rate allowed\n",
+    "by the API.\n",
+    "\n",
+    ":::info Requires ``langchain-core >= 0.2.24``\n",
+    "\n",
+    "This functionality was added in ``langchain-core == 0.2.24``. Please make sure your package is up to date.\n",
+    ":::"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cbc3c873-6109-4e03-b775-b73c1003faea",
+   "metadata": {},
+   "source": [
+    "## Initialize a rate limiter\n",
+    "\n",
+    "LangChain comes with a built-in in-memory rate limiter. This rate limiter is thread safe and can be shared by multiple threads in the same process.\n",
+    "\n",
+    "The provided rate limiter can only limit the number of requests per unit time. It will not help if you also need to limit based on the size\n",
+    "of the requests."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "aa9c3c8c-0464-4190-a8c5-d69d173505a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.rate_limiters import InMemoryRateLimiter\n",
+    "\n",
+    "rate_limiter = InMemoryRateLimiter(\n",
+    "    requests_per_second=0.1,  # <-- Super slow! We can only make a request once every 10 seconds!!\n",
+    "    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether we're allowed to make a request.\n",
+    "    max_bucket_size=10,  # Controls the maximum burst size.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e058bde-9413-4b08-8cc6-0c9cb638f19f",
+   "metadata": {},
+   "source": [
+    "## Choose a model\n",
+    "\n",
+    "Choose any model and pass the rate limiter to it via the `rate_limiter` attribute.\n",
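+    "\n",
+    "Since the limiter is thread safe, a single instance can also be shared across models so that their combined request rate stays within the limit. A minimal sketch, assuming `langchain-openai` is installed (the model names below are illustrative):\n",
+    "\n",
+    "```python\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "# Both models draw from the same bucket, so together they make\n",
+    "# at most one request every 10 seconds.\n",
+    "model_a = ChatOpenAI(model=\"gpt-4o-mini\", rate_limiter=rate_limiter)\n",
+    "model_b = ChatOpenAI(model=\"gpt-4o\", rate_limiter=rate_limiter)\n",
+    "```"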
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0f880a3a-c047-4e94-a323-fff2a4c0e96d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "from getpass import getpass\n",
+    "\n",
+    "from langchain_anthropic import ChatAnthropic\n",
+    "\n",
+    "if \"ANTHROPIC_API_KEY\" not in os.environ:\n",
+    "    os.environ[\"ANTHROPIC_API_KEY\"] = getpass()\n",
+    "\n",
+    "model = ChatAnthropic(model_name=\"claude-3-opus-20240229\", rate_limiter=rate_limiter)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "80c9ab3a-299a-460f-985c-90280a046f52",
+   "metadata": {},
+   "source": [
+    "Let's confirm that the rate limiter works. We should only be able to invoke the model once every 10 seconds."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d074265c-9f32-4c5f-b914-944148993c4d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "11.599073648452759\n",
+      "10.7502121925354\n",
+      "10.244257926940918\n",
+      "8.83088755607605\n",
+      "11.645203590393066\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Time five sequential calls; each should take roughly 10 seconds.\n",
+    "for _ in range(5):\n",
+    "    tic = time.time()\n",
+    "    model.invoke(\"hello\")\n",
+    "    toc = time.time()\n",
+    "    print(toc - tic)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

diff --git a/docs/docs/how_to/index.mdx b/docs/docs/how_to/index.mdx
index 3a4196d0917..8a113e03a61 100644
--- a/docs/docs/how_to/index.mdx
+++ b/docs/docs/how_to/index.mdx
@@ -83,6 +83,7 @@ These are the core building blocks you can use when building applications.
 - [How to: track response metadata across providers](/docs/how_to/response_metadata)
 - [How to: use chat model to call tools](/docs/how_to/tool_calling)
 - [How to: stream tool calls](/docs/how_to/tool_streaming)
+- [How to: handle rate limits](/docs/how_to/chat_model_rate_limiting)
 - [How to: few shot prompt tool behavior](/docs/how_to/tools_few_shot)
 - [How to: bind model-specific formatted tools](/docs/how_to/tools_model_specific)
 - [How to: force a specific tool call](/docs/how_to/tool_choice)