From 92a91e54fbdad66a45fca36e288f919dbaf33c6f Mon Sep 17 00:00:00 2001 From: vowelparrot <130414180+vowelparrot@users.noreply.github.com> Date: Wed, 3 May 2023 17:28:55 -0700 Subject: [PATCH] Rerun --- .../evaluating_traced_examples.ipynb | 658 +++++++++--------- 1 file changed, 333 insertions(+), 325 deletions(-) diff --git a/docs/use_cases/evaluation/evaluating_traced_examples.ipynb b/docs/use_cases/evaluation/evaluating_traced_examples.ipynb index e54bbc9c373..15d785f50ad 100644 --- a/docs/use_cases/evaluation/evaluating_traced_examples.ipynb +++ b/docs/use_cases/evaluation/evaluating_traced_examples.ipynb @@ -1,327 +1,335 @@ { - "cells": [{ - "cell_type": "markdown", - "id": "1a4596ea-a631-416d-a2a4-3577c140493d", - "metadata": {}, - "source": [ - "## Running Chains on Traced Datasets\n", - "\n", - "Developing applications with language models can be uniquely challenging. To manage this complexity and ensure reliable performance, LangChain provides tracing and evaluation functionality through . This notebook demonstrates how to run Chains, which are language model functions, on previously captured datasets or traces. Some common use cases for this approach include:\n", - "\n", - "- Running an evaluation chain to grade previous runs.\n", - "- Comparing different chains, LLMs, and agents on traced datasets.\n", - "- Executing a stochastic chain multiple times over a dataset to generate metrics before deployment.\n", - "\n", - "Please note that this notebook assumes you have LangChain+ tracing running in the background. 
To set it up, follow the [tracing directions here](..\\/..\\/tracing\\/local_installation.md).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "904db9a5-f387-4a57-914c-c8af8d39e249", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.client import LangChainClient\n", - "\n", - "client = LangChainClient()" - ] - }, - { - "cell_type": "markdown", - "id": "db79dea2-fbaa-4c12-9083-f6154b51e2d3", - "metadata": {}, - "source": [ - "## Seed an example dataset\n", - "\n", - "If you have been using LangChainPlus already, you may have datasets available. You can generate these from `Run`'s captured through the LangChain tracing API.\n", - "\n", - "```\n", - "datasets = client.list_datasets()\n", - "datasets\n", - "```\n", - "\n", - "Assuming you're running locally for the first time, you may not have runs stored, so we'll first upload an example evaluation dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1baa677c-5642-4378-8e01-3aa1647f19d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# !pip install datasets > /dev/null\n", - "# !pip install pandas > /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "60d14593-c61f-449f-a38f-772ca43707c2", - "metadata": { - "tags": [] - }, - "outputs": [{ - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset json (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-search-calculator-8a025c0ce5fb99d2/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e84b657581c0444ebfb677a5644c4f79", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
inputoutput
0How many people live in canada as of 2023?approximately 38,625,801
1who is dua lipa's boyfriend? what is his age r...her boyfriend is Romain Gravas. his age raised...
2what is dua lipa's boyfriend age raised to the...her boyfriend is Romain Gravas. his age raised...
3how far is it from paris to boston in milesapproximately 3,435 mi
4what was the total number of points scored in ...approximately 2.682651500990882
\n", - "" - ], - "text/plain": [ - " input \\\n", - "0 How many people live in canada as of 2023? \n", - "1 who is dua lipa's boyfriend? what is his age r... \n", - "2 what is dua lipa's boyfriend age raised to the... \n", - "3 how far is it from paris to boston in miles \n", - "4 what was the total number of points scored in ... \n", - "\n", - " output \n", - "0 approximately 38,625,801 \n", - "1 her boyfriend is Romain Gravas. his age raised... \n", - "2 her boyfriend is Romain Gravas. his age raised... \n", - "3 approximately 3,435 mi \n", - "4 approximately 2.682651500990882 " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "from langchain.evaluation.loading import load_dataset\n", - "\n", - "dataset = load_dataset(\"agent-search-calculator\")\n", - "df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n", - "df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key \n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f27e45f1-e299-4de8-a538-ee1272ac5024", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dataset_name = f\"calculator_example.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "52a7ea76-79ca-4765-abf7-231e884040d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n", - " dataset = client.upload_dataframe(df, \n", - " name=dataset_name,\n", - " description=\"Acalculator example dataset\",\n", - " input_keys=[\"input\"],\n", - " output_keys=[\"output\"],\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "07885b10", - "metadata": { - "tags": [] - }, - "source": [ - "## Running a Chain on a Traced Dataset\n", - "\n", - "Once you have a dataset, you can run a chain over it to see its results. 
The run traces will automatically be associated with the dataset for easy attribution and analysis.\n", - "\n", - "**First, we'll define the chain we wish to run over the dataset.**\n", - "\n", - "In this case, we're using an agent, but it can be any simple chain." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c2b59104-b90e-466a-b7ea-c5bd0194263b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.agents import initialize_agent, load_tools\n", - "from langchain.agents import AgentType\n", - "\n", - "llm = ChatOpenAI(temperature=0)\n", - "tools = load_tools(['serpapi', 'llm-math'], llm=llm)\n", - "agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False)" - ] - }, - { - "cell_type": "markdown", - "id": "84094a4a-1d76-461c-bc37-8c537939b466", - "metadata": {}, - "source": [ - "**Now we're ready to run the chain!**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33", - "metadata": { - "tags": [] - }, - "outputs": [{ - "name": "stderr", - "output_type": "stream", - "text": [ - "Chain failed for example e290c4f5-0d56-4848-91e6-d794cfeed276. Error: 'age'. Please try again with a valid numerical expression\n", - "Chain failed for example a933211c-83c0-43c4-b22f-40e088263849. Error: invalid syntax. Perhaps you forgot a comma? (, line 1). Please try again with a valid numerical expression\n" - ] - }], - "source": [ - "chain_results = await client.arun_chain_on_dataset(\n", - " dataset_name=dataset_name,\n", - " chain=agent,\n", - " batch_size=5, # Optional, sets the number of examples to run at a time\n", - " session_name=\"Calculator Dataset Runs\", # Optional. 
Will be sent to the 'default' session otherwise\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "136db492-d6ca-4215-96f9-439c23538241", - "metadata": { - "tags": [] - }, - "outputs": [{ - "data": { - "text/markdown": [ - "[Click here to visit the local LangChain+ Server](http://localhost/)" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }], - "source": [ - "from IPython.display import display, Markdown\n", - "\n", - "display(Markdown('[Click here to visit the local LangChain+ Server](http://localhost/)'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e5583cf2-0bc9-44d0-987a-b429f8f4e67d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } + "cells": [ + { + "cell_type": "markdown", + "id": "1a4596ea-a631-416d-a2a4-3577c140493d", + "metadata": {}, + "source": [ + "## Running Chains on Traced Datasets\n", + "\n", + "Developing applications with language models can be uniquely challenging. To manage this complexity and ensure reliable performance, LangChain provides tracing and evaluation functionality through LangChain+. This notebook demonstrates how to run Chains, which are language model functions, on previously captured datasets or traces. 
Some common use cases for this approach include:\n", + "\n", + "- Running an evaluation chain to grade previous runs.\n", + "- Comparing different chains, LLMs, and agents on traced datasets.\n", + "- Executing a stochastic chain multiple times over a dataset to generate metrics before deployment.\n", + "\n", + "Please note that this notebook assumes you have LangChain+ tracing running in the background. To set it up, follow the [tracing directions here](..\\/..\\/tracing\\/local_installation.md).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "904db9a5-f387-4a57-914c-c8af8d39e249", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.client import LangChainClient\n", + "\n", + "client = LangChainClient()" + ] + }, + { + "cell_type": "markdown", + "id": "db79dea2-fbaa-4c12-9083-f6154b51e2d3", + "metadata": {}, + "source": [ + "## Seed an example dataset\n", + "\n", + "If you have been using LangChain+ already, you may have datasets available. You can generate these from `Run`s captured through the LangChain tracing API.\n", + "\n", + "```\n", + "datasets = client.list_datasets()\n", + "datasets\n", + "```\n", + "\n", + "Assuming you're running locally for the first time, you may not have runs stored, so we'll first upload an example evaluation dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1baa677c-5642-4378-8e01-3aa1647f19d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# !pip install datasets > /dev/null\n", + "# !pip install pandas > /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "60d14593-c61f-449f-a38f-772ca43707c2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset json (/Users/vwp/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-search-calculator-8a025c0ce5fb99d2/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9d623c0e9719474a96241657e50ee6cc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
inputoutput
0How many people live in canada as of 2023?approximately 38,625,801
1who is dua lipa's boyfriend? what is his age r...her boyfriend is Romain Gravas. his age raised...
2what is dua lipa's boyfriend age raised to the...her boyfriend is Romain Gravas. his age raised...
3how far is it from paris to boston in milesapproximately 3,435 mi
4what was the total number of points scored in ...approximately 2.682651500990882
\n", + "" + ], + "text/plain": [ + " input \\\n", + "0 How many people live in canada as of 2023? \n", + "1 who is dua lipa's boyfriend? what is his age r... \n", + "2 what is dua lipa's boyfriend age raised to the... \n", + "3 how far is it from paris to boston in miles \n", + "4 what was the total number of points scored in ... \n", + "\n", + " output \n", + "0 approximately 38,625,801 \n", + "1 her boyfriend is Romain Gravas. his age raised... \n", + "2 her boyfriend is Romain Gravas. his age raised... \n", + "3 approximately 3,435 mi \n", + "4 approximately 2.682651500990882 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from langchain.evaluation.loading import load_dataset\n", + "\n", + "dataset = load_dataset(\"agent-search-calculator\")\n", + "df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n", + "df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key \n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f27e45f1-e299-4de8-a538-ee1272ac5024", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dataset_name = f\"calculator_example.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "52a7ea76-79ca-4765-abf7-231e884040d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n", + " dataset = client.upload_dataframe(df, \n", + " name=dataset_name,\n", + " description=\"Acalculator example dataset\",\n", + " input_keys=[\"input\"],\n", + " output_keys=[\"output\"],\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "07885b10", + "metadata": { + "tags": [] + }, + "source": [ + "## Running a Chain on a Traced Dataset\n", + "\n", + "Once you have a dataset, you can run a chain over it to see its results. 
The run traces will automatically be associated with the dataset for easy attribution and analysis.\n", + "\n", + "**First, we'll define the chain we wish to run over the dataset.**\n", + "\n", + "In this case, we're using an agent, but it can be any simple chain." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c2b59104-b90e-466a-b7ea-c5bd0194263b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import initialize_agent, load_tools\n", + "from langchain.agents import AgentType\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "tools = load_tools(['serpapi', 'llm-math'], llm=llm)\n", + "agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False)" + ] + }, + { + "cell_type": "markdown", + "id": "84094a4a-1d76-461c-bc37-8c537939b466", + "metadata": {}, + "source": [ + "**Now we're ready to run the chain!**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Chain failed for example e290c4f5-0d56-4848-91e6-d794cfeed276. Error: 'age'. Please try again with a valid numerical expression\n", + "Chain failed for example ec590bea-8b5b-429b-a9f4-7d035e66853e. Error: unknown format from LLM: It is impossible to accurately predict the total number of points scored in a future event. Therefore, a mathematical expression cannot be provided.\n", + "Chain failed for example a933211c-83c0-43c4-b22f-40e088263849. Error: invalid syntax. Perhaps you forgot a comma? (, line 1). Please try again with a valid numerical expression\n", + "Chain failed for example b925eaa0-6b63-45c1-a3c3-c11ad53ca0eb. Error: 'VariableNode' object is not callable. 
Please try again with a valid numerical expression\n" + ] + } + ], + "source": [ + "chain_results = await client.arun_chain_on_dataset(\n", + " dataset_name=dataset_name,\n", + " chain=agent,\n", + " batch_size=5, # Optional, sets the number of examples to run at a time\n", + " session_name=\"Calculator Dataset Runs\", # Optional. Will be sent to the 'default' session otherwise\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "136db492-d6ca-4215-96f9-439c23538241", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "[Click here to visit the local LangChain+ Server](http://localhost/)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display, Markdown\n", + "\n", + "display(Markdown('[Click here to visit the local LangChain+ Server](http://localhost/)'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5583cf2-0bc9-44d0-987a-b429f8f4e67d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}