Update LangSmith Walkthrough (#11043)

2025-09-10 07:21:03 +00:00 · 2023-09-25 22:32:56 -07:00
parent bea78b3271
commit 4aec587979
4 changed files with 800 additions and 748 deletions
--- a/docs/extras/guides/langsmith/img/log_traces.png
+++ b/docs/extras/guides/langsmith/img/log_traces.png
--- a/docs/extras/guides/langsmith/img/test_results.png
+++ b/docs/extras/guides/langsmith/img/test_results.png
--- a/docs/extras/guides/langsmith/walkthrough.ipynb
+++ b/docs/extras/guides/langsmith/walkthrough.ipynb
@@ -8,6 +8,7 @@
            },
            "source": [
                "# LangSmith Walkthrough\n",
                "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/langsmith/walkthrough.ipynb)\n",
                "\n",
                "LangChain makes it easy to prototype LLM applications and Agents. However, delivering LLM applications to production can be deceptively difficult. You will likely have to heavily customize and iterate on your prompts, chains, and other components to create a high-quality product.\n",
                "\n",
@@ -50,7 +51,7 @@
                "\n",
                "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/).\n",
                "\n",
-    "**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n",
+                "**NOTE:** You must also set your `OPENAI_API_KEY` environment variable in order to run the following tutorial.\n",
                "\n",
                "**NOTE:** You can only access an API key when you first create it. Keep it somewhere safe.\n",
                "\n",
@@ -67,18 +68,18 @@
        },
        {
            "cell_type": "code",
-   "execution_count": 11,
+            "execution_count": 1,
            "id": "e4780363-f05a-4649-8b1a-9b449f960ce4",
            "metadata": {},
            "outputs": [],
            "source": [
-    "# %pip install -U langchain langsmith --quiet\n",
+                "# %pip install -U langchain langsmith langchainhub --quiet\n",
-    "# %pip install google-search-results pandas --quiet"
+                "# %pip install openai pandas duckduckgo-search --quiet"
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 1,
+            "execution_count": 2,
            "id": "904db9a5-f387-4a57-914c-c8af8d39e249",
            "metadata": {
                "tags": []
@@ -92,11 +93,10 @@
                "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
                "os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
                "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
-    "# os.environ[\"LANGCHAIN_API_KEY\"] = \"\"  # Update to your API key\n",
+                "os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-API-KEY>\"  # Update to your API key\n",
                "\n",
                "# Used by the agent in this tutorial\n",
-    "# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
+                "os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\""
    "# os.environ[\"SERPAPI_API_KEY\"] = \"<YOUR-SERPAPI-API-KEY>\""
            ]
        },
        {
@@ -111,7 +111,7 @@
        },
        {
            "cell_type": "code",
-   "execution_count": 2,
+            "execution_count": 3,
            "id": "510b5ca0",
            "metadata": {
                "tags": []
@@ -128,25 +128,63 @@
            "id": "ca27fa11-ddce-4af0-971e-c5c37d5b92ef",
            "metadata": {},
            "source": [
-    "Create a LangChain component and log runs to the platform. In this example, we will create a ReAct-style agent with access to Search and Calculator as tools. However, LangSmith works regardless of which type of LangChain component you use (LLMs, Chat Models, Tools, Retrievers, Agents are all supported)."
+                "Create a LangChain component and log runs to the platform. In this example, we will create a ReAct-style agent with access to a general search tool (DuckDuckGo). The agent's prompt can be viewed in the [Hub here](https://smith.langchain.com/hub/wfh/langsmith-agent-prompt)."
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 3,
+            "execution_count": 4,
-   "id": "7c801853-8e96-404d-984c-51ace59cbbef",
+            "id": "a0fbfbba-3c82-4298-a312-9cec016d9d2e",
-   "metadata": {
+            "metadata": {},
-    "tags": []
+            "outputs": [
-   },
+                {
-   "outputs": [],
+                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "/Users/wfh/code/lc/langchain/libs/langchain/langchain/__init__.py:24: UserWarning: Importing hub from langchain root module is no longer supported.\n",
                        "  warnings.warn(\n"
                    ]
                }
            ],
            "source": [
                "from langchain import hub\n",
                "from langchain.agents import AgentExecutor\n",
                "from langchain.agents.format_scratchpad import format_to_openai_functions\n",
                "from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
                "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.agents import AgentType, initialize_agent, load_tools\n",
+                "from langchain.tools import DuckDuckGoSearchResults\n",
                "from langchain.tools.render import format_tool_to_openai_function\n",
                "\n",
-    "llm = ChatOpenAI(temperature=0)\n",
+                "# Fetches the latest version of this prompt\n",
-    "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
+                "prompt = hub.pull(\"wfh/langsmith-agent-prompt:latest\")\n",
-    "agent = initialize_agent(\n",
+                "\n",
-    "    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
+                "llm = ChatOpenAI(\n",
                "    model=\"gpt-3.5-turbo-16k\",\n",
                "    temperature=0,\n",
                ")\n",
                "\n",
                "tools = [\n",
                "    DuckDuckGoSearchResults(\n",
                "        name=\"duck_duck_go\"\n",
                "    ),  # General internet search using DuckDuckGo\n",
                "]\n",
                "\n",
                "llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])\n",
                "\n",
                "runnable_agent = (\n",
                "    {\n",
                "        \"input\": lambda x: x[\"input\"],\n",
                "        \"agent_scratchpad\": lambda x: format_to_openai_functions(\n",
                "            x[\"intermediate_steps\"]\n",
                "        ),\n",
                "    }\n",
                "    | prompt\n",
                "    | llm_with_tools\n",
                "    | OpenAIFunctionsAgentOutputParser()\n",
                ")\n",
                "\n",
                "agent_executor = AgentExecutor(\n",
                "    agent=runnable_agent, tools=tools, handle_parsing_errors=True\n",
                ")"
            ]
        },
@@ -160,7 +198,7 @@
        },
        {
            "cell_type": "code",
-   "execution_count": 4,
+            "execution_count": 5,
            "id": "19537902-b95c-4390-80a4-f6c9a937081e",
            "metadata": {
                "tags": []
@@ -168,37 +206,39 @@
            "outputs": [],
            "source": [
                "inputs = [\n",
-    "    \"How many people live in canada as of 2023?\",\n",
+                "    \"What is LangChain?\",\n",
-    "    \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
+                "    \"What's LangSmith?\",\n",
-    "    \"what is dua lipa's boyfriend age raised to the .43 power?\",\n",
+                "    \"When was Llama-v2 released?\",\n",
-    "    \"how far is it from paris to boston in miles\",\n",
+                "    \"Who trained Llama-v2?\",\n",
-    "    \"what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?\",\n",
+                "    \"What is the langsmith cookbook?\",\n",
-    "    \"what was the total number of points scored in the 2023 super bowl raised to the .23 power?\",\n",
+                "    \"When did langchain first announce the hub?\",\n",
    "    \"how many more points were scored in the 2023 super bowl than in the 2022 super bowl?\",\n",
    "    \"what is 153 raised to .1312 power?\",\n",
    "    \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n",
    "    \"what is 1213 divided by 4345?\",\n",
                "]\n",
                "\n",
-    "results = agent.batch(inputs, return_exceptions=True)"
+                "results = agent_executor.batch([{\"input\": x} for x in inputs], return_exceptions=True)"
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 5,
+            "execution_count": 6,
-   "id": "0405ff30-21fe-413d-85cf-9fa3c649efec",
+            "id": "9a6a764c-5d7a-4de7-a916-3ecc987d5bb6",
-   "metadata": {
+            "metadata": {},
-    "tags": []
+            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "[{'input': 'What is LangChain?',\n",
                            "  'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangChain\". Could you please provide more context or clarify your question?'},\n",
                            " {'input': \"What's LangSmith?\",\n",
                            "  'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangSmith\". It could be a company, a product, or a person. Can you provide more context or details about what you are referring to?'}]"
                        ]
                    },
-   "outputs": [],
+                    "execution_count": 6,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
-    "from langchain.callbacks.tracers.langchain import wait_for_all_tracers\n",
+                "results[:2]"
    "\n",
    "# Logs are submitted in a background thread to avoid blocking execution.\n",
    "# For the sake of this tutorial, we want to make sure\n",
    "# they've been submitted before moving on. This is also\n",
    "# useful for serverless deployments.\n",
    "wait_for_all_tracers()"
            ]
        },
        {
@@ -206,7 +246,11 @@
            "id": "9decb964-be07-4b6c-9802-9825c8be7b64",
            "metadata": {},
            "source": [
-    "Assuming you've successfully set up your environment, your agent traces should show up in the `Projects` section in the [app](https://smith.langchain.com/). Congrats!"
+                "Assuming you've successfully set up your environment, your agent traces should show up in the `Projects` section in the [app](https://smith.langchain.com/). Congrats!\n",
                "\n",
                "![Initial Runs](./img/log_traces.png)\n",
                "\n",
                "It looks like the agent isn't effectively using the tools though. Let's evaluate this so we have a baseline."
            ]
        },
        {
@@ -214,13 +258,13 @@
            "id": "6c43c311-4e09-4d57-9ef3-13afb96ff430",
            "metadata": {},
            "source": [
-    "## Evaluate another agent implementation\n",
+                "## Evaluate Agent\n",
                "\n",
                "In addition to logging runs, LangSmith also allows you to test and evaluate your LLM applications.\n",
                "\n",
                "In this section, you will leverage LangSmith to create a benchmark dataset and run AI-assisted evaluators on an agent. You will do so in a few steps:\n",
                "\n",
-    "1. Create a dataset from pre-existing run inputs and outputs\n",
+                "1. Create a dataset\n",
                "2. Initialize a new agent to benchmark\n",
                "3. Configure evaluators to grade an agent's output\n",
                "4. Run the agent over the dataset and evaluate the results"
@@ -233,35 +277,44 @@
            "source": [
                "### 1. Create a LangSmith dataset\n",
                "\n",
-    "Below, we use the LangSmith client to create a dataset from the agent runs you just logged above. You will use these later to measure performance for a new agent. This is simply taking the inputs and outputs of the runs and saving them as examples to a dataset. A dataset is a collection of examples, which are nothing more than input-output pairs you can use as test cases to your application.\n",
+                "Below, we use the LangSmith client to create a dataset from the input questions above and a list of labels. You will use these later to measure performance for a new agent. A dataset is a collection of examples, which are nothing more than input-output pairs you can use as test cases to your application.\n",
    "\n",
    "**Note: this is a simple, walkthrough example. In a real-world setting, you'd ideally first validate the outputs before adding them to a benchmark dataset to be used for evaluating other agents.**\n",
                "\n",
                "For more information on datasets, including how to create them from CSVs or other files or how to create them in the platform, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/)."
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 6,
+            "execution_count": 7,
            "id": "43fd40b2-3f02-4e51-9343-705aafe90a36",
            "metadata": {},
            "outputs": [],
            "source": [
                "outputs = [\n",
                "    \"LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.\",\n",
                "    \"LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain\",\n",
                "    \"July 18, 2023\",\n",
                "    \"The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor large language model-powered applications.\",\n",
                "    \"September 5, 2023\",\n",
                "]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d",
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
-    "dataset_name = f\"calculator-example-dataset-{unique_id}\"\n",
+                "dataset_name = f\"agent-qa-{unique_id}\"\n",
                "\n",
                "dataset = client.create_dataset(\n",
-    "    dataset_name, description=\"A calculator example dataset\"\n",
+                "    dataset_name, description=\"An example dataset of questions over the LangSmith documentation.\"\n",
                ")\n",
                "\n",
-    "runs = client.list_runs(\n",
+                "for query, answer in zip(inputs, outputs):\n",
-    "    project_name=os.environ[\"LANGCHAIN_PROJECT\"],\n",
+                "    client.create_example(inputs={\"input\": query}, outputs={\"output\": answer}, dataset_id=dataset.id)"
    "    execution_order=1,  # Only return the top-level runs\n",
    "    error=False,  # Only runs that succeed\n",
    ")\n",
    "for run in runs:\n",
    "    client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)"
            ]
        },
        {
@@ -273,14 +326,14 @@
            "source": [
                "### 2. Initialize a new agent to benchmark\n",
                "\n",
-    "You can evaluate any LLM, chain, or agent. Since chains can have memory, we will pass in a `chain_factory` (aka a `constructor` ) function to initialize for each call.\n",
+                "LangSmith lets you evaluate any LLM, chain, agent, or even a custom function. Conversational agents are stateful (they have memory); to ensure that this state isn't shared between dataset runs, we will pass in a `chain_factory` (aka a `constructor`) function to initialize for each call.\n",
                "\n",
                "In this case, we will test an agent that uses OpenAI's function calling endpoints."
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 7,
+            "execution_count": 9,
            "id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75",
            "metadata": {
                "tags": []
@@ -288,22 +341,30 @@
            "outputs": [],
            "source": [
                "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.agents import AgentType, initialize_agent, load_tools\n",
+                "from langchain.agents import AgentType, initialize_agent, load_tools, AgentExecutor\n",
-    "\n",
+                "from langchain.agents.format_scratchpad import format_to_openai_functions\n",
-    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
+                "from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n",
-    "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
+                "from langchain.tools.render import format_tool_to_openai_function\n",
                "from langchain import hub\n",
                "\n",
                "\n",
                "# Since chains can be stateful (e.g. they can have memory), we provide\n",
                "# a way to initialize a new chain for each row in the dataset. This is done\n",
                "# by passing in a factory function that returns a new chain for each row.\n",
-    "def agent_factory():\n",
+                "def agent_factory(prompt):    \n",
-    "    return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
+                "    llm_with_tools = llm.bind(\n",
-    "\n",
+                "        functions=[format_tool_to_openai_function(t) for t in tools]\n",
-    "\n",
+                "    )\n",
-    "# If your chain is NOT stateful, your factory can return the object directly\n",
+                "    runnable_agent = (\n",
-    "# to improve runtime performance. For example:\n",
+                "            {\n",
-    "# chain_factory = lambda: agent"
+                "                \"input\": lambda x: x[\"input\"],\n",
                "                \"agent_scratchpad\": lambda x: format_to_openai_functions(x['intermediate_steps'])\n",
                "            } \n",
                "             | prompt \n",
                "             | llm_with_tools \n",
                "             | OpenAIFunctionsAgentOutputParser()\n",
                "    )\n",
                "    return  AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)\n"
            ]
        },
        {
@@ -317,7 +378,7 @@
                "It can be helpful to use automated metrics and AI-assisted feedback to evaluate your component's performance.\n",
                "\n",
                "Below, we will create some pre-implemented run evaluators that do the following:\n",
-    "- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
+                "- Compare results against ground truth labels.\n",
                "- Measure semantic (dis)similarity using embedding distance\n",
                "- Evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
                "\n",
@@ -327,7 +388,7 @@
        },
        {
            "cell_type": "code",
-   "execution_count": 8,
+            "execution_count": 10,
            "id": "a25dc281",
            "metadata": {
                "tags": []
@@ -351,7 +412,7 @@
                "        # Both the Criteria and LabeledCriteria evaluators can be configured with a dictionary of custom criteria.\n",
                "        RunEvalConfig.Criteria(\n",
                "            {\n",
-    "                \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
+                "                \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question? Y if so.\"\n",
                "            }\n",
                "        ),\n",
                "    ],\n",
@@ -370,9 +431,9 @@
            "source": [
                "### 4. Run the agent and evaluators\n",
                "\n",
-    "Use the [arun_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.arun_on_dataset.html#langchain.smith.evaluation.runner_utils.arun_on_dataset) (or synchronous [run_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.run_on_dataset.html#langchain.smith.evaluation.runner_utils.run_on_dataset)) function to evaluate your model. This will:\n",
+                "Use the [run_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.run_on_dataset.html#langchain.smith.evaluation.runner_utils.run_on_dataset) (or asynchronous [arun_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.arun_on_dataset.html#langchain.smith.evaluation.runner_utils.arun_on_dataset)) function to evaluate your model. This will:\n",
-    "1. Fetch example rows from the specified dataset\n",
+                "1. Fetch example rows from the specified dataset.\n",
-    "2. Run your llm or chain on each example.\n",
+                "2. Run your agent (or any custom function) on each example.\n",
                "3. Apply evaluators to the resulting run traces and corresponding reference examples to generate automated feedback.\n",
                "\n",
                "The results will be visible in the LangSmith app."
@@ -380,51 +441,96 @@
        },
        {
            "cell_type": "code",
-   "execution_count": 9,
+            "execution_count": 11,
            "id": "af8c8469-d70d-46d9-8fcd-517a1ccc7c4b",
            "metadata": {},
            "outputs": [],
            "source": [
                "from langchain import hub\n",
                "\n",
                "# We will test this version of the prompt\n",
                "prompt = hub.pull(\"wfh/langsmith-agent-prompt:798e7324\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 12,
            "id": "3733269b-8085-4644-9d5d-baedcff13a2f",
            "metadata": {
                "tags": []
            },
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "View the evaluation results for project 'runnable-agent-test-5d466cbc-1d7fb5e9' at:\n",
                        "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c61cbfbc-2d55-4763-9844-92fd528637fe\n",
                        "[>                                                 ] 0/5"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
-      "Chain failed for example f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 with inputs {'input': \"what is dua lipa's boyfriend age raised to the .43 power?\"}\n",
+                        "Chain failed for example f75887fc-8b00-4123-b0f9-8b12b1ed4a32 with inputs {'input': 'Who trained Llama-v2?'}\n",
-      "Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n",
+                        "Error Type: TypeError, Message: DuckDuckGoSearchResults._run() got an unexpected keyword argument 'arg1'\n"
-      "age_of_Dua_Lipa_boyfriend ** 0.43\n",
+                    ]
-      "\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n",
+                },
-      "Chain failed for example 78c959a4-467d-4469-8bd7-c5f0b059bc4a with inputs {'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\"}\n",
+                {
-      "Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n",
+                    "name": "stdout",
-      "age ** 0.43\n",
+                    "output_type": "stream",
-      "\") raised error: 'age'. Please try again with a valid numerical expression\n",
+                    "text": [
-      "Chain failed for example 6de48a56-3f30-4aac-b6cf-eee4b05ad43f with inputs {'input': \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\"}\n",
+                        "[--------------------------------------->          ] 4/5\n",
-      "Error Type: ToolException, Message: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 72}]\n"
+                        " Eval quantiles:\n",
                        "                               0.25       0.5      0.75     mean      mode\n",
                        "embedding_cosine_distance  0.085513  0.143134  0.219651  0.16203  0.044081\n",
                        "correctness                0.000000  0.500000  1.000000  0.50000  0.000000\n",
                        "fifth-grader-score         1.000000  1.000000  1.000000  1.00000  1.000000\n",
                        "helpfulness                0.750000  1.000000  1.000000  0.75000  1.000000\n"
                    ]
                }
            ],
            "source": [
                "import functools\n",
                "from langchain.smith import (\n",
                "    arun_on_dataset,\n",
                "    run_on_dataset, \n",
                ")\n",
                "\n",
                "chain_results = run_on_dataset(\n",
    "    client=client,\n",
                "    dataset_name=dataset_name,\n",
-    "    llm_or_chain_factory=agent_factory,\n",
+                "    llm_or_chain_factory=functools.partial(agent_factory, prompt=prompt),\n",
                "    evaluation=evaluation_config,\n",
                "    verbose=True,\n",
-    "    tags=[\"testing-notebook\"],  # Optional, adds a tag to the resulting chain runs\n",
+                "    client=client,\n",
                "    project_name=f\"runnable-agent-test-5d466cbc-{unique_id}\",\n",
                "    tags=[\"testing-notebook\", \"prompt:5d466cbc\"],  # Optional, adds a tag to the resulting chain runs\n",
                ")\n",
                "\n",
                "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
                "# These are logged as warnings here and captured as errors in the tracing UI."
            ]
        },
        {
            "cell_type": "markdown",
            "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
            "metadata": {
                "tags": []
            },
            "source": [
                "### Review the test results\n",
                "\n",
                "You can review the test results in the tracing UI (shown below) by clicking the URL in the output above, or by navigating to the \"Testing & Datasets\" page in LangSmith and selecting the **\"agent-qa-{unique_id}\"** dataset. \n",
                "\n",
                "![test results](./img/test_results.png)\n",
                "\n",
                "This will show the new runs and the feedback logged from the selected evaluators. You can also explore a summary of the results in tabular format below."
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 10,
+            "execution_count": 13,
            "id": "9da60638-5be8-4b5f-a721-2c6627aeaf0c",
            "metadata": {},
            "outputs": [
@@ -449,183 +555,108 @@
                            "  <thead>\n",
                            "    <tr style=\"text-align: right;\">\n",
                            "      <th></th>\n",
                            "      <th>embedding_cosine_distance</th>\n",
                            "      <th>correctness</th>\n",
                            "      <th>fifth-grader-score</th>\n",
                            "      <th>helpfulness</th>\n",
                            "      <th>input</th>\n",
                            "      <th>output</th>\n",
                            "      <th>reference</th>\n",
       "      <th>embedding_cosine_distance</th>\n",
       "      <th>correctness</th>\n",
       "      <th>helpfulness</th>\n",
       "      <th>fifth-grader-score</th>\n",
                            "    </tr>\n",
                            "  </thead>\n",
                            "  <tbody>\n",
                            "    <tr>\n",
-       "      <th>78c959a4-467d-4469-8bd7-c5f0b059bc4a</th>\n",
+                            "      <th>0eefc54a-066f-426b-883a-b096a09f6288</th>\n",
-       "      <td>{'input': 'who is dua lipa's boyfriend? what i...</td>\n",
+                            "      <td>0.317772</td>\n",
       "      <td>{'Error': 'ValueError('LLMMathChain._evaluate(...</td>\n",
       "      <td>{'output': 'Romain Gavras' age raised to the 0...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>f8dfff24-d288-4d8e-ba94-c3cc33dd10d0</th>\n",
       "      <td>{'input': 'what is dua lipa's boyfriend age ra...</td>\n",
       "      <td>{'Error': 'ValueError('LLMMathChain._evaluate(...</td>\n",
       "      <td>{'output': 'Approximately 4.9888126515157.'}</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c78d5e84-3fbd-442f-affb-4b0e5806c439</th>\n",
       "      <td>{'input': 'how far is it from paris to boston ...</td>\n",
       "      <td>{'input': 'how far is it from paris to boston ...</td>\n",
       "      <td>{'output': 'The distance from Paris to Boston ...</td>\n",
       "      <td>0.007577</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>02cadef9-5794-49a9-8e43-acca977cab60</th>\n",
       "      <td>{'input': 'How many people live in canada as o...</td>\n",
       "      <td>{'input': 'How many people live in canada as o...</td>\n",
       "      <td>{'output': 'The current population of Canada a...</td>\n",
       "      <td>0.016324</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e888a340-0486-4552-bb4b-911756e6bed7</th>\n",
       "      <td>{'input': 'what was the total number of points...</td>\n",
       "      <td>{'input': 'what was the total number of points...</td>\n",
       "      <td>{'output': '3'}</td>\n",
       "      <td>0.225076</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
                            "      <td>0.0</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>{'input': 'What is the langsmith cookbook?'}</td>\n",
                            "      <td>{'input': 'What is the langsmith cookbook?', '...</td>\n",
                            "      <td>{'output': 'September 5, 2023'}</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
-       "      <th>1b1f655b-754c-474d-8832-e6ec6bad3943</th>\n",
+                            "      <th>f75887fc-8b00-4123-b0f9-8b12b1ed4a32</th>\n",
       "      <td>{'input': 'what was the total number of points...</td>\n",
       "      <td>{'input': 'what was the total number of points...</td>\n",
       "      <td>{'output': 'The total number of points scored ...</td>\n",
       "      <td>0.011580</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51f1b1f1-3b51-400f-b871-65f8a3a3c2d4</th>\n",
       "      <td>{'input': 'how many more points were scored in...</td>\n",
       "      <td>{'input': 'how many more points were scored in...</td>\n",
       "      <td>{'output': '15'}</td>\n",
       "      <td>0.251002</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83339364-0135-4efd-a24a-f3bd2a85e33a</th>\n",
       "      <td>{'input': 'what is 153 raised to .1312 power?'}</td>\n",
       "      <td>{'input': 'what is 153 raised to .1312 power?'...</td>\n",
       "      <td>{'output': '1.9347796717823205'}</td>\n",
       "      <td>0.127441</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6de48a56-3f30-4aac-b6cf-eee4b05ad43f</th>\n",
       "      <td>{'input': 'who is kendall jenner's boyfriend? ...</td>\n",
       "      <td>{'Error': 'ToolException(\"Too many arguments t...</td>\n",
       "      <td>{'output': 'Bad Bunny's height raised to the p...</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>NaN</td>\n",
                            "      <td>{'input': 'Who trained Llama-v2?'}</td>\n",
                            "      <td>{'Error': 'TypeError(\"DuckDuckGoSearchResults....</td>\n",
                            "      <td>{'output': 'The langsmith cookbook is a github...</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
-       "      <th>0c41cc28-9c07-4550-8940-68b58cbc045e</th>\n",
+                            "      <th>3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6</th>\n",
-       "      <td>{'input': 'what is 1213 divided by 4345?'}</td>\n",
+                            "      <td>0.186944</td>\n",
       "      <td>{'input': 'what is 1213 divided by 4345?', 'ou...</td>\n",
       "      <td>{'output': '0.2791714614499425'}</td>\n",
       "      <td>0.144522</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>{'input': 'When was Llama-v2 released?'}</td>\n",
                            "      <td>{'input': 'When was Llama-v2 released?', 'outp...</td>\n",
                            "      <td>{'output': 'July 18, 2023'}</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>244cff41-198e-4441-8248-d095bebbfd16</th>\n",
                            "      <td>0.044081</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>{'input': 'What's LangSmith?'}</td>\n",
                            "      <td>{'input': 'What's LangSmith?', 'output': 'Lang...</td>\n",
                            "      <td>{'output': 'LangSmith is a unified platform fo...</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>2272dbe6-0b6f-476b-b9d4-48c38f481a12</th>\n",
                            "      <td>0.099323</td>\n",
                            "      <td>0.0</td>\n",
                            "      <td>1.0</td>\n",
                            "      <td>0.0</td>\n",
                            "      <td>{'input': 'What is LangChain?'}</td>\n",
                            "      <td>{'input': 'What is LangChain?', 'output': 'Lan...</td>\n",
                            "      <td>{'output': 'LangChain is an open-source framew...</td>\n",
                            "    </tr>\n",
                            "  </tbody>\n",
                            "</table>\n",
                            "</div>"
                        ],
                        "text/plain": [
                            "                                      embedding_cosine_distance  correctness  \\\n",
                            "0eefc54a-066f-426b-883a-b096a09f6288                   0.317772          0.0   \n",
                            "f75887fc-8b00-4123-b0f9-8b12b1ed4a32                        NaN          NaN   \n",
                            "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6                   0.186944          1.0   \n",
                            "244cff41-198e-4441-8248-d095bebbfd16                   0.044081          1.0   \n",
                            "2272dbe6-0b6f-476b-b9d4-48c38f481a12                   0.099323          0.0   \n",
                            "\n",
                            "                                      fifth-grader-score  helpfulness  \\\n",
                            "0eefc54a-066f-426b-883a-b096a09f6288                 1.0          1.0   \n",
                            "f75887fc-8b00-4123-b0f9-8b12b1ed4a32                 NaN          NaN   \n",
                            "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6                 1.0          1.0   \n",
                            "244cff41-198e-4441-8248-d095bebbfd16                 1.0          1.0   \n",
                            "2272dbe6-0b6f-476b-b9d4-48c38f481a12                 1.0          0.0   \n",
                            "\n",
                            "                                                                             input  \\\n",
-       "78c959a4-467d-4469-8bd7-c5f0b059bc4a  {'input': 'who is dua lipa's boyfriend? what i...   \n",
+                            "0eefc54a-066f-426b-883a-b096a09f6288  {'input': 'What is the langsmith cookbook?'}   \n",
-       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0  {'input': 'what is dua lipa's boyfriend age ra...   \n",
+                            "f75887fc-8b00-4123-b0f9-8b12b1ed4a32            {'input': 'Who trained Llama-v2?'}   \n",
-       "c78d5e84-3fbd-442f-affb-4b0e5806c439  {'input': 'how far is it from paris to boston ...   \n",
+                            "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6      {'input': 'When was Llama-v2 released?'}   \n",
-       "02cadef9-5794-49a9-8e43-acca977cab60  {'input': 'How many people live in canada as o...   \n",
+                            "244cff41-198e-4441-8248-d095bebbfd16                {'input': 'What's LangSmith?'}   \n",
-       "e888a340-0486-4552-bb4b-911756e6bed7  {'input': 'what was the total number of points...   \n",
+                            "2272dbe6-0b6f-476b-b9d4-48c38f481a12               {'input': 'What is LangChain?'}   \n",
       "1b1f655b-754c-474d-8832-e6ec6bad3943  {'input': 'what was the total number of points...   \n",
       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4  {'input': 'how many more points were scored in...   \n",
       "83339364-0135-4efd-a24a-f3bd2a85e33a    {'input': 'what is 153 raised to .1312 power?'}   \n",
       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f  {'input': 'who is kendall jenner's boyfriend? ...   \n",
       "0c41cc28-9c07-4550-8940-68b58cbc045e         {'input': 'what is 1213 divided by 4345?'}   \n",
                            "\n",
                            "                                                                                 output  \\\n",
-       "78c959a4-467d-4469-8bd7-c5f0b059bc4a  {'Error': 'ValueError('LLMMathChain._evaluate(...   \n",
+                            "0eefc54a-066f-426b-883a-b096a09f6288  {'input': 'What is the langsmith cookbook?', '...   \n",
-       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0  {'Error': 'ValueError('LLMMathChain._evaluate(...   \n",
+                            "f75887fc-8b00-4123-b0f9-8b12b1ed4a32  {'Error': 'TypeError(\"DuckDuckGoSearchResults....   \n",
-       "c78d5e84-3fbd-442f-affb-4b0e5806c439  {'input': 'how far is it from paris to boston ...   \n",
+                            "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6  {'input': 'When was Llama-v2 released?', 'outp...   \n",
-       "02cadef9-5794-49a9-8e43-acca977cab60  {'input': 'How many people live in canada as o...   \n",
+                            "244cff41-198e-4441-8248-d095bebbfd16  {'input': 'What's LangSmith?', 'output': 'Lang...   \n",
-       "e888a340-0486-4552-bb4b-911756e6bed7  {'input': 'what was the total number of points...   \n",
+                            "2272dbe6-0b6f-476b-b9d4-48c38f481a12  {'input': 'What is LangChain?', 'output': 'Lan...   \n",
       "1b1f655b-754c-474d-8832-e6ec6bad3943  {'input': 'what was the total number of points...   \n",
       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4  {'input': 'how many more points were scored in...   \n",
       "83339364-0135-4efd-a24a-f3bd2a85e33a  {'input': 'what is 153 raised to .1312 power?'...   \n",
       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f  {'Error': 'ToolException(\"Too many arguments t...   \n",
       "0c41cc28-9c07-4550-8940-68b58cbc045e  {'input': 'what is 1213 divided by 4345?', 'ou...   \n",
                            "\n",
-       "                                                                              reference  \\\n",
+                            "                                                                              reference  \n",
-       "78c959a4-467d-4469-8bd7-c5f0b059bc4a  {'output': 'Romain Gavras' age raised to the 0...   \n",
+                            "0eefc54a-066f-426b-883a-b096a09f6288                    {'output': 'September 5, 2023'}  \n",
-       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0       {'output': 'Approximately 4.9888126515157.'}   \n",
+                            "f75887fc-8b00-4123-b0f9-8b12b1ed4a32  {'output': 'The langsmith cookbook is a github...  \n",
-       "c78d5e84-3fbd-442f-affb-4b0e5806c439  {'output': 'The distance from Paris to Boston ...   \n",
+                            "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6                        {'output': 'July 18, 2023'}  \n",
-       "02cadef9-5794-49a9-8e43-acca977cab60  {'output': 'The current population of Canada a...   \n",
+                            "244cff41-198e-4441-8248-d095bebbfd16  {'output': 'LangSmith is a unified platform fo...  \n",
-       "e888a340-0486-4552-bb4b-911756e6bed7                                    {'output': '3'}   \n",
+                            "2272dbe6-0b6f-476b-b9d4-48c38f481a12  {'output': 'LangChain is an open-source framew...  "
       "1b1f655b-754c-474d-8832-e6ec6bad3943  {'output': 'The total number of points scored ...   \n",
       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4                                   {'output': '15'}   \n",
       "83339364-0135-4efd-a24a-f3bd2a85e33a                   {'output': '1.9347796717823205'}   \n",
       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f  {'output': 'Bad Bunny's height raised to the p...   \n",
       "0c41cc28-9c07-4550-8940-68b58cbc045e                   {'output': '0.2791714614499425'}   \n",
       "\n",
       "                                      embedding_cosine_distance  correctness  \\\n",
       "78c959a4-467d-4469-8bd7-c5f0b059bc4a                        NaN          NaN   \n",
       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0                        NaN          NaN   \n",
       "c78d5e84-3fbd-442f-affb-4b0e5806c439                   0.007577          1.0   \n",
       "02cadef9-5794-49a9-8e43-acca977cab60                   0.016324          1.0   \n",
       "e888a340-0486-4552-bb4b-911756e6bed7                   0.225076          0.0   \n",
       "1b1f655b-754c-474d-8832-e6ec6bad3943                   0.011580          0.0   \n",
       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4                   0.251002          1.0   \n",
       "83339364-0135-4efd-a24a-f3bd2a85e33a                   0.127441          1.0   \n",
       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f                        NaN          NaN   \n",
       "0c41cc28-9c07-4550-8940-68b58cbc045e                   0.144522          1.0   \n",
       "\n",
       "                                      helpfulness  fifth-grader-score  \n",
       "78c959a4-467d-4469-8bd7-c5f0b059bc4a          NaN                 NaN  \n",
       "f8dfff24-d288-4d8e-ba94-c3cc33dd10d0          NaN                 NaN  \n",
       "c78d5e84-3fbd-442f-affb-4b0e5806c439          1.0                 1.0  \n",
       "02cadef9-5794-49a9-8e43-acca977cab60          1.0                 1.0  \n",
       "e888a340-0486-4552-bb4b-911756e6bed7          0.0                 0.0  \n",
       "1b1f655b-754c-474d-8832-e6ec6bad3943          0.0                 0.0  \n",
       "51f1b1f1-3b51-400f-b871-65f8a3a3c2d4          1.0                 1.0  \n",
       "83339364-0135-4efd-a24a-f3bd2a85e33a          1.0                 1.0  \n",
       "6de48a56-3f30-4aac-b6cf-eee4b05ad43f          NaN                 NaN  \n",
       "0c41cc28-9c07-4550-8940-68b58cbc045e          1.0                 1.0  "
                        ]
                    },
-     "execution_count": 10,
+                    "execution_count": 13,
                    "metadata": {},
                    "output_type": "execute_result"
                }
@@ -636,16 +667,48 @@
        },
        {
            "cell_type": "markdown",
-   "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
+            "id": "13aad317-73ff-46a7-a5a0-60b5b5295f02",
-   "metadata": {
+            "metadata": {},
    "tags": []
   },
            "source": [
-    "### Review the test results\n",
+                "### (Optional) Compare to another prompt\n",
                "\n",
-    "You can review the test results tracing UI below by navigating to the \"Datasets & Testing\" page and selecting the **\"calculator-example-dataset-*\"** dataset, clicking on the `Test Runs` tab, then inspecting the runs in the corresponding project. \n",
+                "Now that we have our test run results, we can make changes to our agent and benchmark them. Let's try this again with a different prompt and see the results."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 14,
            "id": "5eeb023f-ded2-4d0f-b910-2a57d9675853",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "View the evaluation results for project 'runnable-agent-test-39f3bbd0-1d7fb5e9' at:\n",
                        "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/04b8a9ba-37a0-44e6-9e46-aa231be64c6d\n",
                        "[------------------------------------------------->] 5/5\n",
                        " Eval quantiles:\n",
                        "                               0.25       0.5      0.75      mean      mode\n",
                        "embedding_cosine_distance  0.067353  0.138474  0.243327  0.163574  0.050973\n",
                        "correctness                0.000000  1.000000  1.000000  0.600000  1.000000\n",
                        "helpfulness                1.000000  1.000000  1.000000  0.800000  1.000000\n",
                        "fifth-grader-score         1.000000  1.000000  1.000000  1.000000  1.000000\n"
                    ]
                }
            ],
            "source": [
                "candidate_prompt = hub.pull(\"wfh/langsmith-agent-prompt:39f3bbd0\")\n",
                "\n",
-    "This will show the new runs and the feedback logged from the selected evaluators. Note that runs that error out will not have feedback."
+                "chain_results = run_on_dataset(\n",
                "    dataset_name=dataset_name,\n",
                "    llm_or_chain_factory=functools.partial(agent_factory, prompt=candidate_prompt),\n",
                "    evaluation=evaluation_config,\n",
                "    verbose=True,\n",
                "    client=client,\n",
                "    project_name=f\"runnable-agent-test-39f3bbd0-{unique_id}\",\n",
                "    tags=[\"testing-notebook\", \"prompt:39f3bbd0\"],  # Optional, adds a tag to the resulting chain runs\n",
                ")"
            ]
        },
        {
@@ -655,52 +718,31 @@
            "source": [
                "## Exporting datasets and runs\n",
                "\n",
-    "LangSmith lets you export data to common formats such as CSV or JSONL directly in the web app. You can also use the client to fetch runs for further analysis, to store in your own database, or to share with others. Let's fetch the run traces from the evaluation run."
+                "LangSmith lets you export data to common formats such as CSV or JSONL directly in the web app. You can also use the client to fetch runs for further analysis, to store in your own database, or to share with others. Let's fetch the run traces from the evaluation run.\n",
                "\n",
                "**Note: It may be a few moments before all the runs are accessible.**"
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 18,
+            "execution_count": 15,
            "id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820",
            "metadata": {
                "tags": []
            },
-   "outputs": [
+            "outputs": [],
    {
     "data": {
      "text/plain": [
       "Run(id=UUID('a6893e95-a9cc-43e0-b9fa-f471b0cfee83'), name='AgentExecutor', start_time=datetime.datetime(2023, 9, 13, 22, 34, 32, 177406), run_type='chain', end_time=datetime.datetime(2023, 9, 13, 22, 34, 37, 77740), extra={'runtime': {'cpu': {'time': {'sys': 3.153218304, 'user': 5.045262336}, 'percent': 0.0, 'ctx_switches': {'voluntary': 42164.0, 'involuntary': 0.0}}, 'mem': {'rss': 184205312.0}, 'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.26', 'thread_count': 58.0, 'library_version': '0.0.286', 'runtime_version': '3.11.2', 'langchain_version': '0.0.286', 'py_implementation': 'CPython'}}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-09-13T22:34:32.177406'}, {'name': 'end', 'time': '2023-09-13T22:34:37.077740'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('0c41cc28-9c07-4550-8940-68b58cbc045e'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), child_run_ids=[UUID('37faef05-b6b3-4cb7-a6db-471425e69b46'), UUID('2d6a895f-de2c-4f7f-b5f1-ca876d38e530'), UUID('e7d145e3-74b0-4f32-9240-3e370becdf8f'), UUID('10db62c9-fe4f-4aba-959a-ad02cfadfa20'), UUID('8dc46a27-8ab9-4f33-9ec1-660ca73ebb4f'), UUID('eccd042e-dde0-4425-b62f-e855e25d6b64')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'fifth-grader-score': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361, 'is_all_model': True}}, app_path='/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/7865a050-467e-4c58-9322-58a26f182ecb/r/a6893e95-a9cc-43e0-b9fa-f471b0cfee83', manifest_id=None, status='success', 
prompt_tokens=None, completion_tokens=None, total_tokens=None, first_token_time=None, parent_run_ids=None)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
            "source": [
-    "runs = list(client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1))\n",
+                "runs = client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1)"
    "runs[0]"
            ]
        },
        {
            "cell_type": "code",
-   "execution_count": 22,
+            "execution_count": 16,
            "id": "6595c888-1f5c-4ae3-9390-0a559f5575d1",
            "metadata": {
                "tags": []
            },
-   "outputs": [
+            "outputs": [],
    {
     "data": {
      "text/plain": [
       "TracerSessionResult(id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), start_time=datetime.datetime(2023, 9, 13, 22, 34, 10, 611846), name='test-dependable-stop-67', extra=None, tenant_id=UUID('ebbaf2eb-769b-4505-aca2-d11de10372a4'), run_count=None, latency_p50=None, latency_p99=None, total_tokens=None, prompt_tokens=None, completion_tokens=None, last_run_start_time=None, feedback_stats=None, reference_dataset_ids=None, run_facets=None)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
            "source": [
                "# After some time, these will be populated.\n",
                "client.read_project(project_name=chain_results[\"project_name\"]).feedback_stats"
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -640,7 +640,12 @@ async def _arun_chain(
 ) -> Union[dict, str]:
    """Run a chain asynchronously on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
-    if isinstance(chain, Chain) and isinstance(inputs_, dict) and len(inputs_) == 1:
+    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = await chain.acall(val, callbacks=callbacks, tags=tags)
    else:
@@ -765,7 +770,12 @@ def _run_chain(
 ) -> Union[Dict, str]:
    """Run a chain on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
-    if isinstance(chain, Chain) and isinstance(inputs_, dict) and len(inputs_) == 1:
+    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = chain(val, callbacks=callbacks, tags=tags)
    else: