diff --git a/docs/docs/langsmith/walkthrough.ipynb b/docs/docs/langsmith/walkthrough.ipynb index 8c9e82bb0e5..f17cde44ca4 100644 --- a/docs/docs/langsmith/walkthrough.ipynb +++ b/docs/docs/langsmith/walkthrough.ipynb @@ -10,17 +10,16 @@ "# LangSmith Walkthrough\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/docs/langsmith/walkthrough.ipynb)\n", "\n", - "LangChain makes it easy to prototype LLM applications and Agents. However, delivering LLM applications to production can be deceptively difficult. You will likely have to heavily customize and iterate on your prompts, chains, and other components to create a high-quality product.\n", + "LangChain makes it easy to prototype LLM applications and Agents. However, delivering LLM applications to production can be deceptively difficult. You will have to iterate on your prompts, chains, and other components to build a high-quality product.\n", "\n", - "To aid in this process, we've launched LangSmith, a unified platform for debugging, testing, and monitoring your LLM applications.\n", + "LangSmith makes it easy to debug, test, and continuously improve your LLM applications.\n", "\n", "When might this come in handy? You may find it useful when you want to:\n", "\n", "- Quickly debug a new chain, agent, or set of tools\n", - "- Visualize how components (chains, llms, retrievers, etc.) relate and are used\n", - "- Evaluate different prompts and LLMs for a single component\n", - "- Run a given chain several times over a dataset to ensure it consistently meets a quality bar\n", - "- Capture usage traces and using LLMs or analytics pipelines to generate insights" + "- Create and manage datasets for fine-tuning, few-shot prompting, and evaluation\n", + "- Run regression tests on your application to confidently develop\n", + "- Capture production analytics for product insights and continuous improvements" ] }, { @@ -51,13 +50,10 @@ "\n", "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/).\n", "\n", - "**NOTE:** You must also set your `OPENAI_API_KEY` environment variables in order to run the following tutorial.\n", - "\n", - "**NOTE:** You can only access an API key when you first create it. 
Keep it somewhere safe.\n", - "\n", "**NOTE:** You can also use a context manager in python to log traces using\n", + "\n", "```python\n", - "from langchain.callbacks.manager import tracing_v2_enabled\n", + "from langchain_core.tracers.context import tracing_v2_enabled\n", "\n", "with tracing_v2_enabled(project_name=\"My Project\"):\n", " agent.run(\"How many people live in canada as of 2023?\")\n", @@ -68,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "e4780363-f05a-4649-8b1a-9b449f960ce4", "metadata": {}, "outputs": [], @@ -79,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "904db9a5-f387-4a57-914c-c8af8d39e249", "metadata": { "tags": [] @@ -111,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "510b5ca0", "metadata": { "tags": [] @@ -133,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "a0fbfbba-3c82-4298-a312-9cec016d9d2e", "metadata": {}, "outputs": [], @@ -147,7 +143,7 @@ "from langchain_openai import ChatOpenAI\n", "\n", "# Fetches the latest version of this prompt\n", - "prompt = hub.pull(\"wfh/langsmith-agent-prompt:latest\")\n", + "prompt = hub.pull(\"wfh/langsmith-agent-prompt:5d466cbc\")\n", "\n", "llm = ChatOpenAI(\n", " model=\"gpt-3.5-turbo-16k\",\n", @@ -189,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "19537902-b95c-4390-80a4-f6c9a937081e", "metadata": { "tags": [] @@ -209,7 +205,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "9a6a764c-5d7a-4de7-a916-3ecc987d5bb6", "metadata": {}, "outputs": [ @@ -219,10 +215,10 @@ "[{'input': 'What is LangChain?',\n", " 'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangChain\". Could you please provide more context or clarify your question?'},\n", " {'input': \"What's LangSmith?\",\n", - " 'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangSmith\". It could be a specific term or a company that is not widely known. Can you provide more context or clarify what you are referring to?'}]" + " 'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangSmith\". It could be a company, a product, or a person. Can you provide more context or details about what you are referring to?'}]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -274,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "id": "43fd40b2-3f02-4e51-9343-705aafe90a36", "metadata": {}, "outputs": [], @@ -290,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d", "metadata": { "tags": [] @@ -304,10 +300,11 @@ " description=\"An example dataset of questions over the LangSmith documentation.\",\n", ")\n", "\n", - "for query, answer in zip(inputs, outputs):\n", - " client.create_example(\n", - " inputs={\"input\": query}, outputs={\"output\": answer}, dataset_id=dataset.id\n", - " )" + "client.create_examples(\n", + " inputs=[{\"input\": query} for query in inputs],\n", + " outputs=[{\"output\": answer} for answer in outputs],\n", + " dataset_id=dataset.id,\n", + ")" ] }, { @@ -326,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75", "metadata": { "tags": [] @@ -344,10 +341,7 @@ "# Since chains can be stateful (e.g. 
they can have memory), we provide\n", "# a way to initialize a new chain for each row in the dataset. This is done\n", "# by passing in a factory function that returns a new chain for each row.\n", - "def agent_factory(prompt):\n", - " llm_with_tools = llm.bind(\n", - " functions=[format_tool_to_openai_function(t) for t in tools]\n", - " )\n", + "def create_agent(prompt, llm_with_tools):\n", " runnable_agent = (\n", " {\n", " \"input\": lambda x: x[\"input\"],\n", @@ -372,18 +366,55 @@ "Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n", "It can be helpful to use automated metrics and AI-assisted feedback to evaluate your component's performance.\n", "\n", - "Below, we will create some pre-implemented run evaluators that do the following:\n", + "Below, we will create a custom run evaluator that logs a heuristic evaluation.\n", + "\n", + "**Heuristic evaluators**" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "331c3c53-949d-405e-8ba5-38bab1ce413b", + "metadata": {}, + "outputs": [], + "source": [ + "from langsmith.evaluation import EvaluationResult, run_evaluator\n", + "from langsmith.schemas import Example, Run\n", + "\n", + "\n", + "@run_evaluator\n", + "def check_not_idk(run: Run, example: Example):\n", + " \"\"\"Illustration of a custom evaluator.\"\"\"\n", + " agent_response = run.outputs[\"output\"]\n", + " if \"don't know\" in agent_response or \"not sure\" in agent_response:\n", + " score = 0\n", + " else:\n", + " score = 1\n", + " # You can access the dataset labels in example.outputs[key]\n", + " # You can also access the model inputs in run.inputs[key]\n", + " return EvaluationResult(\n", + " key=\"not_uncertain\",\n", + " score=score,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "ad9c4791-570b-4adf-a23f-d025ff383254", + "metadata": {}, + "source": [ + "Below, we will configure the evaluation with the custom evaluator from above, as well as some pre-implemented run evaluators that do the following:\n", "- Compare results against ground truth labels.\n", "- Measure semantic (dis)similarity using embedding distance\n", "- Evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria\n", "\n", "For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n", - "custom evaluators, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/).\n" + "custom evaluators, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/)." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 28, "id": "a25dc281", "metadata": { "tags": [] @@ -421,7 +452,7 @@ " ],\n", " # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be\n", " # applied to each prediction. 
Check out the docs for examples.\n", - " custom_evaluators=[],\n", + " custom_evaluators=[check_not_idk],\n", ")" ] }, @@ -444,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 29, "id": "af8c8469-d70d-46d9-8fcd-517a1ccc7c4b", "metadata": {}, "outputs": [], @@ -457,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 33, "id": "3733269b-8085-4644-9d5d-baedcff13a2f", "metadata": { "tags": [] @@ -467,8 +498,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project 'runnable-agent-test-5d466cbc-bf2162aa' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/0c3d22fa-f8b0-4608-b086-2187c18361a5\n", + "View the evaluation results for project 'runnable-agent-test-5d466cbc-97e1' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/14d8a382-3c0f-48e7-b212-33489ee8a13e/compare?selectedSessions=62f0a0c0-73bf-420c-a907-2c6b2f4625c4\n", + "\n", + "View all tests for Dataset agent-qa-e2d24144 at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/14d8a382-3c0f-48e7-b212-33489ee8a13e\n", "[> ] 0/5" ] }, @@ -476,43 +510,264 @@ "name": "stderr", "output_type": "stream", "text": [ - "Chain failed for example 54b4fce8-4492-409d-94af-708f51698b39 with inputs {'input': 'Who trained Llama-v2?'}\n", - "Error Type: TypeError, Message: DuckDuckGoSearchResults._run() got an unexpected keyword argument 'arg1'\n" + "Server error caused failure to patch https://api.smith.langchain.com/runs/e9d26fe6-bf4a-4f88-81c5-f5d0f70977f0 in LangSmith API. HTTPError('500 Server Error: Internal Server Error for url: https://api.smith.langchain.com/runs/e9d26fe6-bf4a-4f88-81c5-f5d0f70977f0', '{\"detail\":\"Internal server error\"}')\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[------------------------------------------------->] 5/5\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean mode\n", - "embedding_cosine_distance 0.086614 0.118841 0.183672 0.151444 0.050158\n", - "correctness 0.000000 0.500000 1.000000 0.500000 0.000000\n", - "score_string:accuracy 0.775000 1.000000 1.000000 0.775000 1.000000\n", - "helpfulness 0.750000 1.000000 1.000000 0.750000 1.000000\n" + "[------------------------------------------------->] 5/5" ] + }, + { + "data": { + "text/html": [ + "

Experiment Results:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
<div>[HTML rendering of the aggregate feedback statistics for this test run; identical values appear in the text/plain output that follows]</div>
" + ], + "text/plain": [ + " feedback.correctness feedback.embedding_cosine_distance \\\n", + "count 5.000000 5.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 0.800000 0.110268 \n", + "std 0.447214 0.060680 \n", + "min 0.000000 0.049442 \n", + "25% 1.000000 0.064186 \n", + "50% 1.000000 0.092256 \n", + "75% 1.000000 0.153003 \n", + "max 1.000000 0.192453 \n", + "\n", + " feedback.helpfulness feedback.score_string:accuracy \\\n", + "count 5.000000 5.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 0.800000 0.640000 \n", + "std 0.447214 0.328634 \n", + "min 0.000000 0.100000 \n", + "25% 1.000000 0.700000 \n", + "50% 1.000000 0.700000 \n", + "75% 1.000000 0.700000 \n", + "max 1.000000 1.000000 \n", + "\n", + " feedback.not_uncertain error execution_time \\\n", + "count 5.0 0 5.000000 \n", + "unique NaN 0 NaN \n", + "top NaN NaN NaN \n", + "freq NaN NaN NaN \n", + "mean 1.0 NaN 6.338926 \n", + "std 0.0 NaN 2.525439 \n", + "min 1.0 NaN 3.858924 \n", + "25% 1.0 NaN 5.250572 \n", + "50% 1.0 NaN 5.817807 \n", + "75% 1.0 NaN 6.199289 \n", + "max 1.0 NaN 10.568036 \n", + "\n", + " run_id \n", + "count 5 \n", + "unique 5 \n", + "top e9440e33-e2c6-4bec-a190-2fa41947e9c5 \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ "import functools\n", "\n", - "from langchain.smith import (\n", - " arun_on_dataset,\n", - " run_on_dataset,\n", - ")\n", + "from langchain.smith import arun_on_dataset, run_on_dataset\n", "\n", "chain_results = run_on_dataset(\n", " dataset_name=dataset_name,\n", - " llm_or_chain_factory=functools.partial(agent_factory, prompt=prompt),\n", + " llm_or_chain_factory=functools.partial(\n", + " create_agent, prompt=prompt, llm_with_tools=llm_with_tools\n", + " ),\n", " evaluation=evaluation_config,\n", " verbose=True,\n", " client=client,\n", " project_name=f\"runnable-agent-test-5d466cbc-{unique_id}\",\n", - " tags=[\n", - " \"testing-notebook\",\n", - " \"prompt:5d466cbc\",\n", - " ], # Optional, adds a tag to the resulting chain runs\n", + " # Project metadata communicates the experiment parameters,\n", + " # Useful for reviewing the test results\n", + " project_metadata={\n", + " \"env\": \"testing-notebook\",\n", + " \"model\": \"gpt-3.5-turbo\",\n", + " \"prompt\": \"5d466cbc\",\n", + " },\n", ")\n", "\n", "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n", @@ -537,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 34, "id": "9da60638-5be8-4b5f-a721-2c6627aeaf0c", "metadata": {}, "outputs": [ @@ -562,108 +817,180 @@ " \n", " \n", " \n", - " embedding_cosine_distance\n", - " correctness\n", - " score_string:accuracy\n", - " helpfulness\n", - " input\n", - " output\n", - " reference\n", + " inputs.input\n", + " outputs.input\n", + " outputs.output\n", + " reference.output\n", + " feedback.correctness\n", + " feedback.embedding_cosine_distance\n", + " feedback.helpfulness\n", + " feedback.score_string:accuracy\n", + " feedback.not_uncertain\n", + " error\n", + " execution_time\n", + " run_id\n", " \n", " \n", " \n", " \n", - " 42b639a2-17c4-4031-88a9-0ce2c45781ce\n", - " 0.317938\n", - " 0.0\n", + " 63e7ff81-b3f6-40aa-81c8-3600c504d54f\n", + " When did langchain first announce the hub?\n", + " When did langchain first announce the hub?\n", + " LangChain first announced the LangChain Hub 
on...\n", + " September 5, 2023\n", + " 1\n", + " 0.153003\n", + " 1\n", " 1.0\n", - " 1.0\n", - " {'input': 'What is the langsmith cookbook?'}\n", - " {'input': 'What is the langsmith cookbook?', '...\n", - " {'output': 'September 5, 2023'}\n", + " 1\n", + " None\n", + " 5.250572\n", + " e9440e33-e2c6-4bec-a190-2fa41947e9c5\n", " \n", " \n", - " 54b4fce8-4492-409d-94af-708f51698b39\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " {'input': 'Who trained Llama-v2?'}\n", - " {'Error': 'TypeError(\"DuckDuckGoSearchResults....\n", - " {'output': 'The langsmith cookbook is a github...\n", + " 48c2b719-93a3-47cb-baa6-ee93ecb1ba30\n", + " What is the langsmith cookbook?\n", + " What is the langsmith cookbook?\n", + " The LangSmith Cookbook is a collection of reci...\n", + " The langsmith cookbook is a github repository ...\n", + " 1\n", + " 0.049442\n", + " 1\n", + " 0.7\n", + " 1\n", + " None\n", + " 10.568036\n", + " ad1d3050-dffb-45a2-ab9d-bbc88a702682\n", " \n", " \n", - " 8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e\n", - " 0.138916\n", - " 1.0\n", - " 1.0\n", - " 1.0\n", - " {'input': 'When was Llama-v2 released?'}\n", - " {'input': 'When was Llama-v2 released?', 'outp...\n", - " {'output': 'July 18, 2023'}\n", + " 96ed16c3-cbc6-4ebb-a335-ec70e64db109\n", + " When was Llama-v2 released?\n", + " When was Llama-v2 released?\n", + " Llama-v2 was released in 2023.\n", + " July 18, 2023\n", + " 1\n", + " 0.192453\n", + " 1\n", + " 0.7\n", + " 1\n", + " None\n", + " 6.199289\n", + " 646518d4-b4be-4b20-b5cb-9b86e0a8fe86\n", " \n", " \n", - " 678c0363-3ed1-410a-811f-ebadef2e783a\n", - " 0.050158\n", - " 1.0\n", - " 1.0\n", - " 1.0\n", - " {'input': 'What's LangSmith?'}\n", - " {'input': 'What's LangSmith?', 'output': 'Lang...\n", - " {'output': 'LangSmith is a unified platform fo...\n", + " 7342bb9c-9733-47bd-b88e-cd4ab4f4f82d\n", + " What's LangSmith?\n", + " What's LangSmith?\n", + " LangSmith is a platform that helps developers ...\n", + " LangSmith is a unified platform for debugging,...\n", + " 1\n", + " 0.064186\n", + " 1\n", + " 0.7\n", + " 1\n", + " None\n", + " 5.817807\n", + " 07d157f1-a18c-4173-bda0-1b3ac8d8dcf2\n", " \n", " \n", - " 762a616c-7aab-419c-9001-b43ab6200d26\n", - " 0.098766\n", - " 0.0\n", + " aee03fdf-c63e-4f63-9075-a90be3922c7f\n", + " What is LangChain?\n", + " What is LangChain?\n", + " LangChain is a decentralized blockchain platfo...\n", + " LangChain is an open-source framework for buil...\n", + " 0\n", + " 0.092256\n", + " 0\n", " 0.1\n", - " 0.0\n", - " {'input': 'What is LangChain?'}\n", - " {'input': 'What is LangChain?', 'output': 'Lan...\n", - " {'output': 'LangChain is an open-source framew...\n", + " 1\n", + " None\n", + " 3.858924\n", + " bf1bc257-2ab6-464e-b6d1-99f25b042383\n", " \n", " \n", "\n", "" ], "text/plain": [ - " embedding_cosine_distance correctness \\\n", - "42b639a2-17c4-4031-88a9-0ce2c45781ce 0.317938 0.0 \n", - "54b4fce8-4492-409d-94af-708f51698b39 NaN NaN \n", - "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e 0.138916 1.0 \n", - "678c0363-3ed1-410a-811f-ebadef2e783a 0.050158 1.0 \n", - "762a616c-7aab-419c-9001-b43ab6200d26 0.098766 0.0 \n", + " inputs.input \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f When did langchain first announce the hub? \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 What is the langsmith cookbook? \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 When was Llama-v2 released? \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d What's LangSmith? \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f What is LangChain? 
\n", "\n", - " score_string:accuracy helpfulness \\\n", - "42b639a2-17c4-4031-88a9-0ce2c45781ce 1.0 1.0 \n", - "54b4fce8-4492-409d-94af-708f51698b39 NaN NaN \n", - "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e 1.0 1.0 \n", - "678c0363-3ed1-410a-811f-ebadef2e783a 1.0 1.0 \n", - "762a616c-7aab-419c-9001-b43ab6200d26 0.1 0.0 \n", + " outputs.input \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f When did langchain first announce the hub? \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 What is the langsmith cookbook? \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 When was Llama-v2 released? \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d What's LangSmith? \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f What is LangChain? \n", "\n", - " input \\\n", - "42b639a2-17c4-4031-88a9-0ce2c45781ce {'input': 'What is the langsmith cookbook?'} \n", - "54b4fce8-4492-409d-94af-708f51698b39 {'input': 'Who trained Llama-v2?'} \n", - "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e {'input': 'When was Llama-v2 released?'} \n", - "678c0363-3ed1-410a-811f-ebadef2e783a {'input': 'What's LangSmith?'} \n", - "762a616c-7aab-419c-9001-b43ab6200d26 {'input': 'What is LangChain?'} \n", + " outputs.output \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f LangChain first announced the LangChain Hub on... \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 The LangSmith Cookbook is a collection of reci... \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 Llama-v2 was released in 2023. \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d LangSmith is a platform that helps developers ... \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f LangChain is a decentralized blockchain platfo... \n", "\n", - " output \\\n", - "42b639a2-17c4-4031-88a9-0ce2c45781ce {'input': 'What is the langsmith cookbook?', '... \n", - "54b4fce8-4492-409d-94af-708f51698b39 {'Error': 'TypeError(\"DuckDuckGoSearchResults.... \n", - "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e {'input': 'When was Llama-v2 released?', 'outp... \n", - "678c0363-3ed1-410a-811f-ebadef2e783a {'input': 'What's LangSmith?', 'output': 'Lang... \n", - "762a616c-7aab-419c-9001-b43ab6200d26 {'input': 'What is LangChain?', 'output': 'Lan... \n", + " reference.output \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f September 5, 2023 \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 The langsmith cookbook is a github repository ... \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 July 18, 2023 \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d LangSmith is a unified platform for debugging,... \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f LangChain is an open-source framework for buil... \n", "\n", - " reference \n", - "42b639a2-17c4-4031-88a9-0ce2c45781ce {'output': 'September 5, 2023'} \n", - "54b4fce8-4492-409d-94af-708f51698b39 {'output': 'The langsmith cookbook is a github... \n", - "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e {'output': 'July 18, 2023'} \n", - "678c0363-3ed1-410a-811f-ebadef2e783a {'output': 'LangSmith is a unified platform fo... \n", - "762a616c-7aab-419c-9001-b43ab6200d26 {'output': 'LangChain is an open-source framew... 
" + " feedback.correctness \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f 1 \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 1 \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 1 \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d 1 \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f 0 \n", + "\n", + " feedback.embedding_cosine_distance \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f 0.153003 \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 0.049442 \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 0.192453 \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d 0.064186 \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f 0.092256 \n", + "\n", + " feedback.helpfulness \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f 1 \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 1 \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 1 \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d 1 \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f 0 \n", + "\n", + " feedback.score_string:accuracy \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f 1.0 \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 0.7 \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 0.7 \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d 0.7 \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f 0.1 \n", + "\n", + " feedback.not_uncertain error \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f 1 None \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 1 None \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 1 None \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d 1 None \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f 1 None \n", + "\n", + " execution_time \\\n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f 5.250572 \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 10.568036 \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 6.199289 \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d 5.817807 \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f 3.858924 \n", + "\n", + " run_id \n", + "63e7ff81-b3f6-40aa-81c8-3600c504d54f e9440e33-e2c6-4bec-a190-2fa41947e9c5 \n", + "48c2b719-93a3-47cb-baa6-ee93ecb1ba30 ad1d3050-dffb-45a2-ab9d-bbc88a702682 \n", + "96ed16c3-cbc6-4ebb-a335-ec70e64db109 646518d4-b4be-4b20-b5cb-9b86e0a8fe86 \n", + "7342bb9c-9733-47bd-b88e-cd4ab4f4f82d 07d157f1-a18c-4173-bda0-1b3ac8d8dcf2 \n", + "aee03fdf-c63e-4f63-9075-a90be3922c7f bf1bc257-2ab6-464e-b6d1-99f25b042383 " ] }, - "execution_count": 13, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -684,7 +1011,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 35, "id": "5eeb023f-ded2-4d0f-b910-2a57d9675853", "metadata": {}, "outputs": [ @@ -692,16 +1019,239 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project 'runnable-agent-test-39f3bbd0-bf2162aa' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/fa721ccc-dd0f-41c9-bf80-22215c44efd4\n", - "[------------------------------------------------->] 5/5\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean mode\n", - "embedding_cosine_distance 0.059506 0.155538 0.212864 0.157915 0.043119\n", - "correctness 0.000000 0.000000 1.000000 0.400000 0.000000\n", - "score_string:accuracy 0.700000 1.000000 1.000000 0.880000 1.000000\n", - "helpfulness 1.000000 1.000000 1.000000 0.800000 1.000000\n" + "View the evaluation results for project 'runnable-agent-test-39f3bbd0-97e1' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/14d8a382-3c0f-48e7-b212-33489ee8a13e/compare?selectedSessions=7753a05e-8235-4bc2-a227-d0622c1a36a4\n", + "\n", + "View all tests for Dataset agent-qa-e2d24144 at:\n", + 
"https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/14d8a382-3c0f-48e7-b212-33489ee8a13e\n", + "[------------------------------------------------->] 5/5" ] + }, + { + "data": { + "text/html": [ + "

Experiment Results:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
<div>[HTML rendering of the aggregate feedback statistics for the candidate prompt test run; identical values appear in the text/plain output that follows]</div>
" + ], + "text/plain": [ + " feedback.correctness feedback.embedding_cosine_distance \\\n", + "count 5.000000 5.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 0.800000 0.119282 \n", + "std 0.447214 0.080145 \n", + "min 0.000000 0.043368 \n", + "25% 1.000000 0.053311 \n", + "50% 1.000000 0.107826 \n", + "75% 1.000000 0.153003 \n", + "max 1.000000 0.238903 \n", + "\n", + " feedback.helpfulness feedback.score_string:accuracy \\\n", + "count 5.0 5.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 1.0 0.820000 \n", + "std 0.0 0.216795 \n", + "min 1.0 0.500000 \n", + "25% 1.0 0.700000 \n", + "50% 1.0 0.900000 \n", + "75% 1.0 1.000000 \n", + "max 1.0 1.000000 \n", + "\n", + " feedback.not_uncertain error execution_time \\\n", + "count 5.0 0 5.000000 \n", + "unique NaN 0 NaN \n", + "top NaN NaN NaN \n", + "freq NaN NaN NaN \n", + "mean 1.0 NaN 7.655640 \n", + "std 0.0 NaN 2.665656 \n", + "min 1.0 NaN 5.409900 \n", + "25% 1.0 NaN 5.633545 \n", + "50% 1.0 NaN 6.451075 \n", + "75% 1.0 NaN 9.194739 \n", + "max 1.0 NaN 11.588943 \n", + "\n", + " run_id \n", + "count 5 \n", + "unique 5 \n", + "top c7128691-de05-41f4-8ffa-ddd12a640ba9 \n", + "freq 1 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -709,21 +1259,24 @@ "\n", "chain_results = run_on_dataset(\n", " dataset_name=dataset_name,\n", - " llm_or_chain_factory=functools.partial(agent_factory, prompt=candidate_prompt),\n", + " llm_or_chain_factory=functools.partial(\n", + " create_agent, prompt=candidate_prompt, llm_with_tools=llm_with_tools\n", + " ),\n", " evaluation=evaluation_config,\n", " verbose=True,\n", " client=client,\n", " project_name=f\"runnable-agent-test-39f3bbd0-{unique_id}\",\n", - " tags=[\n", - " \"testing-notebook\",\n", - " \"prompt:39f3bbd0\",\n", - " ], # Optional, adds a tag to the resulting chain runs\n", + " project_metadata={\n", + " \"env\": \"testing-notebook\",\n", + " \"model\": \"gpt-3.5-turbo\",\n", + " \"prompt\": \"39f3bbd0\",\n", + " },\n", ")" ] }, { "cell_type": "markdown", - "id": "591c819e-9932-45cf-adab-63727dd49559", + "id": "9fafd1dd-debf-4256-a609-a6b3a7c52c49", "metadata": {}, "source": [ "## Exporting datasets and runs\n", @@ -735,7 +1288,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 37, "id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820", "metadata": { "tags": [] @@ -747,12 +1300,27 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 38, "id": "6595c888-1f5c-4ae3-9390-0a559f5575d1", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'correctness': {'n': 5, 'avg': 0.8},\n", + " 'embedding_cosine_distance': {'n': 5, 'avg': 0.11926},\n", + " 'helpfulness': {'n': 5, 'avg': 1.0},\n", + " 'not_uncertain': {'n': 5, 'avg': 1.0},\n", + " 'score_string:accuracy': {'n': 5, 'avg': 0.82}}" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# After some time, these will be populated.\n", "client.read_project(project_name=chain_results[\"project_name\"]).feedback_stats" @@ -791,7 +1359,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.2" } }, "nbformat": 4,