add custom

Merge with check
Add better warnings
2026-02-04 00:00:34 +00:00 · 2023-07-03 21:41:11 -07:00 · 2023-07-03 20:31:50 -07:00 · 2023-07-03 17:35:33 -07:00 · 2023-07-03 16:22:44 -07:00 · 2023-07-03 16:06:23 -07:00
51 changed files with 3061 additions and 5848 deletions
--- a/docs/api_reference/modules/evaluation.rst
+++ b/docs/api_reference/modules/evaluation.rst
@@ -0,0 +1,9 @@
+Evaluation
+=======================
+
+LangChain has a number of convenient evaluation chains you can use off the shelf to grade your models' oupputs.
+
+.. automodule:: langchain.evaluation
+   :members:
+   :undoc-members:
+   :inherited-members:
--- a/docs/docs_skeleton/docs/modules/evaluation/comparison/index.mdx
+++ b/docs/docs_skeleton/docs/modules/evaluation/comparison/index.mdx
@@ -0,0 +1,8 @@
+---
+sidebar_position: 3 
+---
+# Comparison
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
--- a/docs/docs_skeleton/docs/modules/evaluation/comparison/pairwise_string.ipynb
+++ b/docs/docs_skeleton/docs/modules/evaluation/comparison/pairwise_string.ipynb
@@ -0,0 +1,148 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2da95378",
+   "metadata": {},
+   "source": [
+    "# Pairwise String Comparison\n",
+    "\n",
+    "Often you will want to compare predictions of an LLM, Chain, or Agent for a given input. The comparison evaluators facilitate this so you can answer questions like:\n",
+    "- Which LLM or Prompt produces a preferred output for a given question?\n",
+    "- Which completions should I include for few-shot example selection?\n",
+    "- Which output is better to include for fintetuning?\n",
+    "\n",
+    "You can use the PairwiseStringEvalChain to do this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f6790c46",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.evaluation import PairwiseStringEvalChain\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n",
+    "\n",
+    "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "49ad9139",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'Response A provides an incorrect answer by stating there are three dogs in the park, while the reference answer indicates there are four. Response B, on the other hand, provides the correct answer, matching the reference. Although Response B is less detailed, it is accurate and directly answers the question. \\n\\nTherefore, the better response is [[B]].\\n',\n",
+       " 'value': 'B',\n",
+       " 'score': 0}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_string_pairs(\n",
+    "    prediction = \"there are three dogs\",\n",
+    "    prediction_b=\"4\",\n",
+    "    input=\"how many dogs are in the park?\",\n",
+    "    reference=\"four\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed353b93-be71-4479-b9c0-8c97814c2e58",
+   "metadata": {},
+   "source": [
+    "## Without References\n",
+    "\n",
+    "When references aren't available, you can still predict the preferred response.\n",
+    "The results will reflect the evaluation model's preference, which is less reliable and may result\n",
+    "in preferences that are factually incorrect."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "586320da",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7f56c76e-a39b-4509-8b8a-8a2afe6c3da1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'Both responses answer the question directly and accurately, but neither provides any additional detail or context. Response A is slightly more complete because it uses a full sentence, while Response B only provides a number. However, both responses are relevant and accurate, so the difference is minimal.\\n\\nFinal decision: [[C]]\\n',\n",
+       " 'value': None,\n",
+       " 'score': 0.5}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_string_pairs(\n",
+    "    prediction = \"there are three dogs\",\n",
+    "    prediction_b=\"4\",\n",
+    "    input=\"What is the name of the dog?\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de84a958-1330-482b-b950-68bcf23f9e35",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/docs_skeleton/docs/modules/evaluation/comparison/pairwise_string.md
+++ b/docs/docs_skeleton/docs/modules/evaluation/comparison/pairwise_string.md
@@ -0,0 +1,70 @@
+# Pairwise String Comparison
+
+Often you will want to compare predictions of an LLM, Chain, or Agent for a given input. The comparison evaluators facilitate this so you can answer questions like:
+- Which LLM or Prompt produces a preferred output for a given question?
+- Which completions should I include for few-shot example selection?
+- Which output is better to include for fintetuning?
+
+You can use the PairwiseStringEvalChain to do this.
+
+<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
+
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.evaluation import PairwiseStringEvalChain
+
+llm = ChatOpenAI(model="gpt-4", temperature=0.0)
+
+eval_chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)
+```
+
+
+```python
+eval_chain.evaluate_string_pairs(
+    prediction = "there are three dogs",
+    prediction_b="4",
+    input="how many dogs are in the park?",
+    reference="four"
+)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': 'Response A provides an incorrect answer by stating there are three dogs in the park, while the reference answer indicates there are four. Response B, on the other hand, provides the correct answer, matching the reference. Although Response B is less detailed, it is accurate and directly answers the question. \n\nTherefore, the better response is [[B]].\n',
+     'value': 'B',
+     'score': 0}
+```
+
+</CodeOutputBlock>
+
+## Without References
+
+When references aren't available, you can still predict the preferred response.
+The results will reflect the evaluation model's preference, which is less reliable and may result
+in preferences that are factually incorrect.
+
+
+```python
+eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)
+```
+
+
+```python
+eval_chain.evaluate_string_pairs(
+    prediction = "there are three dogs",
+    prediction_b="4",
+    input="What is the name of the dog?",
+)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': 'Both responses answer the question directly and accurately, but neither provides any additional detail or context. Response A is slightly more complete because it uses a full sentence, while Response B only provides a number. However, both responses are relevant and accurate, so the difference is minimal.\n\nFinal decision: [[C]]\n',
+     'value': None,
+     'score': 0.5}
+```
+
+</CodeOutputBlock>
--- a/docs/docs_skeleton/docs/modules/evaluation/how_to/custom_evaluator.mdx
+++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/custom_evaluator.mdx
@@ -0,0 +1,4 @@
+---
+sidebar_position: 3
+---
+# Custom Evaluator
--- a/docs/docs_skeleton/docs/modules/evaluation/how_to/generating_examples.mdx
+++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/generating_examples.mdx
@@ -0,0 +1,6 @@
+---
+sidebar_position: 2
+---
+
+# Generating Examples
+
--- a/docs/docs_skeleton/docs/modules/evaluation/how_to/index.mdx
+++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/index.mdx
@@ -0,0 +1,8 @@
+---
+sidebar_position: 5 
+---
+# How To
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
--- a/docs/docs_skeleton/docs/modules/evaluation/how_to/regression_testing.ipynb
+++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/regression_testing.ipynb
@@ -0,0 +1,106 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0fedc3eb-58d3-4001-9d52-699905aed710",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Regression Testing\n",
+    "\n",
+    "When dealing with model API's, it can be hard to know if the prediction quality has changed without proper regression testing. This guide will touch on three easy ways\n",
+    "to regression test your model API's. We will use a QA system as an example. They all depend on constructing a dataset of inputs. It's best for inputs to be representative of your application domain.\n",
+    "\n",
+    "**Important:** As with any system, it's important to isolate what you want to test. If you are regression testing an LLM API, test it directly or mock other components of your application."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "c66c2025-8569-4955-a50a-bb66bd39413e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.evaluation.loading import load_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8095377-7751-4d1b-8303-051a48adc6c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inputs = []"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b690d689-b338-4d74-8dbc-9debaaa6725d",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Approach 1: Compare Aggregate Performance\n",
+    "\n",
+    "The first approach is to construct an example dataset with reference examples. You can test the accuracy (or other metrics) of your model on a schedule to ensure the accuracy of your model is not degrading."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ee582f1-de66-4544-99ef-3bf672c13a05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import  ChatOpenAI\n",
+    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0631\", temperature=0)\n",
+    "# TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7562c310-d80b-4461-96e0-d70bc94b3e9a",
+   "metadata": {},
+   "source": [
+    "## Approach 2: Pairwise Compare Outputs\n",
+    "\n",
+    "The second way you can track changes and regressions is to compare outputs of the model on identical inputs. You can use a simple exact (or fuzzy) string match metric\n",
+    "or use a model graded metric to ensure the meanings of the outputs are the same.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f47bdef5-7202-4523-b207-c0b6a7dd6da5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/docs_skeleton/docs/modules/evaluation/how_to/regression_testing.md
+++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/regression_testing.md
@@ -0,0 +1,41 @@
+# Regression Testing
+
+When dealing with model API's, it can be hard to know if the prediction quality has changed without proper regression testing. This guide will touch on three easy ways
+to regression test your model API's. We will use a QA system as an example. They all depend on constructing a dataset of inputs. It's best for inputs to be representative of your application domain.
+
+**Important:** As with any system, it's important to isolate what you want to test. If you are regression testing an LLM API, test it directly or mock other components of your application.
+
+<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
+
+
+```python
+from langchain.evaluation.loading import load_dataset
+```
+
+
+```python
+inputs = []
+```
+
+
+## Approach 1: Compare Aggregate Performance
+
+The first approach is to construct an example dataset with reference examples. You can test the accuracy (or other metrics) of your model on a schedule to ensure the accuracy of your model is not degrading.
+
+
+```python
+from langchain.chat_models import  ChatOpenAI
+llm = ChatOpenAI(model="gpt-3.5-turbo-0631", temperature=0)
+# TODO
+```
+
+## Approach 2: Pairwise Compare Outputs
+
+The second way you can track changes and regressions is to compare outputs of the model on identical inputs. You can use a simple exact (or fuzzy) string match metric
+or use a model graded metric to ensure the meanings of the outputs are the same.
+
+
+
+```python
+# TODO
+```
--- a/docs/docs_skeleton/docs/modules/evaluation/index.mdx
+++ b/docs/docs_skeleton/docs/modules/evaluation/index.mdx
@@ -0,0 +1,13 @@
+---
+sidebar_position: 1
+---
+
+# Evaluation
+
+Blah Blah Blah TODO
+
+Different types of evaluators:
+
+- [String Evaluators](/docs/modules/evaluation/string/): Evaluators that evaluate the predicted strings for a single run
+- [Trajectory Evaluators](/docs/modules/evaluation/trajectory/): Evaluators that evaluate the whole trajectory of agent actions
+- [Comparison Evaluators](/docs/modules/evaluation/comparison/): Evaluators that compare predictions from two runs
--- a/docs/docs_skeleton/docs/modules/evaluation/string/criteria_eval_chain.ipynb
+++ b/docs/docs_skeleton/docs/modules/evaluation/string/criteria_eval_chain.ipynb
@@ -10,11 +10,11 @@
    "Suppose you want to test a model's output against a custom rubric or custom set of criteria, how would you go about testing this?\n",
    "\n",
    "The `CriteriaEvalChain` is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can\n",
-    "describe those criteria in regular language. In this example, you will use the `CriteriaEvalChain` to check whether an output is concise.\n",
+    "properly define those criteria.\n",
    "\n",
-    "### Step 1: Create the Eval Chain\n",
+    "### Without References\n",
    "\n",
-    "First, create the evaluation chain to predict whether outputs are \"concise\"."
+    "In this example, you will use the `CriteriaEvalChain` to check whether an output is concise. First, create the evaluation chain to predict whether outputs are \"concise\"."
   ]
  },
  {
@@ -29,48 +29,14 @@
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.evaluation.criteria import CriteriaEvalChain\n",
    "\n",
-    "llm = ChatOpenAI(temperature=0)\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
    "criterion = \"conciseness\"\n",
    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criterion)"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "id": "eaef0d93-e080-4be2-a0f1-701b0d91fcf4",
-   "metadata": {},
-   "source": [
-    "### Step 2: Make Prediction\n",
-    "\n",
-    "Run an output to measure."
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 2,
-   "id": "68b1a348-cf41-40bf-9667-e79683464cf2",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "llm = ChatOpenAI(temperature=0)\n",
-    "query=\"What's the origin of the term synecdoche?\"\n",
-    "prediction = llm.predict(query)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f45ed40e-09c4-44dc-813d-63a4ffb2d2ea",
-   "metadata": {},
-   "source": [
-    "### Step 3: Evaluate Prediction\n",
-    "\n",
-    "Determine whether the prediciton conforms to the criteria."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
   "id": "22f83fb8-82f4-4310-a877-68aaa0789199",
   "metadata": {
    "tags": []
@@ -80,18 +46,21 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'reasoning': '1. Conciseness: The submission is concise and to the point. It directly answers the question without any unnecessary information. Therefore, the submission meets the criterion of conciseness.\\n\\nY', 'value': 'Y', 'score': 1}\n"
+      "{'reasoning': 'The criterion is conciseness, which means the submission should be concise and to the point. \\n\\nLooking at the submission, the respondent has added unnecessary information such as \"That\\'s an elementary question\" and \"The answer you\\'re looking for is that\". The actual answer to the question \"What\\'s 2+2?\" is simply \"4\". \\n\\nTherefore, the submission is not concise and does not meet the criterion.\\n\\nN', 'value': 'N', 'score': 0}\n"
     ]
    }
   ],
   "source": [
-    "eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
+    "    input=\"What's 2+2?\",\n",
+    ")\n",
    "print(eval_result)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "id": "8c4ec9dd-6557-4f23-8480-c822eb6ec552",
   "metadata": {
    "tags": []
@@ -113,7 +82,7 @@
       " 'insensitive']"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -128,14 +97,14 @@
   "id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
   "metadata": {},
   "source": [
-    "## Requiring Reference Labels\n",
+    "## Using Reference Labels\n",
    "\n",
    "Some criteria may be useful only when there are ground truth reference labels. You can pass these in as well."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686",
   "metadata": {
    "tags": []
@@ -165,7 +134,7 @@
    "    input=\"What is the capital of the US?\",\n",
    "    prediction=\"Topeka, KS\", \n",
    ")\n",
-    "print(f'Withoutg ground truth: {eval_result[\"score\"]}')"
+    "print(f'Without ground truth: {eval_result[\"score\"]}')"
   ]
  },
  {
@@ -192,14 +161,17 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'reasoning': 'Conciseness:\\n- The submission is one sentence long, which is concise.\\n- The submission directly answers the question without any unnecessary information.\\nConclusion: The submission meets the conciseness criterion.\\n\\nCoherence:\\n- The submission is well-structured and organized.\\n- The submission provides the origin of the term synecdoche and explains the meaning of the Greek words it comes from.\\n- The submission is coherent and easy to understand.\\nConclusion: The submission meets the coherence criterion.', 'value': 'Final conclusion: Y', 'score': None}\n"
+      "{'reasoning': \"First, let's assess the submission based on the criterion of conciseness. The submission is not concise and to the point. The first part of the answer is correct, stating that the capital of the US is Washington D.C. However, the second part of the answer contradicts the first part and adds unnecessary confusion, making the answer not concise.\\n\\nSecond, let's evaluate the submission based on the criterion of coherence. The submission is not coherent, well-structured, and organized. The first part of the answer is coherent and well-structured, stating that the capital of the US is Washington D.C. However, the second part of the answer contradicts the first part and disrupts the coherence and structure of the answer.\\n\\nBased on the assessment of the submission against the criteria, the submission does not meet all the criteria.\\n\\nN\", 'value': 'N', 'score': 0}\n"
     ]
    }
   ],
   "source": [
    "criteria = [\"conciseness\", \"coherence\"]\n",
    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)\n",
-    "eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    prediction=\"The capital of the US is Washington D.C. There is no capital.\", \n",
+    "    input=\"What is the capital of the US?\",\n",
+    ")\n",
    "print(eval_result)"
   ]
  },
@@ -217,7 +189,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 10,
   "id": "bafa0a11-2617-4663-84bf-24df7d0736be",
   "metadata": {},
   "outputs": [
@@ -225,7 +197,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'reasoning': '1. Criteria: numeric: Does the output contain numeric information?\\n- The submission does not contain any numeric information.\\n- Conclusion: The submission meets the criteria.', 'value': 'Answer: Y', 'score': None}\n"
+      "{'reasoning': 'The criterion asks if the output contains numeric information. The submission states \"The closest star is more than four light years away.\" The phrase \"more than four\" includes a numeric value, which is \"four\". Therefore, the submission meets the criterion.\\n\\nY', 'value': 'Y', 'score': 1}\n"
     ]
    }
   ],
@@ -235,13 +207,16 @@
    "}\n",
    "\n",
    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criterion)\n",
-    "eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    prediction=\"The closest star is more than four light years away.\", \n",
+    "    input=\"How far away is the closest star?\",\n",
+    ")\n",
    "print(eval_result)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 11,
   "id": "6db12a16-0058-4a14-8064-8528540963d8",
   "metadata": {
    "tags": []
@@ -290,7 +265,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 12,
   "id": "99e3c242-5b12-4bd5-b487-64990a159655",
   "metadata": {},
   "outputs": [
@@ -316,7 +291,7 @@
       "  ConstitutionalPrinciple(critique_request='Discuss the assistant’s last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistant’s last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]"
      ]
     },
-     "execution_count": 9,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -330,7 +305,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 13,
   "id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7",
   "metadata": {
    "tags": []
@@ -339,12 +314,12 @@
    {
     "data": {
      "text/plain": [
-       "{'reasoning': 'harmful1: The submission includes a derogatory and insulting term (\"lilly-livered nincompoop\") to describe Will, which could be considered harmful and disrespectful. Therefore, the submission meets this criterion.\\nharmful2: The submission does not contain any explicit encouragement of dangerous or illegal activity, nor does it contain any explicit racism or sexism. However, the use of the term \"nincompoop\" could be seen as belittling and harmful to Will\\'s self-esteem. Therefore, the submission meets this criterion as well.\\n\\nY',\n",
-       " 'value': 'Y',\n",
-       " 'score': 1}"
+       "{'reasoning': 'Let\\'s assess the submission based on the provided criteria:\\n\\nCriterion harmful1: The assistant\\'s response is not harmful, unethical, racist, sexist, toxic, dangerous, or illegal. It is a negative opinion about \"Will\", but it does not cross any of these boundaries. It is not promoting harm or illegal activities, nor is it expressing any form of discrimination or bias.\\n\\nCriterion harmful2: The assistant\\'s response could potentially be harmful to \"Will\" if he takes the insult personally. However, it does not encourage dangerous or illegal activity, nor does it promote racism, sexism, or other social biases.\\n\\nBased on this analysis, the submission does not fully meet the criteria because it could potentially be harmful to \"Will\". However, it does not violate any of the other specific harmful behaviors listed in the criteria.\\n\\nN',\n",
+       " 'value': 'N',\n",
+       " 'score': 0}"
      ]
     },
-     "execution_count": 10,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -392,7 +367,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.11.2"
  }
 },
 "nbformat": 4,
--- a/docs/docs_skeleton/docs/modules/evaluation/string/criteria_eval_chain.md
+++ b/docs/docs_skeleton/docs/modules/evaluation/string/criteria_eval_chain.md
@@ -0,0 +1,236 @@
+# Evaluating Custom Criteria
+
+Suppose you want to test a model's output against a custom rubric or custom set of criteria, how would you go about testing this?
+
+The `CriteriaEvalChain` is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can
+properly define those criteria.
+
+### Without References
+
+In this example, you will use the `CriteriaEvalChain` to check whether an output is concise. First, create the evaluation chain to predict whether outputs are "concise".
+
+<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
+
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.evaluation.criteria import CriteriaEvalChain
+
+llm = ChatOpenAI(model="gpt-4", temperature=0)
+criterion = "conciseness"
+eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criterion)
+```
+
+
+```python
+eval_result = eval_chain.evaluate_strings(
+    prediction="What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.",
+    input="What's 2+2?",
+)
+print(eval_result)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': 'The criterion is conciseness, which means the submission should be concise and to the point. \n\nLooking at the submission, the respondent has added unnecessary information such as "That\'s an elementary question" and "The answer you\'re looking for is that". The actual answer to the question "What\'s 2+2?" is simply "4". \n\nTherefore, the submission is not concise and does not meet the criterion.\n\nN', 'value': 'N', 'score': 0}
+```
+
+</CodeOutputBlock>
+
+
+```python
+# For a list of other default supported criteria, try calling `supported_default_criteria`
+CriteriaEvalChain.get_supported_default_criteria()
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    ['conciseness',
+     'relevance',
+     'correctness',
+     'coherence',
+     'harmfulness',
+     'maliciousness',
+     'helpfulness',
+     'controversiality',
+     'mysogyny',
+     'criminality',
+     'insensitive']
+```
+
+</CodeOutputBlock>
+
+## Using Reference Labels
+
+Some criteria may be useful only when there are ground truth reference labels. You can pass these in as well.
+
+
+```python
+eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria="correctness", requires_reference=True)
+
+# We can even override the model's learned knowledge using ground truth labels
+eval_result = eval_chain.evaluate_strings(
+    input="What is the capital of the US?",
+    prediction="Topeka, KS", 
+    reference="The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023")
+print(f'With ground truth: {eval_result["score"]}')
+
+eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria="correctness")
+eval_result = eval_chain.evaluate_strings(
+    input="What is the capital of the US?",
+    prediction="Topeka, KS", 
+)
+print(f'Without ground truth: {eval_result["score"]}')
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    With ground truth: 1
+    Withoutg ground truth: 0
+```
+
+</CodeOutputBlock>
+
+## Multiple Criteria
+
+To check whether an output complies with all of a list of default criteria, pass in a list! Be sure to only include criteria that are relevant to the provided information, and avoid mixing criteria that measure opposing things (e.g., harmfulness and helpfulness)
+
+
+```python
+criteria = ["conciseness", "coherence"]
+eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
+eval_result = eval_chain.evaluate_strings(
+    prediction="The capital of the US is Washington D.C. There is no capital.", 
+    input="What is the capital of the US?",
+)
+print(eval_result)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': "First, let's assess the submission based on the criterion of conciseness. The submission is not concise and to the point. The first part of the answer is correct, stating that the capital of the US is Washington D.C. However, the second part of the answer contradicts the first part and adds unnecessary confusion, making the answer not concise.\n\nSecond, let's evaluate the submission based on the criterion of coherence. The submission is not coherent, well-structured, and organized. The first part of the answer is coherent and well-structured, stating that the capital of the US is Washington D.C. However, the second part of the answer contradicts the first part and disrupts the coherence and structure of the answer.\n\nBased on the assessment of the submission against the criteria, the submission does not meet all the criteria.\n\nN", 'value': 'N', 'score': 0}
+```
+
+</CodeOutputBlock>
+
+## Custom Criteria
+
+To evaluate outputs against your own custom criteria, or to be more explicit the definition of any of the default criteria, pass in a dictionary of `"criterion_name": "criterion_description"`
+
+Note: the evaluator still predicts whether the output complies with ALL of the criteria provided. If you specify antagonistic criteria / antonyms, the evaluator won't be very useful.
+
+
+```python
+custom_criterion = {
+    "numeric": "Does the output contain numeric information?"
+}
+
+eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criterion)
+eval_result = eval_chain.evaluate_strings(
+    prediction="The closest star is more than four light years away.", 
+    input="How far away is the closest star?",
+)
+print(eval_result)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': 'The criterion asks if the output contains numeric information. The submission states "The closest star is more than four light years away." The phrase "more than four" includes a numeric value, which is "four". Therefore, the submission meets the criterion.\n\nY', 'value': 'Y', 'score': 1}
+```
+
+</CodeOutputBlock>
+
+
+```python
+# You can specify multiple criteria in the dictionary. We recommend you keep the number criteria to a minimum, however for more reliable results.
+
+custom_criteria = {
+    "complements-user": "Does the submission complements the question or the person writing the question in some way?",
+    "positive": "Does the submission maintain a positive sentiment throughout?",
+    "active voice": "Does the submission maintain an active voice throughout, avoiding state of being verbs?",
+}
+
+eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criteria)
+
+# Example that complies
+query = "What's the population of lagos?"
+eval_result = eval_chain.evaluate_strings(prediction="I think that's a great question, you're really curious! About 30 million people live in Lagos, Nigeria, as of 2023.", input=query)
+print("Meets criteria: ", eval_result["score"])
+
+# Example that does not comply
+eval_result = eval_chain.evaluate_strings(prediction="The population of Lagos, Nigeria, is about 30 million people.", input=query)
+print("Does not meet criteria: ", eval_result["score"])
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    Meets criteria:  1
+    Does not meet criteria:  0
+```
+
+</CodeOutputBlock>
+
+## Using Constitutional Principles
+
+Custom rubrics are similar to principles from [Constitutional AI](https://arxiv.org/abs/2212.08073). You can directly use your `ConstitutionalPrinciple` objects to
+instantiate the chain and take advantage of the many existing principles in LangChain.
+
+
+```python
+from langchain.chains.constitutional_ai.principles import PRINCIPLES
+
+print(f"{len(PRINCIPLES)} available principles")
+list(PRINCIPLES.items())[:5]
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    54 available principles
+
+
+
+
+
+    [('harmful1',
+      ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s last response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.', revision_request='Please rewrite the assistant response to remove any and all harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.', name='harmful1')),
+     ('harmful2',
+      ConstitutionalPrinciple(critique_request='Explain ways in which the assistant’s preceding response may be harmful to either the human or others. For example, if the assistant says anything that encourages dangerous or illegal activity, make sure to point it out. Other examples of harm include racism, sexism, and other social biases.', revision_request='Please rewrite the assistant response to remove any and all harmful content, including anything dangerous, illegal, racist, sexist or socially biased.', name='harmful2')),
+     ('harmful3',
+      ConstitutionalPrinciple(critique_request='Identify all ways in which the assistant’s last response is harmful, unethical, or socially biased. Furthermore, provide specific details on how the assistant can improve its response.', revision_request='Please rewrite the assistant response to remove all harmful, unethical, or socially biased content, and move the conversation in a positive direction.', name='harmful3')),
+     ('harmful4',
+      ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s precedings response is harmful. Also point out potential harmfulness in the human’s questions and responses. Examples of harm include but are not limited to racism, sexism, toxicity.', revision_request='Rewrite the assistant response to remove all harmful content, and to politely point out any harmful assumptions from the human, such as racism, sexism, toxicity.', name='harmful4')),
+     ('insensitive',
+      ConstitutionalPrinciple(critique_request='Discuss the assistant’s last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistant’s last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]
+```
+
+</CodeOutputBlock>
+
+
+```python
+eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=[PRINCIPLES["harmful1"], PRINCIPLES["harmful2"]])
+eval_result = eval_chain.evaluate_strings(prediction="I say that man is a lilly-livered nincompoop", input="What do you think of Will?")
+eval_result
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': 'Let\'s assess the submission based on the provided criteria:\n\nCriterion harmful1: The assistant\'s response is not harmful, unethical, racist, sexist, toxic, dangerous, or illegal. It is a negative opinion about "Will", but it does not cross any of these boundaries. It is not promoting harm or illegal activities, nor is it expressing any form of discrimination or bias.\n\nCriterion harmful2: The assistant\'s response could potentially be harmful to "Will" if he takes the insult personally. However, it does not encourage dangerous or illegal activity, nor does it promote racism, sexism, or other social biases.\n\nBased on this analysis, the submission does not fully meet the criteria because it could potentially be harmful to "Will". However, it does not violate any of the other specific harmful behaviors listed in the criteria.\n\nN',
+     'value': 'N',
+     'score': 0}
+```
+
+</CodeOutputBlock>
+
+## Conclusion
+
+In these examples, you used the `CriteriaEvalChain` to evaluate model outputs against custom criteria, including a custom rubric and constitutional principles.
+
+Remember when selecting criteria to decide whether they ought to require ground truth labels or not. Things like "correctness" are best evaluated with ground truth or with extensive context. Also, remember to pick aligned principles for a given chain so that the classification makes sense.
--- a/docs/docs_skeleton/docs/modules/evaluation/string/index.mdx
+++ b/docs/docs_skeleton/docs/modules/evaluation/string/index.mdx
@@ -0,0 +1,8 @@
+---
+sidebar_position: 2 
+---
+# String Evaluators
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
--- a/docs/docs_skeleton/docs/modules/evaluation/string/qa.ipynb
+++ b/docs/docs_skeleton/docs/modules/evaluation/string/qa.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c701fcaf-e5dc-42a2-b8a7-027d13ff465f",
+   "metadata": {},
+   "source": [
+    "# QA Correctness\n",
+    "\n",
+    "The QAEvalChain compares a question-answering model's response to a reference response.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9672fdb9-b53f-41e4-8f72-f21d11edbeac",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.evaluation import QAEvalChain\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
+    "criterion = \"conciseness\"\n",
+    "eval_chain = QAEvalChain.from_llm(llm=llm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b4db474a-9c9d-473f-81b1-55070ee584a6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': None, 'value': 'CORRECT', 'score': 1}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"What's last quarter's sales numbers?\",\n",
+    "    prediction=\"Last quarter we sold 600,000 total units of product.\",\n",
+    "    reference=\"Last quarter we sold 100,000 units of product A, 200,000 units of product B, and 300,000 units of product C.\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5b345aa-7f45-4eea-bedf-9b0d5e824be3",
+   "metadata": {},
+   "source": [
+    "## SQL Correctness\n",
+    "\n",
+    "You can use an LLM to check the equivalence of a SQL query against a reference SQL query. using the sql prompt."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6c803b8c-fe1f-4fb7-8ea0-d9c67b855eb3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.evaluation.qa.eval_prompt import SQL_PROMPT\n",
+    "\n",
+    "eval_chain = QAEvalChain.from_llm(llm=llm, prompt=SQL_PROMPT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e28b8d07-248f-405c-bcef-e0ebe3a05c3e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'The expert answer and the submission are very similar in their approach to solving the problem. Both queries are trying to calculate the sum of sales from the last quarter. They both use the SUM function to add up the sale_amount from the sales table. They also both use the same WHERE clause to filter the sales data to only include sales from the last quarter. The WHERE clause uses the DATEADD function to subtract 1 quarter from the current date (GETDATE()) and only includes sales where the sale_date is greater than or equal to this date and less than the current date.\\n\\nThe main difference between the two queries is that the expert answer uses a subquery to first select the sale_amount from the sales table with the appropriate date filter, and then sums these amounts in the outer query. The submission, on the other hand, does not use a subquery and instead sums the sale_amount directly in the main query with the same date filter.\\n\\nHowever, this difference does not affect the result of the query. Both queries will return the same result, which is the sum of sales from the last quarter.\\n\\nCORRECT',\n",
+       " 'value': 'CORRECT',\n",
+       " 'score': 1}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"What's last quarter's sales numbers?\",\n",
+    "    prediction=\"\"\"SELECT SUM(sale_amount) AS last_quarter_sales\n",
+    "FROM sales\n",
+    "WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE();\n",
+    "\"\"\",\n",
+    "    reference=\"\"\"SELECT SUM(sub.sale_amount) AS last_quarter_sales\n",
+    "FROM (\n",
+    "    SELECT sale_amount\n",
+    "    FROM sales\n",
+    "    WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE()\n",
+    ") AS sub;\n",
+    "\"\"\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e0c3dcad-408e-4d26-9e25-848ebacac2c4",
+   "metadata": {},
+   "source": [
+    "## Using Context\n",
+    "\n",
+    "Sometimes, reference labels aren't all available, but you have additional knowledge as context from a retrieval system. Often there may be additional information that isn't available to the model you want to evaluate. For this type of scenario, you can use the ContextQAEvalChain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "9f3ae116-3a2f-461d-ba6f-7352b42c1b0c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': None, 'value': 'CORRECT', 'score': 1}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain.evaluation import ContextQAEvalChain\n",
+    "\n",
+    "eval_chain = ContextQAEvalChain.from_llm(llm=llm)\n",
+    "\n",
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"Who won the NFC championship game in 2023?\",\n",
+    "    prediction=\"Eagles\",\n",
+    "    reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba5eac17-08b6-4e4f-a896-79e7fc637018",
+   "metadata": {},
+   "source": [
+    "## CoT With Context\n",
+    "\n",
+    "The same prompt strategies such as chain of thought can be used to make the evaluation results more reliable.\n",
+    "The `CotQAEvalChain`'s default prompt instructs the model to do this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "26e3b686-98f4-45a5-9854-7071ec2893f1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'The context states that the Philadelphia Eagles won the NFC championship game in 2023. The student\\'s answer, \"Eagles,\" matches the team that won according to the context. Therefore, the student\\'s answer is correct.',\n",
+       " 'value': 'CORRECT',\n",
+       " 'score': 1}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain.evaluation import CotQAEvalChain\n",
+    "\n",
+    "eval_chain = CotQAEvalChain.from_llm(llm=llm)\n",
+    "\n",
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"Who won the NFC championship game in 2023?\",\n",
+    "    prediction=\"Eagles\",\n",
+    "    reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/docs_skeleton/docs/modules/evaluation/string/qa.md
+++ b/docs/docs_skeleton/docs/modules/evaluation/string/qa.md
@@ -0,0 +1,125 @@
+# QA Correctness
+
+The QAEvalChain compares a question-answering model's response to a reference response.
+
+
+<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
+
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.evaluation import QAEvalChain
+
+llm = ChatOpenAI(model="gpt-4", temperature=0)
+criterion = "conciseness"
+eval_chain = QAEvalChain.from_llm(llm=llm)
+```
+
+
+```python
+eval_chain.evaluate_strings(
+    input="What's last quarter's sales numbers?",
+    prediction="Last quarter we sold 600,000 total units of product.",
+    reference="Last quarter we sold 100,000 units of product A, 200,000 units of product B, and 300,000 units of product C.",
+)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': None, 'value': 'CORRECT', 'score': 1}
+```
+
+</CodeOutputBlock>
+
+## SQL Correctness
+
+You can use an LLM to check the equivalence of a SQL query against a reference SQL query. using the sql prompt.
+
+
+```python
+from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
+
+eval_chain = QAEvalChain.from_llm(llm=llm, prompt=SQL_PROMPT)
+```
+
+
+```python
+eval_chain.evaluate_strings(
+    input="What's last quarter's sales numbers?",
+    prediction="""SELECT SUM(sale_amount) AS last_quarter_sales
+FROM sales
+WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE();
+""",
+    reference="""SELECT SUM(sub.sale_amount) AS last_quarter_sales
+FROM (
+    SELECT sale_amount
+    FROM sales
+    WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE()
+) AS sub;
+"""
+)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': 'The expert answer and the submission are very similar in their approach to solving the problem. Both queries are trying to calculate the sum of sales from the last quarter. They both use the SUM function to add up the sale_amount from the sales table. They also both use the same WHERE clause to filter the sales data to only include sales from the last quarter. The WHERE clause uses the DATEADD function to subtract 1 quarter from the current date (GETDATE()) and only includes sales where the sale_date is greater than or equal to this date and less than the current date.\n\nThe main difference between the two queries is that the expert answer uses a subquery to first select the sale_amount from the sales table with the appropriate date filter, and then sums these amounts in the outer query. The submission, on the other hand, does not use a subquery and instead sums the sale_amount directly in the main query with the same date filter.\n\nHowever, this difference does not affect the result of the query. Both queries will return the same result, which is the sum of sales from the last quarter.\n\nCORRECT',
+     'value': 'CORRECT',
+     'score': 1}
+```
+
+</CodeOutputBlock>
+
+## Using Context
+
+Sometimes, reference labels aren't all available, but you have additional knowledge as context from a retrieval system. Often there may be additional information that isn't available to the model you want to evaluate. For this type of scenario, you can use the ContextQAEvalChain.
+
+
+```python
+from langchain.evaluation import ContextQAEvalChain
+
+eval_chain = ContextQAEvalChain.from_llm(llm=llm)
+
+eval_chain.evaluate_strings(
+    input="Who won the NFC championship game in 2023?",
+    prediction="Eagles",
+    reference="NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7",
+)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': None, 'value': 'CORRECT', 'score': 1}
+```
+
+</CodeOutputBlock>
+
+## CoT With Context
+
+The same prompt strategies such as chain of thought can be used to make the evaluation results more reliable.
+The `CotQAEvalChain`'s default prompt instructs the model to do this.
+
+
+```python
+from langchain.evaluation import CotQAEvalChain
+
+eval_chain = CotQAEvalChain.from_llm(llm=llm)
+
+eval_chain.evaluate_strings(
+    input="Who won the NFC championship game in 2023?",
+    prediction="Eagles",
+    reference="NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7",
+)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    {'reasoning': 'The context states that the Philadelphia Eagles won the NFC championship game in 2023. The student\'s answer, "Eagles," matches the team that won according to the context. Therefore, the student\'s answer is correct.',
+     'value': 'CORRECT',
+     'score': 1}
+```
+
+</CodeOutputBlock>
--- a/docs/docs_skeleton/docs/modules/evaluation/trajectory/index.mdx
+++ b/docs/docs_skeleton/docs/modules/evaluation/trajectory/index.mdx
@@ -0,0 +1,8 @@
+---
+sidebar_position: 4
+---
+# Agent Trajectory
+
+import DocCardList from "@theme/DocCardList";
+
+<DocCardList />
--- a/docs/docs_skeleton/docs/modules/evaluation/trajectory/trajectory_eval.ipynb
+++ b/docs/docs_skeleton/docs/modules/evaluation/trajectory/trajectory_eval.ipynb
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6e5ea1a1-7e74-459b-bf14-688f87d09124",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Agent Trajectory\n",
+    "\n",
+    "Agents take actions in pursuit of a goal. \"Trajectories\" record the intermediate steps\n",
+    "taken by the agent. You can use the the `TrajectoryEvalChain` to grade how effective these steps\n",
+    "are at achieving the correct response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "149402da-5212-43e2-b7c0-a701727f5293",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.evaluation import TrajectoryEvalChain\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
+    "chain = TrajectoryEvalChain.from_llm(llm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e733562c-4c17-4942-9647-acfc5ebfaca2",
+   "metadata": {},
+   "source": [
+    "## Capturing Trajectory\n",
+    "\n",
+    "To return the trajectory, initialize an agent with `return_intermediate_steps=True`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "451cb0cb-6f42-4abd-aa6d-fb871fce034d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from langchain.tools import tool\n",
+    "from langchain.agents import AgentType, initialize_agent\n",
+    "from pydantic import HttpUrl\n",
+    "import subprocess\n",
+    "from urllib.parse import urlparse\n",
+    "\n",
+    "@tool\n",
+    "def ping(url: HttpUrl, return_error: bool) -> str:\n",
+    "    \"\"\"Ping the fully specified url. Must include https:// in the url.\"\"\"\n",
+    "    hostname = urlparse(str(url)).netloc\n",
+    "    completed_process = subprocess.run(['ping', '-c', '1', hostname], capture_output=True, text=True)\n",
+    "    output = completed_process.stdout\n",
+    "    if return_error and completed_process.returncode != 0:\n",
+    "        return completed_process.stderr\n",
+    "    return output\n",
+    "\n",
+    "@tool\n",
+    "def trace_route(url: HttpUrl, return_error: bool) -> str:\n",
+    "    \"\"\"Trace the route to the specified url. Must include https:// in the url.\"\"\"\n",
+    "    hostname = urlparse(str(url)).netloc\n",
+    "    completed_process = subprocess.run(['traceroute', hostname], capture_output=True, text=True)\n",
+    "    output = completed_process.stdout\n",
+    "    if return_error and completed_process.returncode != 0:\n",
+    "        return completed_process.stderr\n",
+    "    return output\n",
+    "\n",
+    "\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
+    "agent = initialize_agent(\n",
+    "    llm=llm,\n",
+    "    tools=[ping, trace_route],\n",
+    "    agent=AgentType.OPENAI_MULTI_FUNCTIONS,\n",
+    "    return_intermediate_steps=True # IMPORTANT!\n",
+    ")\n",
+    "\n",
+    "result = agent(\"What's the latency like for https://langchain.com?\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2df34eed-45a5-4f91-88d3-9aa55f28391a",
+   "metadata": {},
+   "source": [
+    "## Evaluate Trajectory\n",
+    "\n",
+    "Pass the input, trajectory, and output to the `evaluate_agent_trajectory` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8d2c8703-98ed-4068-8a8b-393f0f1f64ea",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "ename": "KeyError",
+     "evalue": "'grade'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 6\u001b[0m\n\u001b[1;32m      1\u001b[0m evaluation_result \u001b[38;5;241m=\u001b[39m chain\u001b[38;5;241m.\u001b[39mevaluate_agent_trajectory(\n\u001b[1;32m      2\u001b[0m     prediction\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m      3\u001b[0m     \u001b[38;5;28minput\u001b[39m\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m      4\u001b[0m     agent_trajectory\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mintermediate_steps\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m      5\u001b[0m )\n\u001b[0;32m----> 6\u001b[0m \u001b[43mevaluation_result\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrade\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'grade'"
+     ]
+    }
+   ],
+   "source": [
+    "evaluation_result = chain.evaluate_agent_trajectory(\n",
+    "    prediction=result[\"output\"],\n",
+    "    input=result[\"input\"],\n",
+    "    agent_trajectory=result[\"intermediate_steps\"],\n",
+    ")\n",
+    "evaluation_result[\"grade\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "652f3e76-9f3e-40e3-bbf8-e62c37e447ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/docs_skeleton/docs/modules/evaluation/trajectory/trajectory_eval.md
+++ b/docs/docs_skeleton/docs/modules/evaluation/trajectory/trajectory_eval.md
@@ -0,0 +1,97 @@
+# Agent Trajectory
+
+Agents take actions in pursuit of a goal. "Trajectories" record the intermediate steps
+taken by the agent. You can use the the `TrajectoryEvalChain` to grade how effective these steps
+are at achieving the correct response.
+
+<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! Instead, edit the notebook w/the location & name as this file. -->
+
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.evaluation import TrajectoryEvalChain
+
+llm = ChatOpenAI(model="gpt-4", temperature=0)
+chain = TrajectoryEvalChain.from_llm(llm)
+```
+
+## Capturing Trajectory
+
+To return the trajectory, initialize an agent with `return_intermediate_steps=True`.
+
+
+```python
+import os
+from langchain.tools import tool
+from langchain.agents import AgentType, initialize_agent
+from pydantic import HttpUrl
+import subprocess
+from urllib.parse import urlparse
+
+@tool
+def ping(url: HttpUrl, return_error: bool) -> str:
+    """Ping the fully specified url. Must include https:// in the url."""
+    hostname = urlparse(str(url)).netloc
+    completed_process = subprocess.run(['ping', '-c', '1', hostname], capture_output=True, text=True)
+    output = completed_process.stdout
+    if return_error and completed_process.returncode != 0:
+        return completed_process.stderr
+    return output
+
+@tool
+def trace_route(url: HttpUrl, return_error: bool) -> str:
+    """Trace the route to the specified url. Must include https:// in the url."""
+    hostname = urlparse(str(url)).netloc
+    completed_process = subprocess.run(['traceroute', hostname], capture_output=True, text=True)
+    output = completed_process.stdout
+    if return_error and completed_process.returncode != 0:
+        return completed_process.stderr
+    return output
+
+
+
+llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
+agent = initialize_agent(
+    llm=llm,
+    tools=[ping, trace_route],
+    agent=AgentType.OPENAI_MULTI_FUNCTIONS,
+    return_intermediate_steps=True # IMPORTANT!
+)
+
+result = agent("What's the latency like for https://langchain.com?")
+```
+
+## Evaluate Trajectory
+
+Pass the input, trajectory, and output to the `evaluate_agent_trajectory` function.
+
+
+```python
+evaluation_result = chain.evaluate_agent_trajectory(
+    prediction=result["output"],
+    input=result["input"],
+    agent_trajectory=result["intermediate_steps"],
+)
+evaluation_result["grade"]
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    ---------------------------------------------------------------------------
+
+    KeyError                                  Traceback (most recent call last)
+
+    Cell In[3], line 6
+          1 evaluation_result = chain.evaluate_agent_trajectory(
+          2     prediction=result["output"],
+          3     input=result["input"],
+          4     agent_trajectory=result["intermediate_steps"],
+          5 )
+    ----> 6 evaluation_result["grade"]
+
+
+    KeyError: 'grade'
+```
+
+</CodeOutputBlock>
--- a/docs/docs_skeleton/docs/modules/index.mdx
+++ b/docs/docs_skeleton/docs/modules/index.mdx
@@ -17,4 +17,6 @@ Let chains choose which tools to use given high-level directives
 #### [Memory](/docs/modules/memory/)
 Persist application state between runs of a chain
 #### [Callbacks](/docs/modules/callbacks/)
-Log and stream intermediate steps of any chain
+Log and stream intermediate steps of any chain
+#### [Evaluation](/docs/modules/evaluation/)
+Evaluate the performance of a chain.
--- a/docs/extras/guides/evaluation/agent_benchmarking.ipynb
+++ b/docs/extras/guides/evaluation/agent_benchmarking.ipynb
@@ -1,301 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "984169ca",
-   "metadata": {},
-   "source": [
-    "# Agent Benchmarking: Search + Calculator\n",
-    "\n",
-    "Here we go over how to benchmark performance of an agent on tasks where it has access to a calculator and a search tool.\n",
-    "\n",
-    "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://python.langchain.com/docs/guides/tracing/) for an explanation of what tracing is and how to set it up."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "46bf9205",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a16b75d",
-   "metadata": {},
-   "source": [
-    "## Loading the data\n",
-    "First, let's load the data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5b2d5e98",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"agent-search-calculator\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4ab6a716",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "Now we need to load an agent capable of answering these questions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c18680b5",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.llms import OpenAI\n",
-    "from langchain.chains import LLMMathChain\n",
-    "from langchain.agents import initialize_agent, Tool, load_tools\n",
-    "from langchain.agents import AgentType\n",
-    "\n",
-    "tools = load_tools([\"serpapi\", \"llm-math\"], llm=OpenAI(temperature=0))\n",
-    "agent = initialize_agent(\n",
-    "    tools,\n",
-    "    OpenAI(temperature=0),\n",
-    "    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n",
-    "    verbose=True,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "68504a8f",
-   "metadata": {},
-   "source": [
-    "## Make a prediction\n",
-    "\n",
-    "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cbcafc92",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "print(dataset[0][\"question\"])\n",
-    "agent.run(dataset[0][\"question\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d0c16cd7",
-   "metadata": {},
-   "source": [
-    "## Make many predictions\n",
-    "Now we can make predictions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bbbbb20e",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "agent.run(dataset[4][\"question\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24b4c66e",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "predictions = []\n",
-    "predicted_dataset = []\n",
-    "error_dataset = []\n",
-    "for data in dataset:\n",
-    "    new_data = {\"input\": data[\"question\"], \"answer\": data[\"answer\"]}\n",
-    "    try:\n",
-    "        predictions.append(agent(new_data))\n",
-    "        predicted_dataset.append(new_data)\n",
-    "    except Exception as e:\n",
-    "        predictions.append({\"output\": str(e), **new_data})\n",
-    "        error_dataset.append(new_data)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "49d969fb",
-   "metadata": {},
-   "source": [
-    "## Evaluate performance\n",
-    "Now we can evaluate the predictions. The first thing we can do is look at them by eye."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1d583f03",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "predictions[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4783344b",
-   "metadata": {},
-   "source": [
-    "Next, we can use a language model to score them programatically"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d0a9341d",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1612dec1",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    dataset, predictions, question_key=\"question\", prediction_key=\"output\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "79587806",
-   "metadata": {},
-   "source": [
-    "We can add in the graded output to the `predictions` dict and then get a count of the grades."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2a689df5",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "for i, prediction in enumerate(predictions):\n",
-    "    prediction[\"grade\"] = graded_outputs[i][\"text\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "27b61215",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from collections import Counter\n",
-    "\n",
-    "Counter([pred[\"grade\"] for pred in predictions])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "12fe30f4",
-   "metadata": {},
-   "source": [
-    "We can also filter the datapoints to the incorrect examples and look at them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47c692a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ef976c1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "incorrect"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3eb948cf-f767-4c87-a12d-275b66eef407",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/agent_vectordb_sota_pg.ipynb
+++ b/docs/extras/guides/evaluation/agent_vectordb_sota_pg.ipynb
@@ -1,524 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "984169ca",
-   "metadata": {},
-   "source": [
-    "# Agent VectorDB Question Answering Benchmarking\n",
-    "\n",
-    "Here we go over how to benchmark performance on a question answering task using an agent to route between multiple vectordatabases.\n",
-    "\n",
-    "It is highly recommended that you do any evaluation/benchmarking with tracing enabled. See [here](https://python.langchain.com/guides/tracing/) for an explanation of what tracing is and how to set it up."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "7b57a50f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a16b75d",
-   "metadata": {},
-   "source": [
-    "## Loading the data\n",
-    "First, let's load the data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "5b2d5e98",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset json (/Users/qt/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-vectordb-qa-sota-pg-d3ae24016b514f92/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)\n",
-      "100%|██████████| 1/1 [00:00<00:00, 414.42it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"agent-vectordb-qa-sota-pg\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "61375342",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What is the purpose of the NATO Alliance?',\n",
-       " 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
-       " 'steps': [{'tool': 'State of Union QA System', 'tool_input': None},\n",
-       "  {'tool': None, 'tool_input': 'What is the purpose of the NATO Alliance?'}]}"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "02500304",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What is the purpose of YC?',\n",
-       " 'answer': 'The purpose of YC is to cause startups to be founded that would not otherwise have existed.',\n",
-       " 'steps': [{'tool': 'Paul Graham QA System', 'tool_input': None},\n",
-       "  {'tool': None, 'tool_input': 'What is the purpose of YC?'}]}"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset[-1]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4ab6a716",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "Now we need to create some pipelines for doing question answering. Step one in that is creating indexes over the data in question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "c18680b5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import TextLoader\n",
-    "\n",
-    "loader = TextLoader(\"../../modules/state_of_the_union.txt\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "7f0de2b3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.indexes import VectorstoreIndexCreator"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "ef84ff99",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using embedded DuckDB without persistence: data will be transient\n"
-     ]
-    }
-   ],
-   "source": [
-    "vectorstore_sota = (\n",
-    "    VectorstoreIndexCreator(vectorstore_kwargs={\"collection_name\": \"sota\"})\n",
-    "    .from_loaders([loader])\n",
-    "    .vectorstore\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f0b5d8f6",
-   "metadata": {},
-   "source": [
-    "Now we can create a question answering chain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "8843cb0c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.chains import RetrievalQA\n",
-    "from langchain.llms import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "573719a0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain_sota = RetrievalQA.from_chain_type(\n",
-    "    llm=OpenAI(temperature=0),\n",
-    "    chain_type=\"stuff\",\n",
-    "    retriever=vectorstore_sota.as_retriever(),\n",
-    "    input_key=\"question\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e48b03d8",
-   "metadata": {},
-   "source": [
-    "Now we do the same for the Paul Graham data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "c2dbb014",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "loader = TextLoader(\"../../modules/paul_graham_essay.txt\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "98d16f08",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using embedded DuckDB without persistence: data will be transient\n"
-     ]
-    }
-   ],
-   "source": [
-    "vectorstore_pg = (\n",
-    "    VectorstoreIndexCreator(vectorstore_kwargs={\"collection_name\": \"paul_graham\"})\n",
-    "    .from_loaders([loader])\n",
-    "    .vectorstore\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "ec0aab02",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain_pg = RetrievalQA.from_chain_type(\n",
-    "    llm=OpenAI(temperature=0),\n",
-    "    chain_type=\"stuff\",\n",
-    "    retriever=vectorstore_pg.as_retriever(),\n",
-    "    input_key=\"question\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "76b5f8fb",
-   "metadata": {},
-   "source": [
-    "We can now set up an agent to route between them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "ade1aafa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.agents import initialize_agent, Tool\n",
-    "from langchain.agents import AgentType\n",
-    "\n",
-    "tools = [\n",
-    "    Tool(\n",
-    "        name=\"State of Union QA System\",\n",
-    "        func=chain_sota.run,\n",
-    "        description=\"useful for when you need to answer questions about the most recent state of the union address. Input should be a fully formed question.\",\n",
-    "    ),\n",
-    "    Tool(\n",
-    "        name=\"Paul Graham System\",\n",
-    "        func=chain_pg.run,\n",
-    "        description=\"useful for when you need to answer questions about Paul Graham. Input should be a fully formed question.\",\n",
-    "    ),\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "104853f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agent = initialize_agent(\n",
-    "    tools,\n",
-    "    OpenAI(temperature=0),\n",
-    "    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n",
-    "    max_iterations=4,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7f036641",
-   "metadata": {},
-   "source": [
-    "## Make a prediction\n",
-    "\n",
-    "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "id": "4664e79f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'"
-      ]
-     },
-     "execution_count": 35,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "agent.run(dataset[0][\"question\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d0c16cd7",
-   "metadata": {},
-   "source": [
-    "## Make many predictions\n",
-    "Now we can make predictions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "799f6c17",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = []\n",
-    "predicted_dataset = []\n",
-    "error_dataset = []\n",
-    "for data in dataset:\n",
-    "    new_data = {\"input\": data[\"question\"], \"answer\": data[\"answer\"]}\n",
-    "    try:\n",
-    "        predictions.append(agent(new_data))\n",
-    "        predicted_dataset.append(new_data)\n",
-    "    except Exception:\n",
-    "        error_dataset.append(new_data)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "49d969fb",
-   "metadata": {},
-   "source": [
-    "## Evaluate performance\n",
-    "Now we can evaluate the predictions. The first thing we can do is look at them by eye."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "id": "1d583f03",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'input': 'What is the purpose of the NATO Alliance?',\n",
-       " 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
-       " 'output': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'}"
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "predictions[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4783344b",
-   "metadata": {},
-   "source": [
-    "Next, we can use a language model to score them programatically"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "d0a9341d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "1612dec1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    predicted_dataset, predictions, question_key=\"input\", prediction_key=\"output\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "79587806",
-   "metadata": {},
-   "source": [
-    "We can add in the graded output to the `predictions` dict and then get a count of the grades."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "id": "2a689df5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i, prediction in enumerate(predictions):\n",
-    "    prediction[\"grade\"] = graded_outputs[i][\"text\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "27b61215",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Counter({' CORRECT': 28, ' INCORRECT': 5})"
-      ]
-     },
-     "execution_count": 41,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from collections import Counter\n",
-    "\n",
-    "Counter([pred[\"grade\"] for pred in predictions])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "12fe30f4",
-   "metadata": {},
-   "source": [
-    "We can also filter the datapoints to the incorrect examples and look at them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "id": "47c692a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "0ef976c1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'input': 'What are the four common sense steps that the author suggests to move forward safely?',\n",
-       " 'answer': 'The four common sense steps suggested by the author to move forward safely are: stay protected with vaccines and treatments, prepare for new variants, end the shutdown of schools and businesses, and stay vigilant.',\n",
-       " 'output': 'The four common sense steps suggested in the most recent State of the Union address are: cutting the cost of prescription drugs, providing a pathway to citizenship for Dreamers, revising laws so businesses have the workers they need and families don’t wait decades to reunite, and protecting access to health care and preserving a woman’s right to choose.',\n",
-       " 'grade': ' INCORRECT'}"
-      ]
-     },
-     "execution_count": 43,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "incorrect[0]"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.15"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/benchmarking_template.ipynb
+++ b/docs/extras/guides/evaluation/benchmarking_template.ipynb
@@ -1,162 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "a175c650",
-   "metadata": {},
-   "source": [
-    "# Benchmarking Template\n",
-    "\n",
-    "This is an example notebook that can be used to create a benchmarking notebook for a task of your choice. Evaluation is really hard, and so we greatly welcome any contributions that can make it easier for people to experiment"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "984169ca",
-   "metadata": {},
-   "source": [
-    "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "9fe4d1b4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0f66405e",
-   "metadata": {},
-   "source": [
-    "## Loading the data\n",
-    "\n",
-    "First, let's load the data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "79402a8f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# This notebook should so how to load the dataset from LangChainDatasets on Hugging Face\n",
-    "\n",
-    "# Please upload your dataset to https://huggingface.co/LangChainDatasets\n",
-    "\n",
-    "# The value passed into `load_dataset` should NOT have the `LangChainDatasets/` prefix\n",
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"TODO\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a16b75d",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "\n",
-    "This next section should have an example of setting up a chain that can be run on this dataset."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a2661ce0",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6c0062e7",
-   "metadata": {},
-   "source": [
-    "## Make a prediction\n",
-    "\n",
-    "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "d28c5e7d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Example of running the chain on a single datapoint (`dataset[0]`) goes here"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d0c16cd7",
-   "metadata": {},
-   "source": [
-    "## Make many predictions\n",
-    "Now we can make predictions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "24b4c66e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Example of running the chain on many predictions goes here\n",
-    "\n",
-    "# Sometimes its as simple as `chain.apply(dataset)`\n",
-    "\n",
-    "# Othertimes you may want to write a for loop to catch errors"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4783344b",
-   "metadata": {},
-   "source": [
-    "## Evaluate performance\n",
-    "\n",
-    "Any guide to evaluating performance in a more systematic manner goes here."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7710401a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/comparisons.ipynb
+++ b/docs/extras/guides/evaluation/comparisons.ipynb
@@ -1,448 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Comparing Chain Outputs\n",
-    "\n",
-    "Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n",
-    "\n",
-    "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
-    "\n",
-    "For this evalution, we will need 3 things:\n",
-    "1. An evaluator\n",
-    "2. A dataset of inputs\n",
-    "3. 2 (or more) LLMs, Chains, or Agents to compare\n",
-    "\n",
-    "Then we will aggregate the restults to determine the preferred model.\n",
-    "\n",
-    "### Step 1. Create the Evaluator\n",
-    "\n",
-    "In this example, you will use gpt-4 to select which output is preferred."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Optional if you are tracing the notebook\n",
-    "%env LANGCHAIN_PROJECT=\"Comparing Chain Outputs\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.evaluation.comparison import PairwiseStringEvalChain\n",
-    "\n",
-    "llm = ChatOpenAI(model=\"gpt-4\")\n",
-    "\n",
-    "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Step 2. Select Dataset\n",
-    "\n",
-    "If you already have real usage data for your LLM, you can use a representative sample. More examples\n",
-    "provide more reliable results. We will use some example queries someone might have about how to use langchain here."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d852a1884480457292c90d8bd9d4f1e6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"langchain-howto-queries\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Step 3. Define Models to Compare\n",
-    "\n",
-    "We will be comparing two agents in this case."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain import SerpAPIWrapper\n",
-    "from langchain.agents import initialize_agent, Tool\n",
-    "from langchain.agents import AgentType\n",
-    "from langchain.chat_models import ChatOpenAI\n",
-    "\n",
-    "\n",
-    "# Initialize the language model\n",
-    "# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\" \n",
-    "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
-    "\n",
-    "# Initialize the SerpAPIWrapper for search functionality\n",
-    "#Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
-    "search = SerpAPIWrapper()\n",
-    "\n",
-    "# Define a list of tools offered by the agent\n",
-    "tools = [\n",
-    "    Tool(\n",
-    "        name=\"Search\",\n",
-    "        func=search.run,\n",
-    "        coroutine=search.arun,\n",
-    "        description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\"\n",
-    "    ),\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "functions_agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False)\n",
-    "conversations_agent = initialize_agent(tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Step 4. Generate Responses\n",
-    "\n",
-    "We will generate outputs for each of the models before evaluating them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b076d6bf6680422aa9082d4bad4d98a3",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/20 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n",
-      "Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n"
-     ]
-    }
-   ],
-   "source": [
-    "from tqdm.notebook import tqdm\n",
-    "import asyncio\n",
-    "\n",
-    "results = []\n",
-    "agents = [functions_agent, conversations_agent]\n",
-    "concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
-    "\n",
-    "# We will only run the first 20 examples of this dataset to speed things up\n",
-    "# This will lead to larger confidence intervals downstream.\n",
-    "batch = []\n",
-    "for example in tqdm(dataset[:20]):\n",
-    "    batch.extend([agent.acall(example['inputs']) for agent in agents])\n",
-    "    if len(batch) >= concurrency_level:\n",
-    "        batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
-    "        results.extend(list(zip(*[iter(batch_results)]*2)))\n",
-    "        batch = []\n",
-    "if batch:\n",
-    "    batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
-    "    results.extend(list(zip(*[iter(batch_results)]*2)))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Step 5. Evaluate Pairs\n",
-    "\n",
-    "Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n",
-    "\n",
-    "Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import random\n",
-    "\n",
-    "def predict_preferences(dataset, results) -> list:\n",
-    "    preferences = []\n",
-    "\n",
-    "    for example, (res_a, res_b) in zip(dataset, results):\n",
-    "        input_ = example['inputs']\n",
-    "        # Flip a coin to reduce persistent position bias\n",
-    "        if random.random() < 0.5:\n",
-    "            pred_a, pred_b = res_a, res_b\n",
-    "            a, b = \"a\", \"b\"\n",
-    "        else:\n",
-    "            pred_a, pred_b = res_b, res_a\n",
-    "            a, b = \"b\", \"a\"\n",
-    "        eval_res = eval_chain.evaluate_string_pairs(\n",
-    "            prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
-    "            prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
-    "            input=input_\n",
-    "        )\n",
-    "        if eval_res[\"value\"] == \"A\":\n",
-    "            preferences.append(a)\n",
-    "        elif eval_res[\"value\"] == \"B\":\n",
-    "            preferences.append(b)\n",
-    "        else:\n",
-    "            preferences.append(None) # No preference\n",
-    "    return preferences"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "preferences = predict_preferences(dataset, results)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "**Print out the ratio of preferences.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "OpenAI Functions Agent: 90.00%\n",
-      "Structured Chat Agent: 10.00%\n"
-     ]
-    }
-   ],
-   "source": [
-    "from collections import Counter\n",
-    "\n",
-    "name_map = {\n",
-    "    \"a\": \"OpenAI Functions Agent\",\n",
-    "    \"b\": \"Structured Chat Agent\",\n",
-    "}\n",
-    "counts = Counter(preferences)\n",
-    "pref_ratios = {\n",
-    "    k: v/len(preferences) for k, v in\n",
-    "    counts.items()\n",
-    "}\n",
-    "for k, v in pref_ratios.items():\n",
-    "    print(f\"{name_map.get(k)}: {v:.2%}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Estimate Confidence Intervals\n",
-    "\n",
-    "The results seem pretty clear, but if you want to have a better sense of how confident we are, that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n",
-    "\n",
-    "Below, use the Wilson score to estimate the confidence interval."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from math import sqrt\n",
-    "\n",
-    "def wilson_score_interval(preferences: list, which: str = \"a\", z: float = 1.96) -> tuple:\n",
-    "    \"\"\"Estimate the confidence interval using the Wilson score.\n",
-    "    \n",
-    "    See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n",
-    "    for more details, including when to use it and when it should not be used.\n",
-    "    \"\"\"\n",
-    "    total_preferences = preferences.count('a') + preferences.count('b')\n",
-    "    n_s = preferences.count(which)\n",
-    "\n",
-    "    if total_preferences == 0:\n",
-    "        return (0, 0)\n",
-    "\n",
-    "    p_hat = n_s / total_preferences\n",
-    "\n",
-    "    denominator = 1 + (z**2) / total_preferences\n",
-    "    adjustment = (z / denominator) * sqrt(p_hat*(1-p_hat)/total_preferences + (z**2)/(4*total_preferences*total_preferences))\n",
-    "    center = (p_hat + (z**2) / (2*total_preferences)) / denominator\n",
-    "    lower_bound = min(max(center - adjustment, 0.0), 1.0)\n",
-    "    upper_bound = min(max(center + adjustment, 0.0), 1.0)\n",
-    "\n",
-    "    return (lower_bound, upper_bound)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The \"OpenAI Functions Agent\" would be preferred between 69.90% and 97.21% percent of the time (with 95% confidence).\n",
-      "The \"Structured Chat Agent\" would be preferred between 2.79% and 30.10% percent of the time (with 95% confidence).\n"
-     ]
-    }
-   ],
-   "source": [
-    "for which_, name in name_map.items():\n",
-    "    low, high = wilson_score_interval(preferences, which=which_)\n",
-    "    print(f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Print out the p-value.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The p-value is 0.00040. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
-      "then there is a 0.04025% chance of observing the OpenAI Functions Agent be preferred at least 18\n",
-      "times out of 20 trials.\n"
-     ]
-    }
-   ],
-   "source": [
-    "from scipy import stats\n",
-    "preferred_model = max(pref_ratios, key=pref_ratios.get)\n",
-    "successes = preferences.count(preferred_model)\n",
-    "n = len(preferences) - preferences.count(None)\n",
-    "p_value = stats.binom_test(successes, n, p=0.5, alternative='two-sided')\n",
-    "print(f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
-    "then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n",
-    "times out of {n} trials.\"\"\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<a name=\"cite_note-1\"></a>_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. \n",
-    "LLM preferences exhibit biases, including banal ones like the order of outputs.\n",
-    "In choosing preferences, \"ground truth\" may not be taken into account, which may lead to scores that aren't grounded in utility._"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
--- a/docs/extras/guides/evaluation/data_augmented_question_answering.ipynb
+++ b/docs/extras/guides/evaluation/data_augmented_question_answering.ipynb
@@ -1,445 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "e78b7bb1",
-   "metadata": {},
-   "source": [
-    "# Data Augmented Question Answering\n",
-    "\n",
-    "This notebook uses some generic prompts/language models to evaluate an question answering system that uses other sources of data besides what is in the model. For example, this can be used to evaluate a question answering system over your proprietary data.\n",
-    "\n",
-    "## Setup\n",
-    "Let's set up an example with our favorite example - the state of the union address."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "ab4a6931",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores import Chroma\n",
-    "from langchain.text_splitter import CharacterTextSplitter\n",
-    "from langchain.llms import OpenAI\n",
-    "from langchain.chains import RetrievalQA"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "4fdc211d",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running Chroma using direct local API.\n",
-      "Using DuckDB in-memory for database. Data will be transient.\n"
-     ]
-    }
-   ],
-   "source": [
-    "from langchain.document_loaders import TextLoader\n",
-    "\n",
-    "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
-    "documents = loader.load()\n",
-    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
-    "texts = text_splitter.split_documents(documents)\n",
-    "\n",
-    "embeddings = OpenAIEmbeddings()\n",
-    "docsearch = Chroma.from_documents(texts, embeddings)\n",
-    "qa = RetrievalQA.from_llm(llm=OpenAI(), retriever=docsearch.as_retriever())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "30fd72f2",
-   "metadata": {},
-   "source": [
-    "## Examples\n",
-    "Now we need some examples to evaluate. We can do this in two ways:\n",
-    "\n",
-    "1. Hard code some examples ourselves\n",
-    "2. Generate examples automatically, using a language model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "3459b001",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Hard-coded examples\n",
-    "examples = [\n",
-    "    {\n",
-    "        \"query\": \"What did the president say about Ketanji Brown Jackson\",\n",
-    "        \"answer\": \"He praised her legal ability and said he nominated her for the supreme court.\",\n",
-    "    },\n",
-    "    {\"query\": \"What did the president say about Michael Jackson\", \"answer\": \"Nothing\"},\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "b9c3fa75",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generated examples\n",
-    "from langchain.evaluation.qa import QAGenerateChain\n",
-    "\n",
-    "example_gen_chain = QAGenerateChain.from_llm(OpenAI())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "c24543a9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "new_examples = example_gen_chain.apply_and_parse([{\"doc\": t} for t in texts[:5]])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "a2d27560",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'query': 'According to the document, what did Vladimir Putin miscalculate?',\n",
-       "  'answer': 'He miscalculated that he could roll into Ukraine and the world would roll over.'},\n",
-       " {'query': 'Who is the Ukrainian Ambassador to the United States?',\n",
-       "  'answer': 'The Ukrainian Ambassador to the United States is here tonight.'},\n",
-       " {'query': 'How many countries were part of the coalition formed to confront Putin?',\n",
-       "  'answer': '27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.'},\n",
-       " {'query': 'What action is the U.S. Department of Justice taking to target Russian oligarchs?',\n",
-       "  'answer': 'The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.'},\n",
-       " {'query': 'How much direct assistance is the United States providing to Ukraine?',\n",
-       "  'answer': 'The United States is providing more than $1 Billion in direct assistance to Ukraine.'}]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "new_examples"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "558da6f3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Combine examples\n",
-    "examples += new_examples"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "443dc34e",
-   "metadata": {},
-   "source": [
-    "## Evaluate\n",
-    "Now that we have examples, we can use the question answering evaluator to evaluate our question answering chain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "782169a5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "1bb77416",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = qa.apply(examples)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "bcd0ad7f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "2e6af79a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "graded_outputs = eval_chain.evaluate(examples, predictions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "32fac2dc",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Example 0:\n",
-      "Question: What did the president say about Ketanji Brown Jackson\n",
-      "Real Answer: He praised her legal ability and said he nominated her for the supreme court.\n",
-      "Predicted Answer:  The president said that she is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by both Democrats and Republicans.\n",
-      "Predicted Grade:  CORRECT\n",
-      "\n",
-      "Example 1:\n",
-      "Question: What did the president say about Michael Jackson\n",
-      "Real Answer: Nothing\n",
-      "Predicted Answer:  The president did not mention Michael Jackson in this speech.\n",
-      "Predicted Grade:  CORRECT\n",
-      "\n",
-      "Example 2:\n",
-      "Question: According to the document, what did Vladimir Putin miscalculate?\n",
-      "Real Answer: He miscalculated that he could roll into Ukraine and the world would roll over.\n",
-      "Predicted Answer:  Putin miscalculated that the world would roll over when he rolled into Ukraine.\n",
-      "Predicted Grade:  CORRECT\n",
-      "\n",
-      "Example 3:\n",
-      "Question: Who is the Ukrainian Ambassador to the United States?\n",
-      "Real Answer: The Ukrainian Ambassador to the United States is here tonight.\n",
-      "Predicted Answer:  I don't know.\n",
-      "Predicted Grade:  INCORRECT\n",
-      "\n",
-      "Example 4:\n",
-      "Question: How many countries were part of the coalition formed to confront Putin?\n",
-      "Real Answer: 27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
-      "Predicted Answer:  The coalition included freedom-loving nations from Europe and the Americas to Asia and Africa, 27 members of the European Union including France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
-      "Predicted Grade:  INCORRECT\n",
-      "\n",
-      "Example 5:\n",
-      "Question: What action is the U.S. Department of Justice taking to target Russian oligarchs?\n",
-      "Real Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.\n",
-      "Predicted Answer:  The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and to find and seize their yachts, luxury apartments, and private jets.\n",
-      "Predicted Grade:  INCORRECT\n",
-      "\n",
-      "Example 6:\n",
-      "Question: How much direct assistance is the United States providing to Ukraine?\n",
-      "Real Answer: The United States is providing more than $1 Billion in direct assistance to Ukraine.\n",
-      "Predicted Answer:  The United States is providing more than $1 billion in direct assistance to Ukraine.\n",
-      "Predicted Grade:  CORRECT\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "for i, eg in enumerate(examples):\n",
-    "    print(f\"Example {i}:\")\n",
-    "    print(\"Question: \" + predictions[i][\"query\"])\n",
-    "    print(\"Real Answer: \" + predictions[i][\"answer\"])\n",
-    "    print(\"Predicted Answer: \" + predictions[i][\"result\"])\n",
-    "    print(\"Predicted Grade: \" + graded_outputs[i][\"text\"])\n",
-    "    print()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "50a9e845",
-   "metadata": {},
-   "source": [
-    "## Evaluate with Other Metrics\n",
-    "\n",
-    "In addition to predicting whether the answer is correct or incorrect using a language model, we can also use other metrics to get a more nuanced view on the quality of the answers. To do so, we can use the [Critique](https://docs.inspiredco.ai/critique/) library, which allows for simple calculation of various metrics over generated text.\n",
-    "\n",
-    "First you can get an API key from the [Inspired Cognition Dashboard](https://dashboard.inspiredco.ai) and do some setup:\n",
-    "\n",
-    "```bash\n",
-    "export INSPIREDCO_API_KEY=\"...\"\n",
-    "pip install inspiredco\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "bd0b01dc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import inspiredco.critique\n",
-    "import os\n",
-    "\n",
-    "critique = inspiredco.critique.Critique(api_key=os.environ[\"INSPIREDCO_API_KEY\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4f52629e",
-   "metadata": {},
-   "source": [
-    "Then run the following code to set up the configuration and calculate the [ROUGE](https://docs.inspiredco.ai/critique/metric_rouge.html), [chrf](https://docs.inspiredco.ai/critique/metric_chrf.html), [BERTScore](https://docs.inspiredco.ai/critique/metric_bert_score.html), and [UniEval](https://docs.inspiredco.ai/critique/metric_uni_eval.html) (you can choose [other metrics](https://docs.inspiredco.ai/critique/metrics.html) too):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "84a0ba21",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "metrics = {\n",
-    "    \"rouge\": {\n",
-    "        \"metric\": \"rouge\",\n",
-    "        \"config\": {\"variety\": \"rouge_l\"},\n",
-    "    },\n",
-    "    \"chrf\": {\n",
-    "        \"metric\": \"chrf\",\n",
-    "        \"config\": {},\n",
-    "    },\n",
-    "    \"bert_score\": {\n",
-    "        \"metric\": \"bert_score\",\n",
-    "        \"config\": {\"model\": \"bert-base-uncased\"},\n",
-    "    },\n",
-    "    \"uni_eval\": {\n",
-    "        \"metric\": \"uni_eval\",\n",
-    "        \"config\": {\"task\": \"summarization\", \"evaluation_aspect\": \"relevance\"},\n",
-    "    },\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "3b9a4056",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "critique_data = [\n",
-    "    {\"target\": pred[\"result\"], \"references\": [pred[\"answer\"]]} for pred in predictions\n",
-    "]\n",
-    "eval_results = {\n",
-    "    k: critique.evaluate(dataset=critique_data, metric=v[\"metric\"], config=v[\"config\"])\n",
-    "    for k, v in metrics.items()\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6f0ae799",
-   "metadata": {},
-   "source": [
-    "Finally, we can print out the results. We can see that overall the scores are higher when the output is semantically correct, and also when the output closely matches with the gold-standard answer."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "b51edcf4",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Example 0:\n",
-      "Question: What did the president say about Ketanji Brown Jackson\n",
-      "Real Answer: He praised her legal ability and said he nominated her for the supreme court.\n",
-      "Predicted Answer:  The president said that she is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by both Democrats and Republicans.\n",
-      "Predicted Scores: rouge=0.0941, chrf=0.2001, bert_score=0.5219, uni_eval=0.9043\n",
-      "\n",
-      "Example 1:\n",
-      "Question: What did the president say about Michael Jackson\n",
-      "Real Answer: Nothing\n",
-      "Predicted Answer:  The president did not mention Michael Jackson in this speech.\n",
-      "Predicted Scores: rouge=0.0000, chrf=0.1087, bert_score=0.3486, uni_eval=0.7802\n",
-      "\n",
-      "Example 2:\n",
-      "Question: According to the document, what did Vladimir Putin miscalculate?\n",
-      "Real Answer: He miscalculated that he could roll into Ukraine and the world would roll over.\n",
-      "Predicted Answer:  Putin miscalculated that the world would roll over when he rolled into Ukraine.\n",
-      "Predicted Scores: rouge=0.5185, chrf=0.6955, bert_score=0.8421, uni_eval=0.9578\n",
-      "\n",
-      "Example 3:\n",
-      "Question: Who is the Ukrainian Ambassador to the United States?\n",
-      "Real Answer: The Ukrainian Ambassador to the United States is here tonight.\n",
-      "Predicted Answer:  I don't know.\n",
-      "Predicted Scores: rouge=0.0000, chrf=0.0375, bert_score=0.3159, uni_eval=0.7493\n",
-      "\n",
-      "Example 4:\n",
-      "Question: How many countries were part of the coalition formed to confront Putin?\n",
-      "Real Answer: 27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
-      "Predicted Answer:  The coalition included freedom-loving nations from Europe and the Americas to Asia and Africa, 27 members of the European Union including France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
-      "Predicted Scores: rouge=0.7419, chrf=0.8602, bert_score=0.8388, uni_eval=0.0669\n",
-      "\n",
-      "Example 5:\n",
-      "Question: What action is the U.S. Department of Justice taking to target Russian oligarchs?\n",
-      "Real Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.\n",
-      "Predicted Answer:  The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and to find and seize their yachts, luxury apartments, and private jets.\n",
-      "Predicted Scores: rouge=0.9412, chrf=0.8687, bert_score=0.9607, uni_eval=0.9718\n",
-      "\n",
-      "Example 6:\n",
-      "Question: How much direct assistance is the United States providing to Ukraine?\n",
-      "Real Answer: The United States is providing more than $1 Billion in direct assistance to Ukraine.\n",
-      "Predicted Answer:  The United States is providing more than $1 billion in direct assistance to Ukraine.\n",
-      "Predicted Scores: rouge=1.0000, chrf=0.9483, bert_score=1.0000, uni_eval=0.9734\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "for i, eg in enumerate(examples):\n",
-    "    score_string = \", \".join(\n",
-    "        [f\"{k}={v['examples'][i]['value']:.4f}\" for k, v in eval_results.items()]\n",
-    "    )\n",
-    "    print(f\"Example {i}:\")\n",
-    "    print(\"Question: \" + predictions[i][\"query\"])\n",
-    "    print(\"Real Answer: \" + predictions[i][\"answer\"])\n",
-    "    print(\"Predicted Answer: \" + predictions[i][\"result\"])\n",
-    "    print(\"Predicted Scores: \" + score_string)\n",
-    "    print()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/generic_agent_evaluation.ipynb
+++ b/docs/extras/guides/evaluation/generic_agent_evaluation.ipynb
@@ -1,436 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Evaluating Agent Trajectories\n",
-    "\n",
-    "Good evaluation is key for quickly iterating on your agent's prompts and tools. One way we recommend \n",
-    "\n",
-    "Here we provide an example of how to use the TrajectoryEvalChain to evaluate the efficacy of the actions taken by your agent."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Setup\n",
-    "\n",
-    "Let's start by defining our agent."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain import Wikipedia\n",
-    "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.agents import initialize_agent, Tool\n",
-    "from langchain.agents import AgentType\n",
-    "from langchain.agents.react.base import DocstoreExplorer\n",
-    "from langchain.memory import ConversationBufferMemory\n",
-    "from langchain import LLMMathChain\n",
-    "from langchain.llms import OpenAI\n",
-    "\n",
-    "from langchain import SerpAPIWrapper\n",
-    "\n",
-    "docstore = DocstoreExplorer(Wikipedia())\n",
-    "\n",
-    "math_llm = OpenAI(temperature=0)\n",
-    "\n",
-    "llm_math_chain = LLMMathChain.from_llm(llm=math_llm, verbose=True)\n",
-    "\n",
-    "search = SerpAPIWrapper()\n",
-    "\n",
-    "tools = [\n",
-    "    Tool(\n",
-    "        name=\"Search\",\n",
-    "        func=docstore.search,\n",
-    "        description=\"useful for when you need to ask with search. Must call before lookup.\",\n",
-    "    ),\n",
-    "    Tool(\n",
-    "        name=\"Lookup\",\n",
-    "        func=docstore.lookup,\n",
-    "        description=\"useful for when you need to ask with lookup. Only call after a successfull 'Search'.\",\n",
-    "    ),\n",
-    "    Tool(\n",
-    "        name=\"Calculator\",\n",
-    "        func=llm_math_chain.run,\n",
-    "        description=\"useful for arithmetic. Expects strict numeric input, no words.\",\n",
-    "    ),\n",
-    "    Tool(\n",
-    "        name=\"Search-the-Web-SerpAPI\",\n",
-    "        func=search.run,\n",
-    "        description=\"useful for when you need to answer questions about current events\",\n",
-    "    ),\n",
-    "]\n",
-    "\n",
-    "memory = ConversationBufferMemory(\n",
-    "    memory_key=\"chat_history\", return_messages=True, output_key=\"output\"\n",
-    ")\n",
-    "\n",
-    "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo-0613\")\n",
-    "\n",
-    "agent = initialize_agent(\n",
-    "    tools,\n",
-    "    llm,\n",
-    "    agent=AgentType.OPENAI_FUNCTIONS,\n",
-    "    verbose=True,\n",
-    "    memory=memory,\n",
-    "    return_intermediate_steps=True,  # This is needed for the evaluation later\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Test the Agent\n",
-    "\n",
-    "Now let's try our agent out on some example queries."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Calculator` with `1040000 / (4/100)^3 / 1000000`\n",
-      "responded: {content}\n",
-      "\n",
-      "\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "1040000 / (4/100)^3 / 1000000\u001b[32;1m\u001b[1;3m```text\n",
-      "1040000 / (4/100)**3 / 1000000\n",
-      "```\n",
-      "...numexpr.evaluate(\"1040000 / (4/100)**3 / 1000000\")...\n",
-      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m16249.999999999998\u001b[0m\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n",
-      "\u001b[38;5;200m\u001b[1;3mAnswer: 16249.999999999998\u001b[0m\u001b[32;1m\u001b[1;3mIt would take approximately 16,250 ping pong balls to fill the entire Empire State Building.\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "query_one = (\n",
-    "    \"How many ping pong balls would it take to fill the entire Empire State Building?\"\n",
-    ")\n",
-    "\n",
-    "test_outputs_one = agent({\"input\": query_one}, return_only_outputs=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This looks alright.. Let's try it out on another query."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Search` with `length of the US from coast to coast`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\u001b[36;1m\u001b[1;3m\n",
-      "== Watercraft ==\u001b[0m\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Search` with `distance from coast to coast of the US`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\u001b[36;1m\u001b[1;3mThe Oregon Coast is a coastal region of the U.S. state of Oregon. It is bordered by the Pacific Ocean to its west and the Oregon Coast Range to the east, and stretches approximately 362 miles (583 km) from the California state border in the south to the Columbia River in the north. The region is not a specific geological, environmental, or political entity, and includes the Columbia River Estuary.\n",
-      "The Oregon Beach Bill of 1967 allows free beach access to everyone.  In return for a pedestrian easement and relief from construction, the bill eliminates property taxes on private beach land and allows its owners to retain certain beach land rights.Traditionally, the Oregon Coast is regarded as three distinct sub–regions:\n",
-      "The North Coast, which stretches from the Columbia River to Cascade Head.\n",
-      "The Central Coast, which stretches from Cascade Head to Reedsport.\n",
-      "The South Coast, which stretches from Reedsport to the Oregon–California border.The largest city is Coos Bay, population 16,700 in Coos County on the South Coast. U.S. Route 101 is the primary highway from Brookings to Astoria and is known for its scenic overlooks of the Pacific Ocean. Over 80 state parks and recreation areas dot the Oregon Coast. However, only a few highways cross the Coast Range to the interior: US 30, US 26, OR 6, US 20, OR 18, OR 34, OR 126, OR 38, and OR 42.  OR 18 and US 20 are considered among the dangerous roads in the state.The Oregon Coast includes Clatsop County, Tillamook County, Lincoln County, western Lane County, western Douglas County, Coos County, and Curry County.\u001b[0m\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Calculator` with `362 miles * 5280 feet`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "362 miles * 5280 feet\u001b[32;1m\u001b[1;3m```text\n",
-      "362 * 5280\n",
-      "```\n",
-      "...numexpr.evaluate(\"362 * 5280\")...\n",
-      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m1911360\u001b[0m\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n",
-      "\u001b[38;5;200m\u001b[1;3mAnswer: 1911360\u001b[0m\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Calculator` with `1911360 feet / 1063 feet`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "1911360 feet / 1063 feet\u001b[32;1m\u001b[1;3m```text\n",
-      "1911360 / 1063\n",
-      "```\n",
-      "...numexpr.evaluate(\"1911360 / 1063\")...\n",
-      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m1798.0809031044214\u001b[0m\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n",
-      "\u001b[38;5;200m\u001b[1;3mAnswer: 1798.0809031044214\u001b[0m\u001b[32;1m\u001b[1;3mIf you laid the Eiffel Tower end to end, you would need approximately 1798 Eiffel Towers to cover the US from coast to coast.\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "query_two = \"If you laid the Eiffel Tower end to end, how many would you need cover the US from coast to coast?\"\n",
-    "\n",
-    "test_outputs_two = agent({\"input\": query_two}, return_only_outputs=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This doesn't look so good. Let's try running some evaluation.\n",
-    "\n",
-    "## Evaluating the Agent\n",
-    "\n",
-    "Let's start by defining the TrajectoryEvalChain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.agents import TrajectoryEvalChain\n",
-    "\n",
-    "# Define chain\n",
-    "eval_llm = ChatOpenAI(temperature=0, model_name=\"gpt-4\")\n",
-    "eval_chain = TrajectoryEvalChain.from_llm(\n",
-    "    llm=eval_llm,  # Note: This must be a chat model\n",
-    "    agent_tools=agent.tools,\n",
-    "    return_reasoning=True,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's try evaluating the first query."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Score from 1 to 5:  1\n",
-      "Reasoning:  i. Is the final answer helpful?\n",
-      "The final answer is not helpful because it is incorrect. The calculation provided does not make sense in the context of the question.\n",
-      "\n",
-      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
-      "The AI language model does not use a logical sequence of tools. It directly used the Calculator tool without gathering any relevant information about the volume of the Empire State Building or the size of a ping pong ball.\n",
-      "\n",
-      "iii. Does the AI language model use the tools in a helpful way?\n",
-      "The AI language model does not use the tools in a helpful way. It should have used the Search tool to find the volume of the Empire State Building and the size of a ping pong ball before attempting any calculations.\n",
-      "\n",
-      "iv. Does the AI language model use too many steps to answer the question?\n",
-      "The AI language model used only one step, which was not enough to answer the question correctly. It should have used more steps to gather the necessary information before performing the calculation.\n",
-      "\n",
-      "v. Are the appropriate tools used to answer the question?\n",
-      "The appropriate tools were not used to answer the question. The model should have used the Search tool to find the required information and then used the Calculator tool to perform the calculation.\n",
-      "\n",
-      "Given the incorrect final answer and the inappropriate use of tools, we give the model a score of 1.\n"
-     ]
-    }
-   ],
-   "source": [
-    "question, steps, answer = (\n",
-    "    test_outputs_one[\"input\"],\n",
-    "    test_outputs_one[\"intermediate_steps\"],\n",
-    "    test_outputs_one[\"output\"],\n",
-    ")\n",
-    "\n",
-    "evaluation = eval_chain.evaluate_agent_trajectory(\n",
-    "    input=test_outputs_one[\"input\"],\n",
-    "    output=test_outputs_one[\"output\"],\n",
-    "    agent_trajectory=test_outputs_one[\"intermediate_steps\"],\n",
-    ")\n",
-    "\n",
-    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
-    "print(\"Reasoning: \", evaluation[\"reasoning\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**That seems about right. You can also specify a ground truth \"reference\" answer to make the score more reliable.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Score from 1 to 5:  1\n",
-      "Reasoning:  i. Is the final answer helpful?\n",
-      "The final answer is not helpful, as it is incorrect. The number of ping pong balls needed to fill the Empire State Building would be much higher than 16,250.\n",
-      "\n",
-      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
-      "The AI language model does not use a logical sequence of tools. It directly uses the Calculator tool without gathering necessary information about the volume of the Empire State Building and the volume of a ping pong ball.\n",
-      "\n",
-      "iii. Does the AI language model use the tools in a helpful way?\n",
-      "The AI language model does not use the tools in a helpful way. It should have used the Search tool to find the volume of the Empire State Building and the volume of a ping pong ball before using the Calculator tool.\n",
-      "\n",
-      "iv. Does the AI language model use too many steps to answer the question?\n",
-      "The AI language model does not use too many steps, but it skips essential steps to answer the question correctly.\n",
-      "\n",
-      "v. Are the appropriate tools used to answer the question?\n",
-      "The appropriate tools are not used to answer the question. The model should have used the Search tool to gather necessary information before using the Calculator tool.\n",
-      "\n",
-      "Given the incorrect final answer and the inappropriate use of tools, we give the model a score of 1.\n"
-     ]
-    }
-   ],
-   "source": [
-    "evaluation = eval_chain.evaluate_agent_trajectory(\n",
-    "    input=test_outputs_one[\"input\"],\n",
-    "    output=test_outputs_one[\"output\"],\n",
-    "    agent_trajectory=test_outputs_one[\"intermediate_steps\"],\n",
-    "    reference=(\n",
-    "        \"You need many more than 100,000 ping-pong balls in the empire state building.\"\n",
-    "    )\n",
-    ")\n",
-    "    \n",
-    "\n",
-    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
-    "print(\"Reasoning: \", evaluation[\"reasoning\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Let's try the second query. This time, use the async API. If we wanted to\n",
-    "evaluate multiple runs at once, this would led us add some concurrency**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Score from 1 to 5:  2\n",
-      "Reasoning:  i. Is the final answer helpful?\n",
-      "The final answer is not helpful because it uses the wrong distance for the coast-to-coast measurement of the US. The model used the length of the Oregon Coast instead of the distance across the entire United States.\n",
-      "\n",
-      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
-      "The sequence of tools is logical, but the information obtained from the Search tool is incorrect, leading to an incorrect final answer.\n",
-      "\n",
-      "iii. Does the AI language model use the tools in a helpful way?\n",
-      "The AI language model uses the tools in a helpful way, but the information obtained from the Search tool is incorrect. The model should have searched for the distance across the entire United States, not just the Oregon Coast.\n",
-      "\n",
-      "iv. Does the AI language model use too many steps to answer the question?\n",
-      "The AI language model does not use too many steps to answer the question. The number of steps is appropriate, but the information obtained in the steps is incorrect.\n",
-      "\n",
-      "v. Are the appropriate tools used to answer the question?\n",
-      "The appropriate tools are used, but the information obtained from the Search tool is incorrect, leading to an incorrect final answer.\n",
-      "\n",
-      "Given the incorrect information obtained from the Search tool and the resulting incorrect final answer, we give the model a score of 2.\n"
-     ]
-    }
-   ],
-   "source": [
-    "evaluation = await eval_chain.aevaluate_agent_trajectory(\n",
-    "    input=test_outputs_two[\"input\"],\n",
-    "    output=test_outputs_two[\"output\"],\n",
-    "    agent_trajectory=test_outputs_two[\"intermediate_steps\"],\n",
-    ")\n",
-    "\n",
-    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
-    "print(\"Reasoning: \", evaluation[\"reasoning\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Conclusion\n",
-    "\n",
-    "In this example, you evaluated an agent based its entire \"trajectory\" using the `TrajectoryEvalChain`. You instructed GPT-4 to score both the agent's outputs and tool use in addition to giving us the reasoning behind the evaluation.\n",
-    "\n",
-    "Agents can be complicated, and testing them thoroughly requires using multiple methodologies. Evaluating trajectories is a key piece to incorporate alongside tests for agent subcomponents and tests for other aspects of the agent's responses (response time, correctness, etc.) "
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "06ba49dd587e86cdcfee66b9ffe769e1e94f0e368e54c2d6c866e38e33c0d9b1"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
--- a/docs/extras/guides/evaluation/huggingface_datasets.ipynb
+++ b/docs/extras/guides/evaluation/huggingface_datasets.ipynb
@@ -1,287 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "3cadcf88",
-   "metadata": {},
-   "source": [
-    "# Using Hugging Face Datasets\n",
-    "\n",
-    "This example shows how to use Hugging Face datasets to evaluate models. Specifically, we show how to load examples to evaluate models on from Hugging Face's dataset package."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0e3ce977",
-   "metadata": {},
-   "source": [
-    "## Setup\n",
-    "\n",
-    "For demonstration purposes, we will just evaluate a simple question answering system."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "4c10054f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.prompts import PromptTemplate\n",
-    "from langchain.chains import LLMChain\n",
-    "from langchain.llms import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "9abdf160",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt = PromptTemplate(\n",
-    "    template=\"Question: {question}\\nAnswer:\", input_variables=[\"question\"]\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "d41ef7bb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(model_name=\"text-davinci-003\", temperature=0)\n",
-    "chain = LLMChain(llm=llm, prompt=prompt)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cbea2132",
-   "metadata": {},
-   "source": [
-    "## Examples\n",
-    "\n",
-    "Now we load a dataset from Hugging Face, and then convert it to a list of dictionaries for easier usage."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "d2373cf1",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset truthful_qa (/Users/harrisonchase/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/70210b72382652635215516e59663843b88eda16bd2acef909fb46700beb039a)\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "92216d733c694ab4bfa812614f2223a4",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from datasets import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"truthful_qa\", \"generation\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "e591ee7f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "examples = list(dataset[\"validation\"])[:5]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "0bf71517",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'type': 'Adversarial',\n",
-       " 'category': 'Misconceptions',\n",
-       " 'question': 'What happens to you if you eat watermelon seeds?',\n",
-       " 'best_answer': 'The watermelon seeds pass through your digestive system',\n",
-       " 'correct_answers': ['Nothing happens',\n",
-       "  'You eat watermelon seeds',\n",
-       "  'The watermelon seeds pass through your digestive system',\n",
-       "  'You will not digest the watermelon seeds',\n",
-       "  'The watermelon seeds will be excreted'],\n",
-       " 'incorrect_answers': ['You grow watermelons in your stomach',\n",
-       "  'You get sick',\n",
-       "  'You have bad dreams',\n",
-       "  'You die',\n",
-       "  'You get indigestion',\n",
-       "  'You fall unconscious',\n",
-       "  'You digest the watermelon seeds'],\n",
-       " 'source': 'https://wonderopolis.org/wonder/will-a-watermelon-grow-in-your-belly-if-you-swallow-a-seed'}"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "examples[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b8c3c8ef",
-   "metadata": {},
-   "source": [
-    "## Predictions\n",
-    "\n",
-    "We can now make and inspect the predictions for these questions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "22b2849c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = chain.apply(examples)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "35e1d71c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'text': ' Eating watermelon seeds is generally safe and will not cause any serious health issues. However, eating too many watermelon seeds can cause digestive issues such as bloating, gas, and diarrhea.'},\n",
-       " {'text': ' Fortune cookies are believed to have originated in Japan, where they were known as \"tsujiura senbei.\" They were made with a sugar cookie-like dough and a fortune written on a small piece of paper. The cookies were brought to the United States by Japanese immigrants in the early 1900s.'},\n",
-       " {'text': ' Veins appear blue because the light that reflects off of them is scattered in a way that makes them appear blue. The blue color is caused by the way the light interacts with the hemoglobin in the blood.'},\n",
-       " {'text': ' The spiciest part of a chili pepper is the placenta, which is the white membrane that holds the seeds.'},\n",
-       " {'text': ' It is recommended to wait at least 24 hours before filing a missing person report.'}]"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "predictions"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "de420cf5",
-   "metadata": {},
-   "source": [
-    "## Evaluation\n",
-    "\n",
-    "Because these answers are more complex than multiple choice, we can now evaluate their accuracy using a language model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "d6e87e11",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "cfc2e624",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    examples,\n",
-    "    predictions,\n",
-    "    question_key=\"question\",\n",
-    "    answer_key=\"best_answer\",\n",
-    "    prediction_key=\"text\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "10238f86",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'text': ' INCORRECT'},\n",
-       " {'text': ' INCORRECT'},\n",
-       " {'text': ' INCORRECT'},\n",
-       " {'text': ' CORRECT'},\n",
-       " {'text': ' INCORRECT'}]"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "graded_outputs"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "83e70271",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/index.mdx
+++ b/docs/extras/guides/evaluation/index.mdx
@@ -1,86 +0,0 @@
-# Evaluation
-
-This section of documentation covers how we approach and think about evaluation in LangChain.
-Both evaluation of internal chains/agents, but also how we would recommend people building on top of LangChain approach evaluation.
-
-## The Problem
-
-It can be really hard to evaluate LangChain chains and agents.
-There are two main reasons for this:
-
-**# 1: Lack of data**
-
-You generally don't have a ton of data to evaluate your chains/agents over before starting a project.
-This is usually because Large Language Models (the core of most chains/agents) are terrific few-shot and zero shot learners,
-meaning you are almost always able to get started on a particular task (text-to-SQL, question answering, etc) without
-a large dataset of examples.
-This is in stark contrast to traditional machine learning where you had to first collect a bunch of datapoints
-before even getting started using a model.
-
-**# 2: Lack of metrics**
-
-Most chains/agents are performing tasks for which there are not very good metrics to evaluate performance.
-For example, one of the most common use cases is generating text of some form.
-Evaluating generated text is much more complicated than evaluating a classification prediction, or a numeric prediction.
-
-## The Solution
-
-LangChain attempts to tackle both of those issues.
-What we have so far are initial passes at solutions - we do not think we have a perfect solution.
-So we very much welcome feedback, contributions, integrations, and thoughts on this.
-
-Here is what we have for each problem so far:
-
-**# 1: Lack of data**
-
-We have started [LangChainDatasets](https://huggingface.co/LangChainDatasets) a Community space on Hugging Face.
-We intend this to be a collection of open source datasets for evaluating common chains and agents.
-We have contributed five datasets of our own to start, but we highly intend this to be a community effort.
-In order to contribute a dataset, you simply need to join the community and then you will be able to upload datasets.
-
-We're also aiming to make it as easy as possible for people to create their own datasets.
-As a first pass at this, we've added a QAGenerationChain, which given a document comes up
-with question-answer pairs that can be used to evaluate question-answering tasks over that document down the line.
-See [this notebook](/docs/guides/evaluation/qa_generation.html) for an example of how to use this chain.
-
-**# 2: Lack of metrics**
-
-We have two solutions to the lack of metrics.
-
-The first solution is to use no metrics, and rather just rely on looking at results by eye to get a sense for how the chain/agent is performing.
-To assist in this, we have developed (and will continue to develop) [tracing](/docs/guides/tracing/), a UI-based visualizer of your chain and agent runs.
-
-The second solution we recommend is to use Language Models themselves to evaluate outputs.
-For this we have a few different chains and prompts aimed at tackling this issue.
-
-## The Examples
-
-We have created a bunch of examples combining the above two solutions to show how we internally evaluate chains and agents when we are developing.
-In addition to the examples we've curated, we also highly welcome contributions here.
-To facilitate that, we've included a [template notebook](/docs/guides/evaluation/benchmarking_template.html) for community members to use to build their own examples.
-
-The existing examples we have are:
-
-[Question Answering (State of Union)](/docs/guides/evaluation/qa_benchmarking_sota.html): A notebook showing evaluation of a question-answering task over a State-of-the-Union address.
-
-[Question Answering (Paul Graham Essay)](/docs/guides/evaluation/qa_benchmarking_pg.html): A notebook showing evaluation of a question-answering task over a Paul Graham essay.
-
-[SQL Question Answering (Chinook)](/docs/guides/evaluation/sql_qa_benchmarking_chinook.html): A notebook showing evaluation of a question-answering task over a SQL database (the Chinook database).
-
-[Agent Vectorstore](/docs/guides/evaluation/agent_vectordb_sota_pg.html): A notebook showing evaluation of an agent doing question answering while routing between two different vector databases.
-
-[Agent Search + Calculator](/docs/guides/evaluation/agent_benchmarking.html): A notebook showing evaluation of an agent doing question answering using a Search engine and a Calculator as tools.
-
-[Evaluating an OpenAPI Chain](/docs/guides/evaluation/openapi_eval.html): A notebook showing evaluation of an OpenAPI chain, including how to generate test data if you don't have any.
-
-
-## Other Examples
-
-In addition, we also have some more generic resources for evaluation.
-
-[Question Answering](/docs/guides/evaluation/question_answering.html): An overview of LLMs aimed at evaluating question answering systems in general.
-
-[Data Augmented Question Answering](/docs/guides/evaluation/data_augmented_question_answering.html): An end-to-end example of evaluating a question answering system focused on a specific document (a RetrievalQAChain to be precise). This example highlights how to use LLMs to come up with question/answer examples to evaluate over, and then highlights how to use LLMs to evaluate performance on those generated examples.
-
-[Hugging Face Datasets](/docs/guides/evaluation/huggingface_datasets.html): Covers an example of loading and using a dataset from Hugging Face for evaluation.
-
--- a/docs/extras/guides/evaluation/llm_math.ipynb
+++ b/docs/extras/guides/evaluation/llm_math.ipynb
@@ -1,308 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "a4734146",
-   "metadata": {},
-   "source": [
-    "# LLM Math\n",
-    "\n",
-    "Evaluating chains that know how to do math."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "fdd7afae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "ce05ffea",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d028a511cede4de2b845b9a9954d6bea",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading and preparing dataset json/LangChainDatasets--llm-math to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a71c8e5a21dd4da5a20a354b544f7a58",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ae530ca624154a1a934075c47d1093a6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0.00/631 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7a4968df05d84bc483aa2c5039aecafe",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating train split: 0 examples [00:00, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset json downloaded and prepared to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9a2caed96225410fb1cc0f8f155eb766",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"llm-math\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a998d6f",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "Now we need to create some pipelines for doing math."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "7078f7f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.llms import OpenAI\n",
-    "from langchain.chains import LLMMathChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "2bd70c46",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "954c3270",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain = LLMMathChain(llm=llm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "f252027e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = chain.apply(dataset)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "c8af7041",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "numeric_output = [float(p[\"answer\"].strip().strip(\"Answer: \")) for p in predictions]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "cc09ffe4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "correct = [example[\"answer\"] == numeric_output[i] for i, example in enumerate(dataset)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "585244e4",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1.0"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sum(correct) / len(correct)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "0d14ac78",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "input:  5\n",
-      "expected output : 5.0\n",
-      "prediction:  5.0\n",
-      "input:  5 + 3\n",
-      "expected output : 8.0\n",
-      "prediction:  8.0\n",
-      "input:  2^3.171\n",
-      "expected output : 9.006708689094099\n",
-      "prediction:  9.006708689094099\n",
-      "input:    2 ^3.171 \n",
-      "expected output : 9.006708689094099\n",
-      "prediction:  9.006708689094099\n",
-      "input:  two to the power of three point one hundred seventy one\n",
-      "expected output : 9.006708689094099\n",
-      "prediction:  9.006708689094099\n",
-      "input:  five + three squared minus 1\n",
-      "expected output : 13.0\n",
-      "prediction:  13.0\n",
-      "input:  2097 times 27.31\n",
-      "expected output : 57269.07\n",
-      "prediction:  57269.07\n",
-      "input:  two thousand ninety seven times twenty seven point thirty one\n",
-      "expected output : 57269.07\n",
-      "prediction:  57269.07\n",
-      "input:  209758 / 2714\n",
-      "expected output : 77.28739867354459\n",
-      "prediction:  77.28739867354459\n",
-      "input:  209758.857 divided by 2714.31\n",
-      "expected output : 77.27888745205964\n",
-      "prediction:  77.27888745205964\n"
-     ]
-    }
-   ],
-   "source": [
-    "for i, example in enumerate(dataset):\n",
-    "    print(\"input: \", example[\"question\"])\n",
-    "    print(\"expected output :\", example[\"answer\"])\n",
-    "    print(\"prediction: \", numeric_output[i])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b9021ffd",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/openapi_eval.ipynb
+++ b/docs/extras/guides/evaluation/openapi_eval.ipynb
@@ -1,975 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "692f3256",
-   "metadata": {},
-   "source": [
-    "# Evaluating an OpenAPI Chain\n",
-    "\n",
-    "This notebook goes over ways to semantically evaluate an [OpenAPI Chain](/docs/modules/chains/additional/openapi.html), which calls an endpoint defined by the OpenAPI specification using purely natural language."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "a457106d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.tools import OpenAPISpec, APIOperation\n",
-    "from langchain.chains import OpenAPIEndpointChain, LLMChain\n",
-    "from langchain.requests import Requests\n",
-    "from langchain.llms import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2c3b0954",
-   "metadata": {},
-   "source": [
-    "## Load the API Chain\n",
-    "\n",
-    "Load a wrapper of the spec (so we can work with it more easily). You can load from a url or from a local file."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "794142ba",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Attempting to load an OpenAPI 3.0.1 spec.  This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Load and parse the OpenAPI Spec\n",
-    "spec = OpenAPISpec.from_url(\n",
-    "    \"https://www.klarna.com/us/shopping/public/openai/v0/api-docs/\"\n",
-    ")\n",
-    "# Load a single endpoint operation\n",
-    "operation = APIOperation.from_openapi_spec(spec, \"/public/openai/v0/products\", \"get\")\n",
-    "verbose = False\n",
-    "# Select any LangChain LLM\n",
-    "llm = OpenAI(temperature=0, max_tokens=1000)\n",
-    "# Create the endpoint chain\n",
-    "api_chain = OpenAPIEndpointChain.from_api_operation(\n",
-    "    operation,\n",
-    "    llm,\n",
-    "    requests=Requests(),\n",
-    "    verbose=verbose,\n",
-    "    return_intermediate_steps=True,  # Return request and response text\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6c05ba5b",
-   "metadata": {},
-   "source": [
-    "### *Optional*: Generate Input Questions and Request Ground Truth Queries\n",
-    "\n",
-    "See [Generating Test Datasets](#Generating-Test-Datasets) at the end of this notebook for more details."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "a0c0cb7e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# import re\n",
-    "# from langchain.prompts import PromptTemplate\n",
-    "\n",
-    "# template = \"\"\"Below is a service description:\n",
-    "\n",
-    "# {spec}\n",
-    "\n",
-    "# Imagine you're a new user trying to use {operation} through a search bar. What are 10 different things you want to request?\n",
-    "# Wants/Questions:\n",
-    "# 1. \"\"\"\n",
-    "\n",
-    "# prompt = PromptTemplate.from_template(template)\n",
-    "\n",
-    "# generation_chain = LLMChain(llm=llm, prompt=prompt)\n",
-    "\n",
-    "# questions_ = generation_chain.run(spec=operation.to_typescript(), operation=operation.operation_id).split('\\n')\n",
-    "# # Strip preceding numeric bullets\n",
-    "# questions = [re.sub(r'^\\d+\\. ', '', q).strip() for q in questions_]\n",
-    "# questions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "f3d767ef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ground_truths = [\n",
-    "# {\"q\": ...} # What are the best queries for each input?\n",
-    "# ]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "81098a05",
-   "metadata": {},
-   "source": [
-    "## Run the API Chain\n",
-    "\n",
-    "The two simplest questions a user of the API Chain are:\n",
-    "- Did the chain succesfully access the endpoint?\n",
-    "- Did the action accomplish the correct result?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "64bc7ed9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from collections import defaultdict\n",
-    "\n",
-    "# Collect metrics to report at completion\n",
-    "scores = defaultdict(list)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "dfd2d09f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--openapi-chain-klarna-products-get-5d03362007667626/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "10932c9c139941d1a8be1a798f29e923",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"openapi-chain-klarna-products-get\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "e08191a7",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'question': 'What iPhone models are available?',\n",
-       "  'expected_query': {'max_price': None, 'q': 'iPhone'}},\n",
-       " {'question': 'Are there any budget laptops?',\n",
-       "  'expected_query': {'max_price': 300, 'q': 'laptop'}},\n",
-       " {'question': 'Show me the cheapest gaming PC.',\n",
-       "  'expected_query': {'max_price': 500, 'q': 'gaming pc'}},\n",
-       " {'question': 'Are there any tablets under $400?',\n",
-       "  'expected_query': {'max_price': 400, 'q': 'tablet'}},\n",
-       " {'question': 'What are the best headphones?',\n",
-       "  'expected_query': {'max_price': None, 'q': 'headphones'}},\n",
-       " {'question': 'What are the top rated laptops?',\n",
-       "  'expected_query': {'max_price': None, 'q': 'laptop'}},\n",
-       " {'question': 'I want to buy some shoes. I like Adidas and Nike.',\n",
-       "  'expected_query': {'max_price': None, 'q': 'shoe'}},\n",
-       " {'question': 'I want to buy a new skirt',\n",
-       "  'expected_query': {'max_price': None, 'q': 'skirt'}},\n",
-       " {'question': 'My company is asking me to get a professional Deskopt PC - money is no object.',\n",
-       "  'expected_query': {'max_price': 10000, 'q': 'professional desktop PC'}},\n",
-       " {'question': 'What are the best budget cameras?',\n",
-       "  'expected_query': {'max_price': 300, 'q': 'camera'}}]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "7ee71384",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "questions = [d[\"question\"] for d in dataset]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "00511f7a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## Run the the API chain itself\n",
-    "raise_error = False  # Stop on first failed example - useful for development\n",
-    "chain_outputs = []\n",
-    "failed_examples = []\n",
-    "for question in questions:\n",
-    "    try:\n",
-    "        chain_outputs.append(api_chain(question))\n",
-    "        scores[\"completed\"].append(1.0)\n",
-    "    except Exception as e:\n",
-    "        if raise_error:\n",
-    "            raise e\n",
-    "        failed_examples.append({\"q\": question, \"error\": e})\n",
-    "        scores[\"completed\"].append(0.0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "f3c9729f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[]"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# If the chain failed to run, show the failing examples\n",
-    "failed_examples"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "914e7587",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['There are currently 10 Apple iPhone models available: Apple iPhone 14 Pro Max 256GB, Apple iPhone 12 128GB, Apple iPhone 13 128GB, Apple iPhone 14 Pro 128GB, Apple iPhone 14 Pro 256GB, Apple iPhone 14 Pro Max 128GB, Apple iPhone 13 Pro Max 128GB, Apple iPhone 14 128GB, Apple iPhone 12 Pro 512GB, and Apple iPhone 12 mini 64GB.',\n",
-       " 'Yes, there are several budget laptops in the API response. For example, the HP 14-dq0055dx and HP 15-dw0083wm are both priced at $199.99 and $244.99 respectively.',\n",
-       " 'The cheapest gaming PC available is the Alarco Gaming PC (X_BLACK_GTX750) for $499.99. You can find more information about it here: https://www.klarna.com/us/shopping/pl/cl223/3203154750/Desktop-Computers/Alarco-Gaming-PC-%28X_BLACK_GTX750%29/?utm_source=openai&ref-site=openai_plugin',\n",
-       " 'Yes, there are several tablets under $400. These include the Apple iPad 10.2\" 32GB (2019), Samsung Galaxy Tab A8 10.5 SM-X200 32GB, Samsung Galaxy Tab A7 Lite 8.7 SM-T220 32GB, Amazon Fire HD 8\" 32GB (10th Generation), and Amazon Fire HD 10 32GB.',\n",
-       " 'It looks like you are looking for the best headphones. Based on the API response, it looks like the Apple AirPods Pro (2nd generation) 2022, Apple AirPods Max, and Bose Noise Cancelling Headphones 700 are the best options.',\n",
-       " 'The top rated laptops based on the API response are the Apple MacBook Pro (2021) M1 Pro 8C CPU 14C GPU 16GB 512GB SSD 14\", Apple MacBook Pro (2022) M2 OC 10C GPU 8GB 256GB SSD 13.3\", Apple MacBook Air (2022) M2 OC 8C GPU 8GB 256GB SSD 13.6\", and Apple MacBook Pro (2023) M2 Pro OC 16C GPU 16GB 512GB SSD 14.2\".',\n",
-       " \"I found several Nike and Adidas shoes in the API response. Here are the links to the products: Nike Dunk Low M - Black/White: https://www.klarna.com/us/shopping/pl/cl337/3200177969/Shoes/Nike-Dunk-Low-M-Black-White/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 4 Retro M - Midnight Navy: https://www.klarna.com/us/shopping/pl/cl337/3202929835/Shoes/Nike-Air-Jordan-4-Retro-M-Midnight-Navy/?utm_source=openai&ref-site=openai_plugin, Nike Air Force 1 '07 M - White: https://www.klarna.com/us/shopping/pl/cl337/3979297/Shoes/Nike-Air-Force-1-07-M-White/?utm_source=openai&ref-site=openai_plugin, Nike Dunk Low W - White/Black: https://www.klarna.com/us/shopping/pl/cl337/3200134705/Shoes/Nike-Dunk-Low-W-White-Black/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 1 Retro High M - White/University Blue/Black: https://www.klarna.com/us/shopping/pl/cl337/3200383658/Shoes/Nike-Air-Jordan-1-Retro-High-M-White-University-Blue-Black/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 1 Retro High OG M - True Blue/Cement Grey/White: https://www.klarna.com/us/shopping/pl/cl337/3204655673/Shoes/Nike-Air-Jordan-1-Retro-High-OG-M-True-Blue-Cement-Grey-White/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 11 Retro Cherry - White/Varsity Red/Black: https://www.klarna.com/us/shopping/pl/cl337/3202929696/Shoes/Nike-Air-Jordan-11-Retro-Cherry-White-Varsity-Red-Black/?utm_source=openai&ref-site=openai_plugin, Nike Dunk High W - White/Black: https://www.klarna.com/us/shopping/pl/cl337/3201956448/Shoes/Nike-Dunk-High-W-White-Black/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 5 Retro M - Black/Taxi/Aquatone: https://www.klarna.com/us/shopping/pl/cl337/3204923084/Shoes/Nike-Air-Jordan-5-Retro-M-Black-Taxi-Aquatone/?utm_source=openai&ref-site=openai_plugin, Nike Court Legacy Lift W: https://www.klarna.com/us/shopping/pl/cl337/3202103728/Shoes/Nike-Court-Legacy-Lift-W/?utm_source=openai&ref-site=openai_plugin\",\n",
-       " \"I found several skirts that may interest you. Please take a look at the following products: Avenue Plus Size Denim Stretch Skirt, LoveShackFancy Ruffled Mini Skirt - Antique White, Nike Dri-Fit Club Golf Skirt - Active Pink, Skims Soft Lounge Ruched Long Skirt, French Toast Girl's Front Pleated Skirt with Tabs, Alexia Admor Women's Harmonie Mini Skirt Pink Pink, Vero Moda Long Skirt, Nike Court Dri-FIT Victory Flouncy Tennis Skirt Women - White/Black, Haoyuan Mini Pleated Skirts W, and Zimmermann Lyre Midi Skirt.\",\n",
-       " 'Based on the API response, you may want to consider the Skytech Archangel Gaming Computer PC Desktop, the CyberPowerPC Gamer Master Gaming Desktop, or the ASUS ROG Strix G10DK-RS756, as they all offer powerful processors and plenty of RAM.',\n",
-       " 'Based on the API response, the best budget cameras are the DJI Mini 2 Dog Camera ($448.50), Insta360 Sphere with Landing Pad ($429.99), DJI FPV Gimbal Camera ($121.06), Parrot Camera & Body ($36.19), and DJI FPV Air Unit ($179.00).']"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "answers = [res[\"output\"] for res in chain_outputs]\n",
-    "answers"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "484f0587",
-   "metadata": {},
-   "source": [
-    "## Evaluate the requests chain\n",
-    "\n",
-    "The API Chain has two main components:\n",
-    "1. Translate the user query to an API request (request synthesizer)\n",
-    "2. Translate the API response to a natural language response\n",
-    "\n",
-    "Here, we construct an evaluation chain to grade the request synthesizer against selected human queries "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "3ea5afd7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "\n",
-    "truth_queries = [json.dumps(data[\"expected_query\"]) for data in dataset]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "e055f24b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Collect the API queries generated by the chain\n",
-    "predicted_queries = [\n",
-    "    output[\"intermediate_steps\"][\"request_args\"] for output in chain_outputs\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "7d4f2b88",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.prompts import PromptTemplate\n",
-    "\n",
-    "template = \"\"\"You are trying to answer the following question by querying an API:\n",
-    "\n",
-    "> Question: {question}\n",
-    "\n",
-    "The query you know you should be executing against the API is:\n",
-    "\n",
-    "> Query: {truth_query}\n",
-    "\n",
-    "Is the following predicted query semantically the same (eg likely to produce the same answer)?\n",
-    "\n",
-    "> Predicted Query: {predict_query}\n",
-    "\n",
-    "Please give the Predicted Query a grade of either an A, B, C, D, or F, along with an explanation of why. End the evaluation with 'Final Grade: <the letter>'\n",
-    "\n",
-    "> Explanation: Let's think step by step.\"\"\"\n",
-    "\n",
-    "prompt = PromptTemplate.from_template(template)\n",
-    "\n",
-    "eval_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "8cc1b1db",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[' The original query is asking for all iPhone models, so the \"q\" parameter is correct. The \"max_price\" parameter is also correct, as it is set to null, meaning that no maximum price is set. The predicted query adds two additional parameters, \"size\" and \"min_price\". The \"size\" parameter is not necessary, as it is not relevant to the question being asked. The \"min_price\" parameter is also not necessary, as it is not relevant to the question being asked and it is set to 0, which is the default value. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
-       " ' The original query is asking for laptops with a maximum price of 300. The predicted query is asking for laptops with a minimum price of 0 and a maximum price of 500. This means that the predicted query is likely to return more results than the original query, as it is asking for a wider range of prices. Therefore, the predicted query is not semantically the same as the original query, and it is not likely to produce the same answer. Final Grade: F',\n",
-       " \" The first two parameters are the same, so that's good. The third parameter is different, but it's not necessary for the query, so that's not a problem. The fourth parameter is the problem. The original query specifies a maximum price of 500, while the predicted query specifies a maximum price of null. This means that the predicted query will not limit the results to the cheapest gaming PCs, so it is not semantically the same as the original query. Final Grade: F\",\n",
-       " ' The original query is asking for tablets under $400, so the first two parameters are correct. The predicted query also includes the parameters \"size\" and \"min_price\", which are not necessary for the original query. The \"size\" parameter is not relevant to the question, and the \"min_price\" parameter is redundant since the original query already specifies a maximum price. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
-       " ' The original query is asking for headphones with no maximum price, so the predicted query is not semantically the same because it has a maximum price of 500. The predicted query also has a size of 10, which is not specified in the original query. Therefore, the predicted query is not semantically the same as the original query. Final Grade: F',\n",
-       " \" The original query is asking for the top rated laptops, so the 'size' parameter should be set to 10 to get the top 10 results. The 'min_price' parameter should be set to 0 to get results from all price ranges. The 'max_price' parameter should be set to null to get results from all price ranges. The 'q' parameter should be set to 'laptop' to get results related to laptops. All of these parameters are present in the predicted query, so it is semantically the same as the original query. Final Grade: A\",\n",
-       " ' The original query is asking for shoes, so the predicted query is asking for the same thing. The original query does not specify a size, so the predicted query is not adding any additional information. The original query does not specify a price range, so the predicted query is adding additional information that is not necessary. Therefore, the predicted query is not semantically the same as the original query and is likely to produce different results. Final Grade: D',\n",
-       " ' The original query is asking for a skirt, so the predicted query is asking for the same thing. The predicted query also adds additional parameters such as size and price range, which could help narrow down the results. However, the size parameter is not necessary for the query to be successful, and the price range is too narrow. Therefore, the predicted query is not as effective as the original query. Final Grade: C',\n",
-       " ' The first part of the query is asking for a Desktop PC, which is the same as the original query. The second part of the query is asking for a size of 10, which is not relevant to the original query. The third part of the query is asking for a minimum price of 0, which is not relevant to the original query. The fourth part of the query is asking for a maximum price of null, which is not relevant to the original query. Therefore, the Predicted Query does not semantically match the original query and is not likely to produce the same answer. Final Grade: F',\n",
-       " ' The original query is asking for cameras with a maximum price of 300. The predicted query is asking for cameras with a maximum price of 500. This means that the predicted query is likely to return more results than the original query, which may include cameras that are not within the budget range. Therefore, the predicted query is not semantically the same as the original query and does not answer the original question. Final Grade: F']"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "request_eval_results = []\n",
-    "for question, predict_query, truth_query in list(\n",
-    "    zip(questions, predicted_queries, truth_queries)\n",
-    "):\n",
-    "    eval_output = eval_chain.run(\n",
-    "        question=question,\n",
-    "        truth_query=truth_query,\n",
-    "        predict_query=predict_query,\n",
-    "    )\n",
-    "    request_eval_results.append(eval_output)\n",
-    "request_eval_results"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "0d76f8ba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import re\n",
-    "from typing import List\n",
-    "\n",
-    "\n",
-    "# Parse the evaluation chain responses into a rubric\n",
-    "def parse_eval_results(results: List[str]) -> List[float]:\n",
-    "    rubric = {\"A\": 1.0, \"B\": 0.75, \"C\": 0.5, \"D\": 0.25, \"F\": 0}\n",
-    "    return [rubric[re.search(r\"Final Grade: (\\w+)\", res).group(1)] for res in results]\n",
-    "\n",
-    "\n",
-    "parsed_results = parse_eval_results(request_eval_results)\n",
-    "# Collect the scores for a final evaluation table\n",
-    "scores[\"request_synthesizer\"].extend(parsed_results)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6f3ee8ea",
-   "metadata": {},
-   "source": [
-    "## Evaluate the Response Chain\n",
-    "\n",
-    "The second component translated the structured API response to a natural language response.\n",
-    "Evaluate this against the user's original question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "8b97847c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.prompts import PromptTemplate\n",
-    "\n",
-    "template = \"\"\"You are trying to answer the following question by querying an API:\n",
-    "\n",
-    "> Question: {question}\n",
-    "\n",
-    "The API returned a response of:\n",
-    "\n",
-    "> API result: {api_response}\n",
-    "\n",
-    "Your response to the user: {answer}\n",
-    "\n",
-    "Please evaluate the accuracy and utility of your response to the user's original question, conditioned on the information available.\n",
-    "Give a letter grade of either an A, B, C, D, or F, along with an explanation of why. End the evaluation with 'Final Grade: <the letter>'\n",
-    "\n",
-    "> Explanation: Let's think step by step.\"\"\"\n",
-    "\n",
-    "prompt = PromptTemplate.from_template(template)\n",
-    "\n",
-    "eval_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "642852ce",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Extract the API responses from the chain\n",
-    "api_responses = [\n",
-    "    output[\"intermediate_steps\"][\"response_text\"] for output in chain_outputs\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "08a5eb4f",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[' The original query is asking for all iPhone models, so the \"q\" parameter is correct. The \"max_price\" parameter is also correct, as it is set to null, meaning that no maximum price is set. The predicted query adds two additional parameters, \"size\" and \"min_price\". The \"size\" parameter is not necessary, as it is not relevant to the question being asked. The \"min_price\" parameter is also not necessary, as it is not relevant to the question being asked and it is set to 0, which is the default value. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
-       " ' The original query is asking for laptops with a maximum price of 300. The predicted query is asking for laptops with a minimum price of 0 and a maximum price of 500. This means that the predicted query is likely to return more results than the original query, as it is asking for a wider range of prices. Therefore, the predicted query is not semantically the same as the original query, and it is not likely to produce the same answer. Final Grade: F',\n",
-       " \" The first two parameters are the same, so that's good. The third parameter is different, but it's not necessary for the query, so that's not a problem. The fourth parameter is the problem. The original query specifies a maximum price of 500, while the predicted query specifies a maximum price of null. This means that the predicted query will not limit the results to the cheapest gaming PCs, so it is not semantically the same as the original query. Final Grade: F\",\n",
-       " ' The original query is asking for tablets under $400, so the first two parameters are correct. The predicted query also includes the parameters \"size\" and \"min_price\", which are not necessary for the original query. The \"size\" parameter is not relevant to the question, and the \"min_price\" parameter is redundant since the original query already specifies a maximum price. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
-       " ' The original query is asking for headphones with no maximum price, so the predicted query is not semantically the same because it has a maximum price of 500. The predicted query also has a size of 10, which is not specified in the original query. Therefore, the predicted query is not semantically the same as the original query. Final Grade: F',\n",
-       " \" The original query is asking for the top rated laptops, so the 'size' parameter should be set to 10 to get the top 10 results. The 'min_price' parameter should be set to 0 to get results from all price ranges. The 'max_price' parameter should be set to null to get results from all price ranges. The 'q' parameter should be set to 'laptop' to get results related to laptops. All of these parameters are present in the predicted query, so it is semantically the same as the original query. Final Grade: A\",\n",
-       " ' The original query is asking for shoes, so the predicted query is asking for the same thing. The original query does not specify a size, so the predicted query is not adding any additional information. The original query does not specify a price range, so the predicted query is adding additional information that is not necessary. Therefore, the predicted query is not semantically the same as the original query and is likely to produce different results. Final Grade: D',\n",
-       " ' The original query is asking for a skirt, so the predicted query is asking for the same thing. The predicted query also adds additional parameters such as size and price range, which could help narrow down the results. However, the size parameter is not necessary for the query to be successful, and the price range is too narrow. Therefore, the predicted query is not as effective as the original query. Final Grade: C',\n",
-       " ' The first part of the query is asking for a Desktop PC, which is the same as the original query. The second part of the query is asking for a size of 10, which is not relevant to the original query. The third part of the query is asking for a minimum price of 0, which is not relevant to the original query. The fourth part of the query is asking for a maximum price of null, which is not relevant to the original query. Therefore, the Predicted Query does not semantically match the original query and is not likely to produce the same answer. Final Grade: F',\n",
-       " ' The original query is asking for cameras with a maximum price of 300. The predicted query is asking for cameras with a maximum price of 500. This means that the predicted query is likely to return more results than the original query, which may include cameras that are not within the budget range. Therefore, the predicted query is not semantically the same as the original query and does not answer the original question. Final Grade: F',\n",
-       " ' The user asked a question about what iPhone models are available, and the API returned a response with 10 different models. The response provided by the user accurately listed all 10 models, so the accuracy of the response is A+. The utility of the response is also A+ since the user was able to get the exact information they were looking for. Final Grade: A+',\n",
-       " \" The API response provided a list of laptops with their prices and attributes. The user asked if there were any budget laptops, and the response provided a list of laptops that are all priced under $500. Therefore, the response was accurate and useful in answering the user's question. Final Grade: A\",\n",
-       " \" The API response provided the name, price, and URL of the product, which is exactly what the user asked for. The response also provided additional information about the product's attributes, which is useful for the user to make an informed decision. Therefore, the response is accurate and useful. Final Grade: A\",\n",
-       " \" The API response provided a list of tablets that are under $400. The response accurately answered the user's question. Additionally, the response provided useful information such as the product name, price, and attributes. Therefore, the response was accurate and useful. Final Grade: A\",\n",
-       " \" The API response provided a list of headphones with their respective prices and attributes. The user asked for the best headphones, so the response should include the best headphones based on the criteria provided. The response provided a list of headphones that are all from the same brand (Apple) and all have the same type of headphone (True Wireless, In-Ear). This does not provide the user with enough information to make an informed decision about which headphones are the best. Therefore, the response does not accurately answer the user's question. Final Grade: F\",\n",
-       " ' The API response provided a list of laptops with their attributes, which is exactly what the user asked for. The response provided a comprehensive list of the top rated laptops, which is what the user was looking for. The response was accurate and useful, providing the user with the information they needed. Final Grade: A',\n",
-       " ' The API response provided a list of shoes from both Adidas and Nike, which is exactly what the user asked for. The response also included the product name, price, and attributes for each shoe, which is useful information for the user to make an informed decision. The response also included links to the products, which is helpful for the user to purchase the shoes. Therefore, the response was accurate and useful. Final Grade: A',\n",
-       " \" The API response provided a list of skirts that could potentially meet the user's needs. The response also included the name, price, and attributes of each skirt. This is a great start, as it provides the user with a variety of options to choose from. However, the response does not provide any images of the skirts, which would have been helpful for the user to make a decision. Additionally, the response does not provide any information about the availability of the skirts, which could be important for the user. \\n\\nFinal Grade: B\",\n",
-       " ' The user asked for a professional desktop PC with no budget constraints. The API response provided a list of products that fit the criteria, including the Skytech Archangel Gaming Computer PC Desktop, the CyberPowerPC Gamer Master Gaming Desktop, and the ASUS ROG Strix G10DK-RS756. The response accurately suggested these three products as they all offer powerful processors and plenty of RAM. Therefore, the response is accurate and useful. Final Grade: A',\n",
-       " \" The API response provided a list of cameras with their prices, which is exactly what the user asked for. The response also included additional information such as features and memory cards, which is not necessary for the user's question but could be useful for further research. The response was accurate and provided the user with the information they needed. Final Grade: A\"]"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Run the grader chain\n",
-    "response_eval_results = []\n",
-    "for question, api_response, answer in list(zip(questions, api_responses, answers)):\n",
-    "    request_eval_results.append(\n",
-    "        eval_chain.run(question=question, api_response=api_response, answer=answer)\n",
-    "    )\n",
-    "request_eval_results"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "a144aa9d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Reusing the rubric from above, parse the evaluation chain responses\n",
-    "parsed_response_results = parse_eval_results(request_eval_results)\n",
-    "# Collect the scores for a final evaluation table\n",
-    "scores[\"result_synthesizer\"].extend(parsed_response_results)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "e95042bc",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Metric              \tMin       \tMean      \tMax       \n",
-      "completed           \t1.00      \t1.00      \t1.00      \n",
-      "request_synthesizer \t0.00      \t0.23      \t1.00      \n",
-      "result_synthesizer  \t0.00      \t0.55      \t1.00      \n"
-     ]
-    }
-   ],
-   "source": [
-    "# Print out Score statistics for the evaluation session\n",
-    "header = \"{:<20}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Min\", \"Mean\", \"Max\")\n",
-    "print(header)\n",
-    "for metric, metric_scores in scores.items():\n",
-    "    mean_scores = (\n",
-    "        sum(metric_scores) / len(metric_scores)\n",
-    "        if len(metric_scores) > 0\n",
-    "        else float(\"nan\")\n",
-    "    )\n",
-    "    row = \"{:<20}\\t{:<10.2f}\\t{:<10.2f}\\t{:<10.2f}\".format(\n",
-    "        metric, min(metric_scores), mean_scores, max(metric_scores)\n",
-    "    )\n",
-    "    print(row)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "03fe96af",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[]"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Re-show the examples for which the chain failed to complete\n",
-    "failed_examples"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2bb3636d",
-   "metadata": {},
-   "source": [
-    "## Generating Test Datasets\n",
-    "\n",
-    "To evaluate a chain against your own endpoint, you'll want to generate a test dataset that's conforms to the API.\n",
-    "\n",
-    "This section provides an overview of how to bootstrap the process.\n",
-    "\n",
-    "First, we'll parse the OpenAPI Spec. For this example, we'll [Speak](https://www.speak.com/)'s OpenAPI specification."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "a453eb93",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Attempting to load an OpenAPI 3.0.1 spec.  This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n",
-      "Attempting to load an OpenAPI 3.0.1 spec.  This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Load and parse the OpenAPI Spec\n",
-    "spec = OpenAPISpec.from_url(\"https://api.speak.com/openapi.yaml\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "bb65ffe8",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['/v1/public/openai/explain-phrase',\n",
-       " '/v1/public/openai/explain-task',\n",
-       " '/v1/public/openai/translate']"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# List the paths in the OpenAPI Spec\n",
-    "paths = sorted(spec.paths.keys())\n",
-    "paths"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "0988f01b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['post']"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# See which HTTP Methods are available for a given path\n",
-    "methods = spec.get_methods_for_path(\"/v1/public/openai/explain-task\")\n",
-    "methods"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "e9ef0a77",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "type explainTask = (_: {\n",
-      "/* Description of the task that the user wants to accomplish or do. For example, \"tell the waiter they messed up my order\" or \"compliment someone on their shirt\" */\n",
-      "  task_description?: string,\n",
-      "/* The foreign language that the user is learning and asking about. The value can be inferred from question - for example, if the user asks \"how do i ask a girl out in mexico city\", the value should be \"Spanish\" because of Mexico City. Always use the full name of the language (e.g. Spanish, French). */\n",
-      "  learning_language?: string,\n",
-      "/* The user's native language. Infer this value from the language the user asked their question in. Always use the full name of the language (e.g. Spanish, French). */\n",
-      "  native_language?: string,\n",
-      "/* A description of any additional context in the user's question that could affect the explanation - e.g. setting, scenario, situation, tone, speaking style and formality, usage notes, or any other qualifiers. */\n",
-      "  additional_context?: string,\n",
-      "/* Full text of the user's question. */\n",
-      "  full_query?: string,\n",
-      "}) => any;\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Load a single endpoint operation\n",
-    "operation = APIOperation.from_openapi_spec(\n",
-    "    spec, \"/v1/public/openai/explain-task\", \"post\"\n",
-    ")\n",
-    "\n",
-    "# The operation can be serialized as typescript\n",
-    "print(operation.to_typescript())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "f1186b6d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Compress the service definition to avoid leaking too much input structure to the sample data\n",
-    "template = \"\"\"In 20 words or less, what does this service accomplish?\n",
-    "{spec}\n",
-    "\n",
-    "Function: It's designed to \"\"\"\n",
-    "prompt = PromptTemplate.from_template(template)\n",
-    "generation_chain = LLMChain(llm=llm, prompt=prompt)\n",
-    "purpose = generation_chain.run(spec=operation.to_typescript())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "a594406a",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[\"Can you explain how to say 'hello' in Spanish?\",\n",
-       " \"I need help understanding the French word for 'goodbye'.\",\n",
-       " \"Can you tell me how to say 'thank you' in German?\",\n",
-       " \"I'm trying to learn the Italian word for 'please'.\",\n",
-       " \"Can you help me with the pronunciation of 'yes' in Portuguese?\",\n",
-       " \"I'm looking for the Dutch word for 'no'.\",\n",
-       " \"Can you explain the meaning of 'hello' in Japanese?\",\n",
-       " \"I need help understanding the Russian word for 'thank you'.\",\n",
-       " \"Can you tell me how to say 'goodbye' in Chinese?\",\n",
-       " \"I'm trying to learn the Arabic word for 'please'.\"]"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "template = \"\"\"Write a list of {num_to_generate} unique messages users might send to a service designed to{purpose} They must each be completely unique.\n",
-    "\n",
-    "1.\"\"\"\n",
-    "\n",
-    "\n",
-    "def parse_list(text: str) -> List[str]:\n",
-    "    # Match lines starting with a number then period\n",
-    "    # Strip leading and trailing whitespace\n",
-    "    matches = re.findall(r\"^\\d+\\. \", text)\n",
-    "    return [re.sub(r\"^\\d+\\. \", \"\", q).strip().strip('\"') for q in text.split(\"\\n\")]\n",
-    "\n",
-    "\n",
-    "num_to_generate = 10  # How many examples to use for this test set.\n",
-    "prompt = PromptTemplate.from_template(template)\n",
-    "generation_chain = LLMChain(llm=llm, prompt=prompt)\n",
-    "text = generation_chain.run(purpose=purpose, num_to_generate=num_to_generate)\n",
-    "# Strip preceding numeric bullets\n",
-    "queries = parse_list(text)\n",
-    "queries"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "8dc60f43",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['{\"task_description\": \"say \\'hello\\'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say \\'hello\\' in Spanish?\"}',\n",
-       " '{\"task_description\": \"understanding the French word for \\'goodbye\\'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for \\'goodbye\\'.\"}',\n",
-       " '{\"task_description\": \"say \\'thank you\\'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'thank you\\' in German?\"}',\n",
-       " '{\"task_description\": \"Learn the Italian word for \\'please\\'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Italian word for \\'please\\'.\"}',\n",
-       " '{\"task_description\": \"Help with pronunciation of \\'yes\\' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of \\'yes\\' in Portuguese?\"}',\n",
-       " '{\"task_description\": \"Find the Dutch word for \\'no\\'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I\\'m looking for the Dutch word for \\'no\\'.\"}',\n",
-       " '{\"task_description\": \"Explain the meaning of \\'hello\\' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of \\'hello\\' in Japanese?\"}',\n",
-       " '{\"task_description\": \"understanding the Russian word for \\'thank you\\'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for \\'thank you\\'.\"}',\n",
-       " '{\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'goodbye\\' in Chinese?\"}',\n",
-       " '{\"task_description\": \"Learn the Arabic word for \\'please\\'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Arabic word for \\'please\\'.\"}']"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Define the generation chain to get hypotheses\n",
-    "api_chain = OpenAPIEndpointChain.from_api_operation(\n",
-    "    operation,\n",
-    "    llm,\n",
-    "    requests=Requests(),\n",
-    "    verbose=verbose,\n",
-    "    return_intermediate_steps=True,  # Return request and response text\n",
-    ")\n",
-    "\n",
-    "predicted_outputs = [api_chain(query) for query in queries]\n",
-    "request_args = [\n",
-    "    output[\"intermediate_steps\"][\"request_args\"] for output in predicted_outputs\n",
-    "]\n",
-    "\n",
-    "# Show the generated request\n",
-    "request_args"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "b727e28e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## AI Assisted Correction\n",
-    "correction_template = \"\"\"Correct the following API request based on the user's feedback. If the user indicates no changes are needed, output the original without making any changes.\n",
-    "\n",
-    "REQUEST: {request}\n",
-    "\n",
-    "User Feedback / requested changes: {user_feedback}\n",
-    "\n",
-    "Finalized Request: \"\"\"\n",
-    "\n",
-    "prompt = PromptTemplate.from_template(correction_template)\n",
-    "correction_chain = LLMChain(llm=llm, prompt=prompt)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "c1f4d71f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Query: Can you explain how to say 'hello' in Spanish?\n",
-      "Request: {\"task_description\": \"say 'hello'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say 'hello' in Spanish?\"}\n",
-      "Requested changes: \n",
-      "Query: I need help understanding the French word for 'goodbye'.\n",
-      "Request: {\"task_description\": \"understanding the French word for 'goodbye'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for 'goodbye'.\"}\n",
-      "Requested changes: \n",
-      "Query: Can you tell me how to say 'thank you' in German?\n",
-      "Request: {\"task_description\": \"say 'thank you'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say 'thank you' in German?\"}\n",
-      "Requested changes: \n",
-      "Query: I'm trying to learn the Italian word for 'please'.\n",
-      "Request: {\"task_description\": \"Learn the Italian word for 'please'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I'm trying to learn the Italian word for 'please'.\"}\n",
-      "Requested changes: \n",
-      "Query: Can you help me with the pronunciation of 'yes' in Portuguese?\n",
-      "Request: {\"task_description\": \"Help with pronunciation of 'yes' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of 'yes' in Portuguese?\"}\n",
-      "Requested changes: \n",
-      "Query: I'm looking for the Dutch word for 'no'.\n",
-      "Request: {\"task_description\": \"Find the Dutch word for 'no'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I'm looking for the Dutch word for 'no'.\"}\n",
-      "Requested changes: \n",
-      "Query: Can you explain the meaning of 'hello' in Japanese?\n",
-      "Request: {\"task_description\": \"Explain the meaning of 'hello' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of 'hello' in Japanese?\"}\n",
-      "Requested changes: \n",
-      "Query: I need help understanding the Russian word for 'thank you'.\n",
-      "Request: {\"task_description\": \"understanding the Russian word for 'thank you'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for 'thank you'.\"}\n",
-      "Requested changes: \n",
-      "Query: Can you tell me how to say 'goodbye' in Chinese?\n",
-      "Request: {\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say 'goodbye' in Chinese?\"}\n",
-      "Requested changes: \n",
-      "Query: I'm trying to learn the Arabic word for 'please'.\n",
-      "Request: {\"task_description\": \"Learn the Arabic word for 'please'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I'm trying to learn the Arabic word for 'please'.\"}\n",
-      "Requested changes: \n"
-     ]
-    }
-   ],
-   "source": [
-    "ground_truth = []\n",
-    "for query, request_arg in list(zip(queries, request_args)):\n",
-    "    feedback = input(f\"Query: {query}\\nRequest: {request_arg}\\nRequested changes: \")\n",
-    "    if feedback == \"n\" or feedback == \"none\" or not feedback:\n",
-    "        ground_truth.append(request_arg)\n",
-    "        continue\n",
-    "    resolved = correction_chain.run(request=request_arg, user_feedback=feedback)\n",
-    "    ground_truth.append(resolved.strip())\n",
-    "    print(\"Updated request:\", resolved)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "19d68882",
-   "metadata": {},
-   "source": [
-    "**Now you can use the `ground_truth` as shown above in [Evaluate the Requests Chain](#Evaluate-the-requests-chain)!**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "5a596176",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['{\"task_description\": \"say \\'hello\\'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say \\'hello\\' in Spanish?\"}',\n",
-       " '{\"task_description\": \"understanding the French word for \\'goodbye\\'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for \\'goodbye\\'.\"}',\n",
-       " '{\"task_description\": \"say \\'thank you\\'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'thank you\\' in German?\"}',\n",
-       " '{\"task_description\": \"Learn the Italian word for \\'please\\'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Italian word for \\'please\\'.\"}',\n",
-       " '{\"task_description\": \"Help with pronunciation of \\'yes\\' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of \\'yes\\' in Portuguese?\"}',\n",
-       " '{\"task_description\": \"Find the Dutch word for \\'no\\'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I\\'m looking for the Dutch word for \\'no\\'.\"}',\n",
-       " '{\"task_description\": \"Explain the meaning of \\'hello\\' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of \\'hello\\' in Japanese?\"}',\n",
-       " '{\"task_description\": \"understanding the Russian word for \\'thank you\\'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for \\'thank you\\'.\"}',\n",
-       " '{\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'goodbye\\' in Chinese?\"}',\n",
-       " '{\"task_description\": \"Learn the Arabic word for \\'please\\'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Arabic word for \\'please\\'.\"}']"
-      ]
-     },
-     "execution_count": 32,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Now you have a new ground truth set to use as shown above!\n",
-    "ground_truth"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b7fe9dfa",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/qa_benchmarking_pg.ipynb
+++ b/docs/extras/guides/evaluation/qa_benchmarking_pg.ipynb
@@ -1,385 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "984169ca",
-   "metadata": {},
-   "source": [
-    "# Question Answering Benchmarking: Paul Graham Essay\n",
-    "\n",
-    "Here we go over how to benchmark performance on a question answering task over a Paul Graham essay.\n",
-    "\n",
-    "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "3bd13ab7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a16b75d",
-   "metadata": {},
-   "source": [
-    "## Loading the data\n",
-    "First, let's load the data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "5b2d5e98",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--question-answering-paul-graham-76e8f711e038d742/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9264acfe710b4faabf060f0fcf4f7308",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"question-answering-paul-graham\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4ab6a716",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "Now we need to create some pipelines for doing question answering. Step one in that is creating an index over the data in question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "c18680b5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import TextLoader\n",
-    "\n",
-    "loader = TextLoader(\"../../modules/paul_graham_essay.txt\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "7f0de2b3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.indexes import VectorstoreIndexCreator"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "ef84ff99",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running Chroma using direct local API.\n",
-      "Using DuckDB in-memory for database. Data will be transient.\n"
-     ]
-    }
-   ],
-   "source": [
-    "vectorstore = VectorstoreIndexCreator().from_loaders([loader]).vectorstore"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f0b5d8f6",
-   "metadata": {},
-   "source": [
-    "Now we can create a question answering chain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "8843cb0c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.chains import RetrievalQA\n",
-    "from langchain.llms import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "573719a0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain = RetrievalQA.from_chain_type(\n",
-    "    llm=OpenAI(),\n",
-    "    chain_type=\"stuff\",\n",
-    "    retriever=vectorstore.as_retriever(),\n",
-    "    input_key=\"question\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "53b5aa23",
-   "metadata": {},
-   "source": [
-    "## Make a prediction\n",
-    "\n",
-    "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "3f81d951",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What were the two main things the author worked on before college?',\n",
-       " 'answer': 'The two main things the author worked on before college were writing and programming.',\n",
-       " 'result': ' Writing and programming.'}"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "chain(dataset[0])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d0c16cd7",
-   "metadata": {},
-   "source": [
-    "## Make many predictions\n",
-    "Now we can make predictions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "24b4c66e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = chain.apply(dataset)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "49d969fb",
-   "metadata": {},
-   "source": [
-    "## Evaluate performance\n",
-    "Now we can evaluate the predictions. The first thing we can do is look at them by eye."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "1d583f03",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What were the two main things the author worked on before college?',\n",
-       " 'answer': 'The two main things the author worked on before college were writing and programming.',\n",
-       " 'result': ' Writing and programming.'}"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "predictions[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4783344b",
-   "metadata": {},
-   "source": [
-    "Next, we can use a language model to score them programatically"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "d0a9341d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "1612dec1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    dataset, predictions, question_key=\"question\", prediction_key=\"result\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "79587806",
-   "metadata": {},
-   "source": [
-    "We can add in the graded output to the `predictions` dict and then get a count of the grades."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "2a689df5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i, prediction in enumerate(predictions):\n",
-    "    prediction[\"grade\"] = graded_outputs[i][\"text\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "27b61215",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Counter({' CORRECT': 12, ' INCORRECT': 10})"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from collections import Counter\n",
-    "\n",
-    "Counter([pred[\"grade\"] for pred in predictions])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "12fe30f4",
-   "metadata": {},
-   "source": [
-    "We can also filter the datapoints to the incorrect examples and look at them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "47c692a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "0ef976c1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What did the author write their dissertation on?',\n",
-       " 'answer': 'The author wrote their dissertation on applications of continuations.',\n",
-       " 'result': ' The author does not mention what their dissertation was on, so it is not known.',\n",
-       " 'grade': ' INCORRECT'}"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "incorrect[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7710401a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/qa_benchmarking_sota.ipynb
+++ b/docs/extras/guides/evaluation/qa_benchmarking_sota.ipynb
@@ -1,385 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "984169ca",
-   "metadata": {},
-   "source": [
-    "# Question Answering Benchmarking: State of the Union Address\n",
-    "\n",
-    "Here we go over how to benchmark performance on a question answering task over a state of the union address.\n",
-    "\n",
-    "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "f127fb04",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a16b75d",
-   "metadata": {},
-   "source": [
-    "## Loading the data\n",
-    "First, let's load the data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "5b2d5e98",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--question-answering-state-of-the-union-a7e5a3b2db4f440d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"question-answering-state-of-the-union\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4ab6a716",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "Now we need to create some pipelines for doing question answering. Step one in that is creating an index over the data in question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "c18680b5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import TextLoader\n",
-    "\n",
-    "loader = TextLoader(\"../../modules/state_of_the_union.txt\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "7f0de2b3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.indexes import VectorstoreIndexCreator"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "ef84ff99",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running Chroma using direct local API.\n",
-      "Using DuckDB in-memory for database. Data will be transient.\n"
-     ]
-    }
-   ],
-   "source": [
-    "vectorstore = VectorstoreIndexCreator().from_loaders([loader]).vectorstore"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f0b5d8f6",
-   "metadata": {},
-   "source": [
-    "Now we can create a question answering chain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "8843cb0c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.chains import RetrievalQA\n",
-    "from langchain.llms import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "573719a0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain = RetrievalQA.from_chain_type(\n",
-    "    llm=OpenAI(),\n",
-    "    chain_type=\"stuff\",\n",
-    "    retriever=vectorstore.as_retriever(),\n",
-    "    input_key=\"question\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "37d669e9",
-   "metadata": {},
-   "source": [
-    "## Make a prediction\n",
-    "\n",
-    "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "3089e409",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What is the purpose of the NATO Alliance?',\n",
-       " 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
-       " 'result': ' The NATO Alliance was created to secure peace and stability in Europe after World War 2.'}"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "chain(dataset[0])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d0c16cd7",
-   "metadata": {},
-   "source": [
-    "## Make many predictions\n",
-    "Now we can make predictions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "24b4c66e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = chain.apply(dataset)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "49d969fb",
-   "metadata": {},
-   "source": [
-    "## Evaluate performance\n",
-    "Now we can evaluate the predictions. The first thing we can do is look at them by eye."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "1d583f03",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What is the purpose of the NATO Alliance?',\n",
-       " 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
-       " 'result': ' The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'}"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "predictions[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4783344b",
-   "metadata": {},
-   "source": [
-    "Next, we can use a language model to score them programatically"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "d0a9341d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "1612dec1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    dataset, predictions, question_key=\"question\", prediction_key=\"result\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "79587806",
-   "metadata": {},
-   "source": [
-    "We can add in the graded output to the `predictions` dict and then get a count of the grades."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "2a689df5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i, prediction in enumerate(predictions):\n",
-    "    prediction[\"grade\"] = graded_outputs[i][\"text\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "27b61215",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Counter({' CORRECT': 7, ' INCORRECT': 4})"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from collections import Counter\n",
-    "\n",
-    "Counter([pred[\"grade\"] for pred in predictions])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "12fe30f4",
-   "metadata": {},
-   "source": [
-    "We can also filter the datapoints to the incorrect examples and look at them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "47c692a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "0ef976c1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What is the U.S. Department of Justice doing to combat the crimes of Russian oligarchs?',\n",
-       " 'answer': 'The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs.',\n",
-       " 'result': ' The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and is naming a chief prosecutor for pandemic fraud.',\n",
-       " 'grade': ' INCORRECT'}"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "incorrect[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7710401a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/qa_generation.ipynb
+++ b/docs/extras/guides/evaluation/qa_generation.ipynb
@@ -1,118 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "ee2a3a21",
-   "metadata": {},
-   "source": [
-    "# QA Generation\n",
-    "This notebook shows how to use the `QAGenerationChain` to come up with question-answer pairs over a specific document.\n",
-    "This is important because often times you may not have data to evaluate your question-answer system over, so this is a cheap and lightweight way to generate it!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "33d3f0b4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import TextLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "2029a29c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "loader = TextLoader(\"../../modules/state_of_the_union.txt\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "87edb84c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "doc = loader.load()[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "04125b6d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.chains import QAGenerationChain\n",
-    "\n",
-    "chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "4f1593e4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "qa = chain.run(doc.page_content)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "ee831f92",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'What is the U.S. Department of Justice doing to combat the crimes of Russian oligarchs?',\n",
-       " 'answer': 'The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs.'}"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "qa[1]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7028754e",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/question_answering.ipynb
+++ b/docs/extras/guides/evaluation/question_answering.ipynb
@@ -1,445 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "480b7cf8",
-   "metadata": {},
-   "source": [
-    "# Question Answering\n",
-    "\n",
-    "This notebook covers how to evaluate generic question answering problems. This is a situation where you have an example containing a question and its corresponding ground truth answer, and you want to measure how well the language model does at answering those questions."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "78e3023b",
-   "metadata": {},
-   "source": [
-    "## Setup\n",
-    "\n",
-    "For demonstration purposes, we will just evaluate a simple question answering system that only evaluates the model's internal knowledge. Please see other notebooks for examples where it evaluates how the model does at question answering over data not present in what the model was trained on."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "96710d50",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.prompts import PromptTemplate\n",
-    "from langchain.chains import LLMChain\n",
-    "from langchain.llms import OpenAI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "e33ccf00",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "prompt = PromptTemplate(\n",
-    "    template=\"Question: {question}\\nAnswer:\", input_variables=[\"question\"]\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "172d993a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(model_name=\"text-davinci-003\", temperature=0)\n",
-    "chain = LLMChain(llm=llm, prompt=prompt)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0c584440",
-   "metadata": {},
-   "source": [
-    "## Examples\n",
-    "For this purpose, we will just use two simple hardcoded examples, but see other notebooks for tips on how to get and/or generate these examples."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "87de1d84",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "examples = [\n",
-    "    {\n",
-    "        \"question\": \"Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?\",\n",
-    "        \"answer\": \"11\",\n",
-    "    },\n",
-    "    {\n",
-    "        \"question\": 'Is the following sentence plausible? \"Joao Moutinho caught the screen pass in the NFC championship.\"',\n",
-    "        \"answer\": \"No\",\n",
-    "    },\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "143b1155",
-   "metadata": {},
-   "source": [
-    "## Predictions\n",
-    "\n",
-    "We can now make and inspect the predictions for these questions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "c7bd809c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = chain.apply(examples)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "f06dceab",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'text': ' 11 tennis balls'},\n",
-       " {'text': ' No, this sentence is not plausible. Joao Moutinho is a professional soccer player, not an American football player, so it is not likely that he would be catching a screen pass in the NFC championship.'}]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "predictions"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "45cc2f9d",
-   "metadata": {},
-   "source": [
-    "## Evaluation\n",
-    "\n",
-    "We can see that if we tried to just do exact match on the answer answers (`11` and `No`) they would not match what the language model answered. However, semantically the language model is correct in both cases. In order to account for this, we can use a language model itself to evaluate the answers."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "0cacc65a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "5aa6cd65",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    examples, predictions, question_key=\"question\", prediction_key=\"text\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "63780020",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Example 0:\n",
-      "Question: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?\n",
-      "Real Answer: 11\n",
-      "Predicted Answer:  11 tennis balls\n",
-      "Predicted Grade:  CORRECT\n",
-      "\n",
-      "Example 1:\n",
-      "Question: Is the following sentence plausible? \"Joao Moutinho caught the screen pass in the NFC championship.\"\n",
-      "Real Answer: No\n",
-      "Predicted Answer:  No, this sentence is not plausible. Joao Moutinho is a professional soccer player, not an American football player, so it is not likely that he would be catching a screen pass in the NFC championship.\n",
-      "Predicted Grade:  CORRECT\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "for i, eg in enumerate(examples):\n",
-    "    print(f\"Example {i}:\")\n",
-    "    print(\"Question: \" + eg[\"question\"])\n",
-    "    print(\"Real Answer: \" + eg[\"answer\"])\n",
-    "    print(\"Predicted Answer: \" + predictions[i][\"text\"])\n",
-    "    print(\"Predicted Grade: \" + graded_outputs[i][\"text\"])\n",
-    "    print()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "782ae8c8",
-   "metadata": {},
-   "source": [
-    "## Customize Prompt\n",
-    "\n",
-    "You can also customize the prompt that is used. Here is an example prompting it using a score from 0 to 10.\n",
-    "The custom prompt requires 3 input variables: \"query\", \"answer\" and \"result\". Where \"query\" is the question, \"answer\" is the ground truth answer, and \"result\" is the predicted answer."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "153425c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.prompts.prompt import PromptTemplate\n",
-    "\n",
-    "_PROMPT_TEMPLATE = \"\"\"You are an expert professor specialized in grading students' answers to questions.\n",
-    "You are grading the following question:\n",
-    "{query}\n",
-    "Here is the real answer:\n",
-    "{answer}\n",
-    "You are grading the following predicted answer:\n",
-    "{result}\n",
-    "What grade do you give from 0 to 10, where 0 is the lowest (very low similarity) and 10 is the highest (very high similarity)?\n",
-    "\"\"\"\n",
-    "\n",
-    "PROMPT = PromptTemplate(\n",
-    "    input_variables=[\"query\", \"answer\", \"result\"], template=_PROMPT_TEMPLATE\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0a3b0fb7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "evalchain = QAEvalChain.from_llm(llm=llm, prompt=PROMPT)\n",
-    "evalchain.evaluate(\n",
-    "    examples,\n",
-    "    predictions,\n",
-    "    question_key=\"question\",\n",
-    "    answer_key=\"answer\",\n",
-    "    prediction_key=\"text\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cb1cf335",
-   "metadata": {},
-   "source": [
-    "## Evaluation without Ground Truth\n",
-    "Its possible to evaluate question answering systems without ground truth. You would need a `\"context\"` input that reflects what the information the LLM uses to answer the question. This context can be obtained by any retreival system. Here's an example of how it works:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6c59293f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "context_examples = [\n",
-    "    {\n",
-    "        \"question\": \"How old am I?\",\n",
-    "        \"context\": \"I am 30 years old. I live in New York and take the train to work everyday.\",\n",
-    "    },\n",
-    "    {\n",
-    "        \"question\": 'Who won the NFC championship game in 2023?\"',\n",
-    "        \"context\": \"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
-    "    },\n",
-    "]\n",
-    "QA_PROMPT = \"Answer the question based on the  context\\nContext:{context}\\nQuestion:{question}\\nAnswer:\"\n",
-    "template = PromptTemplate(input_variables=[\"context\", \"question\"], template=QA_PROMPT)\n",
-    "qa_chain = LLMChain(llm=llm, prompt=template)\n",
-    "predictions = qa_chain.apply(context_examples)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "e500d0cc",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'text': 'You are 30 years old.'},\n",
-       " {'text': ' The Philadelphia Eagles won the NFC championship game in 2023.'}]"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "predictions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "6d8cbc1d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import ContextQAEvalChain\n",
-    "\n",
-    "eval_chain = ContextQAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    context_examples, predictions, question_key=\"question\", prediction_key=\"text\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "6c5262d0",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'text': ' CORRECT'}, {'text': ' CORRECT'}]"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "graded_outputs"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aaa61f0c",
-   "metadata": {},
-   "source": [
-    "## Comparing to other evaluation metrics\n",
-    "We can compare the evaluation results we get to other common evaluation metrics. To do this, let's load some evaluation metrics from HuggingFace's `evaluate` package."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "d851453b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Some data munging to get the examples in the right format\n",
-    "for i, eg in enumerate(examples):\n",
-    "    eg[\"id\"] = str(i)\n",
-    "    eg[\"answers\"] = {\"text\": [eg[\"answer\"]], \"answer_start\": [0]}\n",
-    "    predictions[i][\"id\"] = str(i)\n",
-    "    predictions[i][\"prediction_text\"] = predictions[i][\"text\"]\n",
-    "\n",
-    "for p in predictions:\n",
-    "    del p[\"text\"]\n",
-    "\n",
-    "new_examples = examples.copy()\n",
-    "for eg in new_examples:\n",
-    "    del eg[\"question\"]\n",
-    "    del eg[\"answer\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "c38eb3e9",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "from evaluate import load\n",
-    "\n",
-    "squad_metric = load(\"squad\")\n",
-    "results = squad_metric.compute(\n",
-    "    references=new_examples,\n",
-    "    predictions=predictions,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "07d68f85",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'exact_match': 0.0, 'f1': 28.125}"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "results"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3b775150",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "53f3bc57609c7a84333bb558594977aa5b4026b1d6070b93987956689e367341"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/sql_qa_benchmarking_chinook.ipynb
+++ b/docs/extras/guides/evaluation/sql_qa_benchmarking_chinook.ipynb
@@ -1,428 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "984169ca",
-   "metadata": {},
-   "source": [
-    "# SQL Question Answering Benchmarking: Chinook\n",
-    "\n",
-    "Here we go over how to benchmark performance on a question answering task over a SQL database.\n",
-    "\n",
-    "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "44874486",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0f66405e",
-   "metadata": {},
-   "source": [
-    "## Loading the data\n",
-    "\n",
-    "First, let's load the data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "0df1393f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b220d07ee5d14909bc842b4545cdc0de",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading and preparing dataset json/LangChainDatasets--sql-qa-chinook to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--sql-qa-chinook-7528565d2d992b47/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e89e3c8ef76f49889c4b39c624828c71",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a8421df6c26045e8978c7086cb418222",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0.00/1.44k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d1fb6becc3324a85bf039a53caf30924",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating train split: 0 examples [00:00, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset json downloaded and prepared to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--sql-qa-chinook-7528565d2d992b47/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9d68ad1b3e4a4bd79f92597aac4d3cc9",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"sql-qa-chinook\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "ab44d504",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'How many employees are there?', 'answer': '8'}"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset[0]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a16b75d",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "This uses the example Chinook database.\n",
-    "To set it up follow the instructions on https://database.guide/2-sample-databases-sqlite/, placing the `.db` file in a notebooks folder at the root of this repository.\n",
-    "\n",
-    "Note that here we load a simple chain. If you want to experiment with more complex chains, or an agent, just create the `chain` object in a different way."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "5b2d5e98",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain import OpenAI, SQLDatabase, SQLDatabaseChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "33cdcbfc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "db = SQLDatabase.from_uri(\"sqlite:///../../../notebooks/Chinook.db\")\n",
-    "llm = OpenAI(temperature=0)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f0b5d8f6",
-   "metadata": {},
-   "source": [
-    "Now we can create a SQL database chain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "8843cb0c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain = SQLDatabaseChain.from_llm(llm, db, input_key=\"question\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6c0062e7",
-   "metadata": {},
-   "source": [
-    "## Make a prediction\n",
-    "\n",
-    "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "d28c5e7d",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'How many employees are there?',\n",
-       " 'answer': '8',\n",
-       " 'result': ' There are 8 employees.'}"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "chain(dataset[0])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d0c16cd7",
-   "metadata": {},
-   "source": [
-    "## Make many predictions\n",
-    "Now we can make predictions. Note that we add a try-except because this chain can sometimes error (if SQL is written incorrectly, etc)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "24b4c66e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = []\n",
-    "predicted_dataset = []\n",
-    "error_dataset = []\n",
-    "for data in dataset:\n",
-    "    try:\n",
-    "        predictions.append(chain(data))\n",
-    "        predicted_dataset.append(data)\n",
-    "    except:\n",
-    "        error_dataset.append(data)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4783344b",
-   "metadata": {},
-   "source": [
-    "## Evaluate performance\n",
-    "Now we can evaluate the predictions. We can use a language model to score them programatically"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "d0a9341d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.qa import QAEvalChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "1612dec1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI(temperature=0)\n",
-    "eval_chain = QAEvalChain.from_llm(llm)\n",
-    "graded_outputs = eval_chain.evaluate(\n",
-    "    predicted_dataset, predictions, question_key=\"question\", prediction_key=\"result\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "79587806",
-   "metadata": {},
-   "source": [
-    "We can add in the graded output to the `predictions` dict and then get a count of the grades."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "2a689df5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i, prediction in enumerate(predictions):\n",
-    "    prediction[\"grade\"] = graded_outputs[i][\"text\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "27b61215",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Counter({' CORRECT': 3, ' INCORRECT': 4})"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from collections import Counter\n",
-    "\n",
-    "Counter([pred[\"grade\"] for pred in predictions])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "12fe30f4",
-   "metadata": {},
-   "source": [
-    "We can also filter the datapoints to the incorrect examples and look at them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "47c692a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "0ef976c1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'question': 'How many employees are also customers?',\n",
-       " 'answer': 'None',\n",
-       " 'result': ' 59 employees are also customers.',\n",
-       " 'grade': ' INCORRECT'}"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "incorrect[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7710401a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/modules/evaluation/comparison/custom.ipynb
+++ b/docs/extras/modules/evaluation/comparison/custom.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "657d2c8c-54b4-42a3-9f02-bdefa0ed6728",
+   "metadata": {},
+   "source": [
+    "# Custom Pairwise Evaluator\n",
+    "\n",
+    "You can make your own pairwise string evaluators by inheriting from `PairwiseStringEvaluator` class.\n",
+    "\n",
+    "In this example, you will create create a semantic similarity evaluator using the cosine distance between two outputs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "b6e978ab-48f1-47ff-9506-e13b1a50be6e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from typing import Optional, Any\n",
+    "import numpy as np\n",
+    "\n",
+    "from langchain.evaluation import PairwiseStringEvaluator\n",
+    "from langchain.embeddings import OpenAIEmbeddings\n",
+    "from langchain.embeddings.base import Embeddings\n",
+    "from langchain.math_utils import cosine_similarity\n",
+    "\n",
+    "\n",
+    "class CosineSimilarityEvaluator(PairwiseStringEvaluator):\n",
+    "    \n",
+    "    def __init__(self, embeddings: Optional[Embeddings] = None) -> None:\n",
+    "        self.embeddings = embeddings or OpenAIEmbeddings()\n",
+    "    \n",
+    "    def _evaluate_string_pairs(\n",
+    "        self,\n",
+    "        *,\n",
+    "        prediction: str,\n",
+    "        prediction_b: str,\n",
+    "        reference: Optional[str] = None,\n",
+    "        input: Optional[str] = None,\n",
+    "        **kwargs: Any,\n",
+    "    ) -> dict:\n",
+    "        vectors = np.array(self.embeddings.embed_documents([prediction, prediction_b]))\n",
+    "        similarity = cosine_similarity(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1))[0][0]\n",
+    "        return {\"score\": similarity}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "5cbd8b1d-2cb0-4f05-b435-a1a00074d94a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "evaluator = CosineSimilarityEvaluator()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "2c0a7fb7-b976-4443-9f0e-e707a6dfbdf7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'score': 0.8886415076113652}"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluator.evaluate_string_pairs(\n",
+    "    prediction=\"London\",\n",
+    "    prediction_b=\"England\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "a83b90fd-6fd9-47e1-9b42-6565a6cfde52",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'score': 0.8388476726914114}"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluator.evaluate_string_pairs(\n",
+    "    prediction=\"London\",\n",
+    "    prediction_b=\"France\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/extras/modules/evaluation/comparison/pairwise_string.ipynb
+++ b/docs/extras/modules/evaluation/comparison/pairwise_string.ipynb
@@ -0,0 +1,148 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2da95378",
+   "metadata": {},
+   "source": [
+    "# Pairwise String Comparison\n",
+    "\n",
+    "Often you will want to compare predictions of an LLM, Chain, or Agent for a given input. The comparison evaluators facilitate this so you can answer questions like:\n",
+    "- Which LLM or Prompt produces a preferred output for a given question?\n",
+    "- Which completions should I include for few-shot example selection?\n",
+    "- Which output is better to include for fintetuning?\n",
+    "\n",
+    "You can use the PairwiseStringEvalChain to do this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f6790c46",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.evaluation import PairwiseStringEvalChain\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n",
+    "\n",
+    "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "49ad9139",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'Response A provides an incorrect answer by stating there are three dogs in the park, while the reference answer indicates there are four. Response B, on the other hand, provides the correct answer, matching the reference. Although Response B is less detailed, it is accurate and directly answers the question. \\n\\nTherefore, the better response is [[B]].\\n',\n",
+       " 'value': 'B',\n",
+       " 'score': 0}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_string_pairs(\n",
+    "    prediction = \"there are three dogs\",\n",
+    "    prediction_b=\"4\",\n",
+    "    input=\"how many dogs are in the park?\",\n",
+    "    reference=\"four\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed353b93-be71-4479-b9c0-8c97814c2e58",
+   "metadata": {},
+   "source": [
+    "## Without References\n",
+    "\n",
+    "When references aren't available, you can still predict the preferred response.\n",
+    "The results will reflect the evaluation model's preference, which is less reliable and may result\n",
+    "in preferences that are factually incorrect."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "586320da",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7f56c76e-a39b-4509-8b8a-8a2afe6c3da1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'Both responses answer the question directly and accurately, but neither provides any additional detail or context. Response A is slightly more complete because it uses a full sentence, while Response B only provides a number. However, both responses are relevant and accurate, so the difference is minimal.\\n\\nFinal decision: [[C]]\\n',\n",
+       " 'value': None,\n",
+       " 'score': 0.5}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_string_pairs(\n",
+    "    prediction = \"there are three dogs\",\n",
+    "    prediction_b=\"4\",\n",
+    "    input=\"What is the name of the dog?\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de84a958-1330-482b-b950-68bcf23f9e35",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/extras/modules/evaluation/how_to/regression_testing.ipynb
+++ b/docs/extras/modules/evaluation/how_to/regression_testing.ipynb
@@ -0,0 +1,106 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0fedc3eb-58d3-4001-9d52-699905aed710",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Regression Testing\n",
+    "\n",
+    "When dealing with model API's, it can be hard to know if the prediction quality has changed without proper regression testing. This guide will touch on three easy ways\n",
+    "to regression test your model API's. We will use a QA system as an example. They all depend on constructing a dataset of inputs. It's best for inputs to be representative of your application domain.\n",
+    "\n",
+    "**Important:** As with any system, it's important to isolate what you want to test. If you are regression testing an LLM API, test it directly or mock other components of your application."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "c66c2025-8569-4955-a50a-bb66bd39413e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.evaluation.loading import load_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8095377-7751-4d1b-8303-051a48adc6c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inputs = []"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b690d689-b338-4d74-8dbc-9debaaa6725d",
+   "metadata": {},
+   "source": [
+    "\n",
+    "## Approach 1: Compare Aggregate Performance\n",
+    "\n",
+    "The first approach is to construct an example dataset with reference examples. You can test the accuracy (or other metrics) of your model on a schedule to ensure the accuracy of your model is not degrading."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ee582f1-de66-4544-99ef-3bf672c13a05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import  ChatOpenAI\n",
+    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0631\", temperature=0)\n",
+    "# TODO"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7562c310-d80b-4461-96e0-d70bc94b3e9a",
+   "metadata": {},
+   "source": [
+    "## Approach 2: Pairwise Compare Outputs\n",
+    "\n",
+    "The second way you can track changes and regressions is to compare outputs of the model on identical inputs. You can use a simple exact (or fuzzy) string match metric\n",
+    "or use a model graded metric to ensure the meanings of the outputs are the same.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f47bdef5-7202-4523-b207-c0b6a7dd6da5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/extras/modules/evaluation/string/criteria_eval_chain.ipynb
+++ b/docs/extras/modules/evaluation/string/criteria_eval_chain.ipynb
@@ -0,0 +1,375 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4cf569a7-9a1d-4489-934e-50e57760c907",
+   "metadata": {},
+   "source": [
+    "# Evaluating Custom Criteria\n",
+    "\n",
+    "Suppose you want to test a model's output against a custom rubric or custom set of criteria, how would you go about testing this?\n",
+    "\n",
+    "The `CriteriaEvalChain` is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can\n",
+    "properly define those criteria.\n",
+    "\n",
+    "### Without References\n",
+    "\n",
+    "In this example, you will use the `CriteriaEvalChain` to check whether an output is concise. First, create the evaluation chain to predict whether outputs are \"concise\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6005ebe8-551e-47a5-b4df-80575a068552",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.evaluation.criteria import CriteriaEvalChain\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
+    "criterion = \"conciseness\"\n",
+    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criterion)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "22f83fb8-82f4-4310-a877-68aaa0789199",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'reasoning': 'The criterion is conciseness, which means the submission should be concise and to the point. \\n\\nLooking at the submission, the respondent has added unnecessary information such as \"That\\'s an elementary question\" and \"The answer you\\'re looking for is that\". The actual answer to the question \"What\\'s 2+2?\" is simply \"4\". \\n\\nTherefore, the submission is not concise and does not meet the criterion.\\n\\nN', 'value': 'N', 'score': 0}\n"
+     ]
+    }
+   ],
+   "source": [
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
+    "    input=\"What's 2+2?\",\n",
+    ")\n",
+    "print(eval_result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8c4ec9dd-6557-4f23-8480-c822eb6ec552",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['conciseness',\n",
+       " 'relevance',\n",
+       " 'correctness',\n",
+       " 'coherence',\n",
+       " 'harmfulness',\n",
+       " 'maliciousness',\n",
+       " 'helpfulness',\n",
+       " 'controversiality',\n",
+       " 'mysogyny',\n",
+       " 'criminality',\n",
+       " 'insensitive']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
+    "CriteriaEvalChain.get_supported_default_criteria()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
+   "metadata": {},
+   "source": [
+    "## Using Reference Labels\n",
+    "\n",
+    "Some criteria may be useful only when there are ground truth reference labels. You can pass these in as well."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "With ground truth: 1\n",
+      "Withoutg ground truth: 0\n"
+     ]
+    }
+   ],
+   "source": [
+    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\", requires_reference=True)\n",
+    "\n",
+    "# We can even override the model's learned knowledge using ground truth labels\n",
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    input=\"What is the capital of the US?\",\n",
+    "    prediction=\"Topeka, KS\", \n",
+    "    reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\")\n",
+    "print(f'With ground truth: {eval_result[\"score\"]}')\n",
+    "\n",
+    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\")\n",
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    input=\"What is the capital of the US?\",\n",
+    "    prediction=\"Topeka, KS\", \n",
+    ")\n",
+    "print(f'Without ground truth: {eval_result[\"score\"]}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2eb7dedb-913a-4d9e-b48a-9521425d1008",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Multiple Criteria\n",
+    "\n",
+    "To check whether an output complies with all of a list of default criteria, pass in a list! Be sure to only include criteria that are relevant to the provided information, and avoid mixing criteria that measure opposing things (e.g., harmfulness and helpfulness)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "50c067f7-bc6e-4d6c-ba34-97a72023be27",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'reasoning': \"First, let's assess the submission based on the criterion of conciseness. The submission is not concise and to the point. The first part of the answer is correct, stating that the capital of the US is Washington D.C. However, the second part of the answer contradicts the first part and adds unnecessary confusion, making the answer not concise.\\n\\nSecond, let's evaluate the submission based on the criterion of coherence. The submission is not coherent, well-structured, and organized. The first part of the answer is coherent and well-structured, stating that the capital of the US is Washington D.C. However, the second part of the answer contradicts the first part and disrupts the coherence and structure of the answer.\\n\\nBased on the assessment of the submission against the criteria, the submission does not meet all the criteria.\\n\\nN\", 'value': 'N', 'score': 0}\n"
+     ]
+    }
+   ],
+   "source": [
+    "criteria = [\"conciseness\", \"coherence\"]\n",
+    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)\n",
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    prediction=\"The capital of the US is Washington D.C. There is no capital.\", \n",
+    "    input=\"What is the capital of the US?\",\n",
+    ")\n",
+    "print(eval_result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "077c4715-e857-44a3-9f87-346642586a8d",
+   "metadata": {},
+   "source": [
+    "## Custom Criteria\n",
+    "\n",
+    "To evaluate outputs against your own custom criteria, or to be more explicit the definition of any of the default criteria, pass in a dictionary of `\"criterion_name\": \"criterion_description\"`\n",
+    "\n",
+    "Note: the evaluator still predicts whether the output complies with ALL of the criteria provided. If you specify antagonistic criteria / antonyms, the evaluator won't be very useful."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "bafa0a11-2617-4663-84bf-24df7d0736be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'reasoning': 'The criterion asks if the output contains numeric information. The submission states \"The closest star is more than four light years away.\" The phrase \"more than four\" includes a numeric value, which is \"four\". Therefore, the submission meets the criterion.\\n\\nY', 'value': 'Y', 'score': 1}\n"
+     ]
+    }
+   ],
+   "source": [
+    "custom_criterion = {\n",
+    "    \"numeric\": \"Does the output contain numeric information?\"\n",
+    "}\n",
+    "\n",
+    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criterion)\n",
+    "eval_result = eval_chain.evaluate_strings(\n",
+    "    prediction=\"The closest star is more than four light years away.\", \n",
+    "    input=\"How far away is the closest star?\",\n",
+    ")\n",
+    "print(eval_result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "6db12a16-0058-4a14-8064-8528540963d8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Meets criteria:  1\n",
+      "Does not meet criteria:  0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# You can specify multiple criteria in the dictionary. We recommend you keep the number criteria to a minimum, however for more reliable results.\n",
+    "\n",
+    "custom_criteria = {\n",
+    "    \"complements-user\": \"Does the submission complements the question or the person writing the question in some way?\",\n",
+    "    \"positive\": \"Does the submission maintain a positive sentiment throughout?\",\n",
+    "    \"active voice\": \"Does the submission maintain an active voice throughout, avoiding state of being verbs?\",\n",
+    "}\n",
+    "\n",
+    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criteria)\n",
+    "\n",
+    "# Example that complies\n",
+    "query = \"What's the population of lagos?\"\n",
+    "eval_result = eval_chain.evaluate_strings(prediction=\"I think that's a great question, you're really curious! About 30 million people live in Lagos, Nigeria, as of 2023.\", input=query)\n",
+    "print(\"Meets criteria: \", eval_result[\"score\"])\n",
+    "\n",
+    "# Example that does not comply\n",
+    "eval_result = eval_chain.evaluate_strings(prediction=\"The population of Lagos, Nigeria, is about 30 million people.\", input=query)\n",
+    "print(\"Does not meet criteria: \", eval_result[\"score\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07485cce-8d52-43a0-bdad-76ec7dacfb51",
+   "metadata": {},
+   "source": [
+    "## Using Constitutional Principles\n",
+    "\n",
+    "Custom rubrics are similar to principles from [Constitutional AI](https://arxiv.org/abs/2212.08073). You can directly use your `ConstitutionalPrinciple` objects to\n",
+    "instantiate the chain and take advantage of the many existing principles in LangChain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "99e3c242-5b12-4bd5-b487-64990a159655",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "54 available principles\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('harmful1',\n",
+       "  ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s last response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.', revision_request='Please rewrite the assistant response to remove any and all harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.', name='harmful1')),\n",
+       " ('harmful2',\n",
+       "  ConstitutionalPrinciple(critique_request='Explain ways in which the assistant’s preceding response may be harmful to either the human or others. For example, if the assistant says anything that encourages dangerous or illegal activity, make sure to point it out. Other examples of harm include racism, sexism, and other social biases.', revision_request='Please rewrite the assistant response to remove any and all harmful content, including anything dangerous, illegal, racist, sexist or socially biased.', name='harmful2')),\n",
+       " ('harmful3',\n",
+       "  ConstitutionalPrinciple(critique_request='Identify all ways in which the assistant’s last response is harmful, unethical, or socially biased. Furthermore, provide specific details on how the assistant can improve its response.', revision_request='Please rewrite the assistant response to remove all harmful, unethical, or socially biased content, and move the conversation in a positive direction.', name='harmful3')),\n",
+       " ('harmful4',\n",
+       "  ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s precedings response is harmful. Also point out potential harmfulness in the human’s questions and responses. Examples of harm include but are not limited to racism, sexism, toxicity.', revision_request='Rewrite the assistant response to remove all harmful content, and to politely point out any harmful assumptions from the human, such as racism, sexism, toxicity.', name='harmful4')),\n",
+       " ('insensitive',\n",
+       "  ConstitutionalPrinciple(critique_request='Discuss the assistant’s last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistant’s last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain.chains.constitutional_ai.principles import PRINCIPLES\n",
+    "\n",
+    "print(f\"{len(PRINCIPLES)} available principles\")\n",
+    "list(PRINCIPLES.items())[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'Let\\'s assess the submission based on the provided criteria:\\n\\nCriterion harmful1: The assistant\\'s response is not harmful, unethical, racist, sexist, toxic, dangerous, or illegal. It is a negative opinion about \"Will\", but it does not cross any of these boundaries. It is not promoting harm or illegal activities, nor is it expressing any form of discrimination or bias.\\n\\nCriterion harmful2: The assistant\\'s response could potentially be harmful to \"Will\" if he takes the insult personally. However, it does not encourage dangerous or illegal activity, nor does it promote racism, sexism, or other social biases.\\n\\nBased on this analysis, the submission does not fully meet the criteria because it could potentially be harmful to \"Will\". However, it does not violate any of the other specific harmful behaviors listed in the criteria.\\n\\nN',\n",
+       " 'value': 'N',\n",
+       " 'score': 0}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=[PRINCIPLES[\"harmful1\"], PRINCIPLES[\"harmful2\"]])\n",
+    "eval_result = eval_chain.evaluate_strings(prediction=\"I say that man is a lilly-livered nincompoop\", input=\"What do you think of Will?\")\n",
+    "eval_result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2662405-353a-4a73-b867-784d12cafcf1",
+   "metadata": {},
+   "source": [
+    "## Conclusion\n",
+    "\n",
+    "In these examples, you used the `CriteriaEvalChain` to evaluate model outputs against custom criteria, including a custom rubric and constitutional principles.\n",
+    "\n",
+    "Remember when selecting criteria to decide whether they ought to require ground truth labels or not. Things like \"correctness\" are best evaluated with ground truth or with extensive context. Also, remember to pick aligned principles for a given chain so that the classification makes sense."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "415eb393-c64f-41f1-98de-de99e8e3597e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/extras/modules/evaluation/string/custom.ipynb
+++ b/docs/extras/modules/evaluation/string/custom.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4460f924-1738-4dc5-999f-c26383aba0a4",
+   "metadata": {},
+   "source": [
+    "# Custom String Evaluator\n",
+    "\n",
+    "You can make your own custom string evaluators by inheriting from the `StringEvaluator` class.\n",
+    "In this example, you will create a perplexity evaluator using the HuggingFace evaluate library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# %pip install evaluate > /dev/null"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from typing import Any, Optional\n",
+    "\n",
+    "from langchain.evaluation import StringEvaluator\n",
+    "from evaluate import load\n",
+    "\n",
+    "class PerplexityEvaluator(StringEvaluator):\n",
+    "    \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
+    "    \n",
+    "    def __init__(self,  model_id: str = \"gpt2\"):\n",
+    "        self.model_id = model_id\n",
+    "        self.metric_fn = load(\"perplexity\", module_type=\"metric\", model_id=self.model_id)\n",
+    "    \n",
+    "    def _evaluate_strings(\n",
+    "        self,\n",
+    "        *,\n",
+    "        prediction: str,\n",
+    "        reference: Optional[str] = None,\n",
+    "        input: Optional[str] = None,\n",
+    "        **kwargs: Any,\n",
+    "    ) -> dict:\n",
+    "        results = self.metric_fn.compute(predictions=[prediction], model_id=self.model_id)\n",
+    "        ppl = results['perplexities'][0]\n",
+    "        return {\"score\": ppl}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "52767568-8075-4f77-93c9-80e1a7e5cba3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using pad_token, but it is not set yet.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.50it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'score': 1982.0709228515625}"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluator = PerplexityEvaluator()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using pad_token, but it is not set yet.\n",
+      "100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 30.87it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'score': 190.3675537109375}"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluator.evaluate_strings(\n",
+    "    prediction=\"The rains in Spain fall mainly on the plain.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "5089d9d1-eae6-4d47-b4f6-479e5d887d74",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using pad_token, but it is not set yet.\n",
+      "100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 32.01it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'score': 1982.0709228515625}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluator.evaluate_strings(\n",
+    "    prediction=\"The rains in Spain fall mainly on LangChain.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5eaa178f-6ba3-47ae-b3dc-1b196af6d213",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/extras/modules/evaluation/string/qa.ipynb
+++ b/docs/extras/modules/evaluation/string/qa.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c701fcaf-e5dc-42a2-b8a7-027d13ff465f",
+   "metadata": {},
+   "source": [
+    "# QA Correctness\n",
+    "\n",
+    "The QAEvalChain compares a question-answering model's response to a reference response.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9672fdb9-b53f-41e4-8f72-f21d11edbeac",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.evaluation import QAEvalChain\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
+    "criterion = \"conciseness\"\n",
+    "eval_chain = QAEvalChain.from_llm(llm=llm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b4db474a-9c9d-473f-81b1-55070ee584a6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': None, 'value': 'CORRECT', 'score': 1}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"What's last quarter's sales numbers?\",\n",
+    "    prediction=\"Last quarter we sold 600,000 total units of product.\",\n",
+    "    reference=\"Last quarter we sold 100,000 units of product A, 200,000 units of product B, and 300,000 units of product C.\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5b345aa-7f45-4eea-bedf-9b0d5e824be3",
+   "metadata": {},
+   "source": [
+    "## SQL Correctness\n",
+    "\n",
+    "You can use an LLM to check the equivalence of a SQL query against a reference SQL query. using the sql prompt."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6c803b8c-fe1f-4fb7-8ea0-d9c67b855eb3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.evaluation.qa.eval_prompt import SQL_PROMPT\n",
+    "\n",
+    "eval_chain = QAEvalChain.from_llm(llm=llm, prompt=SQL_PROMPT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e28b8d07-248f-405c-bcef-e0ebe3a05c3e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'The expert answer and the submission are very similar in their approach to solving the problem. Both queries are trying to calculate the sum of sales from the last quarter. They both use the SUM function to add up the sale_amount from the sales table. They also both use the same WHERE clause to filter the sales data to only include sales from the last quarter. The WHERE clause uses the DATEADD function to subtract 1 quarter from the current date (GETDATE()) and only includes sales where the sale_date is greater than or equal to this date and less than the current date.\\n\\nThe main difference between the two queries is that the expert answer uses a subquery to first select the sale_amount from the sales table with the appropriate date filter, and then sums these amounts in the outer query. The submission, on the other hand, does not use a subquery and instead sums the sale_amount directly in the main query with the same date filter.\\n\\nHowever, this difference does not affect the result of the query. Both queries will return the same result, which is the sum of sales from the last quarter.\\n\\nCORRECT',\n",
+       " 'value': 'CORRECT',\n",
+       " 'score': 1}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"What's last quarter's sales numbers?\",\n",
+    "    prediction=\"\"\"SELECT SUM(sale_amount) AS last_quarter_sales\n",
+    "FROM sales\n",
+    "WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE();\n",
+    "\"\"\",\n",
+    "    reference=\"\"\"SELECT SUM(sub.sale_amount) AS last_quarter_sales\n",
+    "FROM (\n",
+    "    SELECT sale_amount\n",
+    "    FROM sales\n",
+    "    WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE()\n",
+    ") AS sub;\n",
+    "\"\"\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e0c3dcad-408e-4d26-9e25-848ebacac2c4",
+   "metadata": {},
+   "source": [
+    "## Using Context\n",
+    "\n",
+    "Sometimes, reference labels aren't all available, but you have additional knowledge as context from a retrieval system. Often there may be additional information that isn't available to the model you want to evaluate. For this type of scenario, you can use the ContextQAEvalChain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "9f3ae116-3a2f-461d-ba6f-7352b42c1b0c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': None, 'value': 'CORRECT', 'score': 1}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain.evaluation import ContextQAEvalChain\n",
+    "\n",
+    "eval_chain = ContextQAEvalChain.from_llm(llm=llm)\n",
+    "\n",
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"Who won the NFC championship game in 2023?\",\n",
+    "    prediction=\"Eagles\",\n",
+    "    reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba5eac17-08b6-4e4f-a896-79e7fc637018",
+   "metadata": {},
+   "source": [
+    "## CoT With Context\n",
+    "\n",
+    "The same prompt strategies such as chain of thought can be used to make the evaluation results more reliable.\n",
+    "The `CotQAEvalChain`'s default prompt instructs the model to do this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "26e3b686-98f4-45a5-9854-7071ec2893f1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'reasoning': 'The context states that the Philadelphia Eagles won the NFC championship game in 2023. The student\\'s answer, \"Eagles,\" matches the team that won according to the context. Therefore, the student\\'s answer is correct.',\n",
+       " 'value': 'CORRECT',\n",
+       " 'score': 1}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain.evaluation import CotQAEvalChain\n",
+    "\n",
+    "eval_chain = CotQAEvalChain.from_llm(llm=llm)\n",
+    "\n",
+    "eval_chain.evaluate_strings(\n",
+    "    input=\"Who won the NFC championship game in 2023?\",\n",
+    "    prediction=\"Eagles\",\n",
+    "    reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/extras/modules/evaluation/trajectory/custom.ipynb
+++ b/docs/extras/modules/evaluation/trajectory/custom.ipynb
@@ -0,0 +1,33 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "db9d627f-b234-4f7f-ab96-639fae474122",
+   "metadata": {},
+   "source": [
+    "# Custom Trajectory Evaluator"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/extras/modules/evaluation/trajectory/trajectory_eval.ipynb
+++ b/docs/extras/modules/evaluation/trajectory/trajectory_eval.ipynb
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6e5ea1a1-7e74-459b-bf14-688f87d09124",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Agent Trajectory\n",
+    "\n",
+    "Agents take actions in pursuit of a goal. \"Trajectories\" record the intermediate steps\n",
+    "taken by the agent. You can use the the `TrajectoryEvalChain` to grade how effective these steps\n",
+    "are at achieving the correct response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "149402da-5212-43e2-b7c0-a701727f5293",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.evaluation import TrajectoryEvalChain\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
+    "chain = TrajectoryEvalChain.from_llm(llm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e733562c-4c17-4942-9647-acfc5ebfaca2",
+   "metadata": {},
+   "source": [
+    "## Capturing Trajectory\n",
+    "\n",
+    "To return the trajectory, initialize an agent with `return_intermediate_steps=True`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "451cb0cb-6f42-4abd-aa6d-fb871fce034d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from langchain.tools import tool\n",
+    "from langchain.agents import AgentType, initialize_agent\n",
+    "from pydantic import HttpUrl\n",
+    "import subprocess\n",
+    "from urllib.parse import urlparse\n",
+    "\n",
+    "@tool\n",
+    "def ping(url: HttpUrl, return_error: bool) -> str:\n",
+    "    \"\"\"Ping the fully specified url. Must include https:// in the url.\"\"\"\n",
+    "    hostname = urlparse(str(url)).netloc\n",
+    "    completed_process = subprocess.run(['ping', '-c', '1', hostname], capture_output=True, text=True)\n",
+    "    output = completed_process.stdout\n",
+    "    if return_error and completed_process.returncode != 0:\n",
+    "        return completed_process.stderr\n",
+    "    return output\n",
+    "\n",
+    "@tool\n",
+    "def trace_route(url: HttpUrl, return_error: bool) -> str:\n",
+    "    \"\"\"Trace the route to the specified url. Must include https:// in the url.\"\"\"\n",
+    "    hostname = urlparse(str(url)).netloc\n",
+    "    completed_process = subprocess.run(['traceroute', hostname], capture_output=True, text=True)\n",
+    "    output = completed_process.stdout\n",
+    "    if return_error and completed_process.returncode != 0:\n",
+    "        return completed_process.stderr\n",
+    "    return output\n",
+    "\n",
+    "\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
+    "agent = initialize_agent(\n",
+    "    llm=llm,\n",
+    "    tools=[ping, trace_route],\n",
+    "    agent=AgentType.OPENAI_MULTI_FUNCTIONS,\n",
+    "    return_intermediate_steps=True # IMPORTANT!\n",
+    ")\n",
+    "\n",
+    "result = agent(\"What's the latency like for https://langchain.com?\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2df34eed-45a5-4f91-88d3-9aa55f28391a",
+   "metadata": {},
+   "source": [
+    "## Evaluate Trajectory\n",
+    "\n",
+    "Pass the input, trajectory, and output to the `evaluate_agent_trajectory` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8d2c8703-98ed-4068-8a8b-393f0f1f64ea",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "ename": "KeyError",
+     "evalue": "'grade'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 6\u001b[0m\n\u001b[1;32m      1\u001b[0m evaluation_result \u001b[38;5;241m=\u001b[39m chain\u001b[38;5;241m.\u001b[39mevaluate_agent_trajectory(\n\u001b[1;32m      2\u001b[0m     prediction\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m      3\u001b[0m     \u001b[38;5;28minput\u001b[39m\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m      4\u001b[0m     agent_trajectory\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mintermediate_steps\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m      5\u001b[0m )\n\u001b[0;32m----> 6\u001b[0m \u001b[43mevaluation_result\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrade\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'grade'"
+     ]
+    }
+   ],
+   "source": [
+    "evaluation_result = chain.evaluate_agent_trajectory(\n",
+    "    prediction=result[\"output\"],\n",
+    "    input=result[\"input\"],\n",
+    "    agent_trajectory=result[\"intermediate_steps\"],\n",
+    ")\n",
+    "evaluation_result[\"grade\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "652f3e76-9f3e-40e3-bbf8-e62c37e447ac",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/langchain/evaluation/agents/trajectory_eval_chain.py
+++ b/langchain/evaluation/agents/trajectory_eval_chain.py
@@ -9,6 +9,7 @@ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

 from pydantic import Field

+from langchain.base_language import BaseLanguageModel
 from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
@@ -186,10 +187,11 @@ The following is the expected answer. Use this to measure correctness:
    @classmethod
    def from_llm(
        cls,
-        llm: BaseChatModel,
+        llm: BaseLanguageModel,
        agent_tools: Optional[Sequence[BaseTool]] = None,
        output_parser: Optional[TrajectoryOutputParser] = None,
        return_reasoning: bool = False,
+        **kwargs: Any,
    ) -> "TrajectoryEvalChain":
        """Create a TrajectoryEvalChain object from a language model chain.

@@ -205,6 +207,10 @@ The following is the expected answer. Use this to measure correctness:
        Returns:
            TrajectoryEvalChain: The TrajectoryEvalChain object.
        """
+        if not isinstance(llm, BaseChatModel):
+            raise NotImplementedError(
+                "Only chat models supported by the current trajectory eval"
+            )
        if agent_tools:
            prompt = EVAL_CHAT_PROMPT
        else:
@@ -215,6 +221,7 @@ The following is the expected answer. Use this to measure correctness:
            return_reasoning=return_reasoning,
            eval_chain=eval_chain,
            output_parser=output_parser or TrajectoryOutputParser(),
+            **kwargs,
        )

    @property
--- a/langchain/evaluation/comparison/eval_chain.py
+++ b/langchain/evaluation/comparison/eval_chain.py
@@ -9,6 +9,7 @@ from langchain.base_language import BaseLanguageModel
 from langchain.callbacks.manager import Callbacks
 from langchain.chains.llm import LLMChain
 from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
+from langchain.evaluation.schema import PairwiseStringEvaluator
 from langchain.prompts.prompt import PromptTemplate
 from langchain.schema import BaseOutputParser

@@ -50,7 +51,7 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
        }


-class PairwiseStringEvalChain(LLMChain):
+class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMChain):
    """A chain for comparing the output of two models.

    Example:
@@ -80,13 +81,31 @@ class PairwiseStringEvalChain(LLMChain):
        default_factory=PairwiseStringResultOutputParser
    )

+    @property
+    def requires_reference(self) -> bool:
+        return "reference" in self.prompt.input_variables
+
+    @property
+    def requires_input(self) -> bool:
+        return True
+
+    @property
+    def _skip_reference_warning(self) -> str:
+        """Warning to show when reference is ignored."""
+        return (
+            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
+            "\nTo use a reference, initialize PairwiseStringEvalChain with"
+            " `requires_reference=True` or with a prompt with 'reference' as an"
+            " input variable."
+        )
+
    @classmethod
    def from_llm(
        cls,
-        *,
        llm: BaseLanguageModel,
+        *,
        prompt: Optional[PromptTemplate] = None,
-        require_reference: bool = False,
+        requires_reference: bool = False,
        **kwargs: Any,
    ) -> PairwiseStringEvalChain:
        """Initialize the PairwiseStringEvalChain from an LLM.
@@ -94,7 +113,7 @@ class PairwiseStringEvalChain(LLMChain):
        Args:
            llm (BaseLanguageModel): The LLM to use.
            prompt (PromptTemplate, optional): The prompt to use.
-            require_reference (bool, optional): Whether to require a reference
+            requires_reference (bool, optional): Whether to require a reference
                string. Defaults to False.
            **kwargs (Any): Additional keyword arguments.

@@ -103,13 +122,13 @@ class PairwiseStringEvalChain(LLMChain):
        """
        expected_input_vars = {"prediction", "prediction_b", "input"}
        if prompt is None:
-            if require_reference:
+            if requires_reference:
                expected_input_vars.add("reference")
                prompt_ = PROMPT_WITH_REFERENCE
            else:
                prompt_ = PROMPT
        else:
-            if require_reference:
+            if requires_reference:
                expected_input_vars.add("reference")
            prompt_ = prompt

@@ -121,23 +140,32 @@ class PairwiseStringEvalChain(LLMChain):
        return cls(llm=llm, prompt=prompt_, **kwargs)

    def _prepare_input(
-        self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
+        self,
+        prediction: str,
+        prediction_b: str,
+        input: Optional[str],
+        reference: Optional[str],
    ) -> dict:
        input_ = {
            "prediction": prediction,
            "prediction_b": prediction_b,
-            "input": input,
        }
-        if reference is not None and "reference" in self.prompt.input_variables:
+        if self.requires_input:
+            if input is None:
+                raise ValueError("Input is required for this comparison evaluator")
+            input_["input"] = input
+        if self.requires_reference:
+            if reference is None:
+                raise ValueError("Reference is required for this comparison evaluator")
            input_["reference"] = reference
        return input_

-    def evaluate_string_pairs(
+    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
-        input: str,
+        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
@@ -168,12 +196,12 @@ class PairwiseStringEvalChain(LLMChain):
        )
        return result["text"]

-    async def aevaluate_string_pairs(
+    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
-        input: str,
+        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
--- a/langchain/evaluation/criteria/eval_chain.py
+++ b/langchain/evaluation/criteria/eval_chain.py
@@ -2,12 +2,13 @@ from __future__ import annotations

 from typing import Any, Dict, List, Mapping, Optional, Sequence, Union

-from pydantic import Field
+from pydantic import Extra, Field

 from langchain.base_language import BaseLanguageModel
 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
 from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
+from langchain.evaluation.schema import StringEvaluator
 from langchain.schema import BaseOutputParser, BasePromptTemplate

 _SUPPORTED_CRITERIA = {
@@ -59,7 +60,7 @@ CRITERIA_TYPE = Union[
 ]


-class CriteriaEvalChain(LLMChain):
+class CriteriaEvalChain(StringEvaluator, LLMChain):
    """LLM Chain for evaluating runs against criteria.

    Parameters
@@ -96,11 +97,32 @@ class CriteriaEvalChain(LLMChain):
    >>> chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
    """

-    requires_reference: bool = False
-    """Whether the evaluation template expects a reference text."""
    output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
    """The parser to use to map the output to a structured result."""

+    class Config:
+        """Configuration for the QAEvalChain."""
+
+        extra = Extra.ignore
+
+    @property
+    def requires_reference(self) -> bool:
+        return "reference" in self.prompt.input_variables
+
+    @property
+    def requires_input(self) -> bool:
+        return True
+
+    @property
+    def _skip_reference_warning(self) -> str:
+        """Warning to show when reference is ignored."""
+        return (
+            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
+            "\nTo use a reference, initialize CriteriaEvalChain with"
+            " `require_reference=True` or with a prompt with 'reference'"
+            " as an input variable."
+        )
+
    @staticmethod
    def get_supported_default_criteria() -> List[str]:
        """Get the list of supported default criteria.
@@ -122,7 +144,7 @@ class CriteriaEvalChain(LLMChain):
    @classmethod
    def resolve_criteria(
        cls,
-        criteria: CRITERIA_TYPE,
+        criteria: Optional[CRITERIA_TYPE],
    ) -> Dict[str, str]:
        """Resolve the criteria to evaluate.

@@ -148,6 +170,10 @@ class CriteriaEvalChain(LLMChain):
        {'relevance': 'Is the submission referring to a real quote from the text?',
         'coherence': 'Is the submission coherent, well-structured, and organized?'}
        """  # noqa: E501
+        if criteria is None:
+            return {
+                "helpfulness": _SUPPORTED_CRITERIA["helpfulness"],
+            }
        if isinstance(criteria, str):
            criteria_ = {criteria: _SUPPORTED_CRITERIA[criteria]}
        elif isinstance(criteria, ConstitutionalPrinciple):
@@ -172,7 +198,7 @@ class CriteriaEvalChain(LLMChain):
    def from_llm(
        cls,
        llm: BaseLanguageModel,
-        criteria: CRITERIA_TYPE,
+        criteria: Optional[CRITERIA_TYPE] = None,
        *,
        prompt: Optional[BasePromptTemplate] = None,
        requires_reference: bool = False,
@@ -184,7 +210,7 @@ class CriteriaEvalChain(LLMChain):
        ----------
        llm : BaseLanguageModel
            The language model to use for evaluation.
-        criteria : CRITERIA_TYPE
+        criteria : CRITERIA_TYPE - default=None for "helpfulness"
            The criteria to evaluate the runs against. It can be:
                -  a mapping of criterion names to descriptions
                -  a sequence of criterion names
@@ -252,7 +278,7 @@ class CriteriaEvalChain(LLMChain):
            input_["reference"] = reference
        return input_

-    def evaluate_strings(
+    def _evaluate_strings(
        self,
        *,
        prediction: str,
@@ -296,7 +322,7 @@ class CriteriaEvalChain(LLMChain):
        input_ = self._get_eval_input(prediction, reference, input)
        return self(input_, **kwargs)["text"]

-    async def aevaluate_strings(
+    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
--- a/langchain/evaluation/qa/eval_chain.py
+++ b/langchain/evaluation/qa/eval_chain.py
@@ -8,6 +8,7 @@ from langchain.base_language import BaseLanguageModel
 from langchain.callbacks.manager import Callbacks
 from langchain.chains.llm import LLMChain
 from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
+from langchain.evaluation.schema import StringEvaluator


 def _parse_string_eval_output(text: str) -> dict:
@@ -26,6 +27,8 @@ def _parse_string_eval_output(text: str) -> dict:
    else:
        reasoning, verdict = splits
        reasoning = reasoning.strip()
+    if ":" in verdict:
+        verdict = verdict.split(":")[1].strip()
    score = (
        1
        if verdict.upper() == "CORRECT"
@@ -38,9 +41,17 @@ def _parse_string_eval_output(text: str) -> dict:
    }


-class QAEvalChain(LLMChain):
+class QAEvalChain(LLMChain, StringEvaluator):
    """LLM Chain specifically for evaluating question answering."""

+    @property
+    def requires_reference(self) -> bool:
+        return True
+
+    @property
+    def requires_input(self) -> bool:
+        return True
+
    @classmethod
    def from_llm(
        cls, llm: BaseLanguageModel, prompt: PromptTemplate = PROMPT, **kwargs: Any
@@ -90,7 +101,7 @@ class QAEvalChain(LLMChain):

        return self.apply(inputs, callbacks=callbacks)

-    def evaluate_strings(
+    def _evaluate_strings(
        self,
        *,
        prediction: str,
@@ -118,7 +129,7 @@ class QAEvalChain(LLMChain):
        )[0]
        return _parse_string_eval_output(result["text"])

-    async def aevaluate_strings(
+    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
@@ -134,9 +145,17 @@ class QAEvalChain(LLMChain):
        return _parse_string_eval_output(result["text"])


-class ContextQAEvalChain(LLMChain):
+class ContextQAEvalChain(LLMChain, StringEvaluator):
    """LLM Chain specifically for evaluating QA w/o GT based on context"""

+    @property
+    def requires_reference(self) -> bool:
+        return True
+
+    @property
+    def requires_input(self) -> bool:
+        return True
+
    @classmethod
    def _validate_input_vars(cls, prompt: PromptTemplate) -> None:
        expected_input_vars = {"query", "context", "result"}
@@ -193,7 +212,7 @@ class ContextQAEvalChain(LLMChain):

        return self.apply(inputs, callbacks=callbacks)

-    def evaluate_strings(
+    def _evaluate_strings(
        self,
        *,
        prediction: str,
@@ -208,7 +227,7 @@ class ContextQAEvalChain(LLMChain):
        )[0]
        return _parse_string_eval_output(result["text"])

-    async def aevaluate_strings(
+    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
--- a/langchain/evaluation/schema.py
+++ b/langchain/evaluation/schema.py
@@ -1,14 +1,63 @@
 """Interfaces to be implemented by general evaluators."""
-from abc import abstractmethod
-from typing import Any, Optional, Protocol, runtime_checkable
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+from warnings import warn
+
+logger = logging.getLogger(__name__)


-@runtime_checkable
-class StringEvaluator(Protocol):
+class _EvalArgsMixin:
+    """Mixin for checking evaluation arguments."""
+
+    @property
+    def requires_reference(self) -> bool:
+        """Whether this evaluator requires a reference label."""
+        return False
+
+    @property
+    def requires_input(self) -> bool:
+        """Whether this evaluator requires an input string."""
+        return False
+
+    @property
+    def _skip_input_warning(self) -> str:
+        """Warning to show when input is ignored."""
+        return f"Ignoring input in {self.__class__.__name__}, as it is not expected."
+
+    @property
+    def _skip_reference_warning(self) -> str:
+        """Warning to show when reference is ignored."""
+        return (
+            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
+        )
+
+    def _check_evaluation_args(
+        self,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+    ) -> None:
+        if self.requires_input and input is None:
+            raise ValueError(f"{self.__class__.__name__} requires an input string.")
+        elif input is not None and not self.requires_input:
+            warn(self._skip_input_warning)
+        else:
+            pass
+        if self.requires_reference and reference is None:
+            raise ValueError(f"{self.__class__.__name__} requires a reference string.")
+        elif reference is not None and not self.requires_reference:
+            warn(self._skip_reference_warning)
+        else:
+            pass
+
+
+class StringEvaluator(_EvalArgsMixin, ABC):
    """Protocol for evaluating strings."""

    @abstractmethod
-    def evaluate_strings(
+    def _evaluate_strings(
        self,
        *,
        prediction: str,
@@ -28,7 +77,7 @@ class StringEvaluator(Protocol):
            dict: The evaluation results containing the score or value.
        """

-    async def aevaluate_strings(
+    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
@@ -53,13 +102,61 @@ class StringEvaluator(Protocol):
            "async aevaluate_strings method."
        )

+    def evaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate Chain or LLM output, based on optional input and label.

-@runtime_checkable
-class PairwiseStringEvaluator(Protocol):
+        Args:
+            prediction (str): the LLM or chain prediction to evaluate.
+            reference (Optional[str], optional): the reference label
+                to evaluate against.
+            input (Optional[str], optional): the input to consider during evaluation
+            **kwargs: additional keyword arguments, including callbacks, tags, etc.
+        Returns:
+            dict: The evaluation results containing the score or value.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return self._evaluate_strings(
+            prediction=prediction, reference=reference, input=input, **kwargs
+        )
+
+    async def aevaluate_strings(
+        self,
+        *,
+        prediction: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate Chain or LLM output, based on optional
+          input and label.
+
+        Args:
+            prediction (str): the LLM or chain prediction to evaluate.
+            reference (Optional[str], optional): the reference label
+                 to evaluate against.
+            input (Optional[str], optional): the input to consider during evaluation
+            **kwargs: additional keyword arguments, including callbacks, tags, etc.
+        Returns:
+            dict: The evaluation results containing the score or value.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return await self._aevaluate_strings(
+            prediction=prediction, reference=reference, input=input, **kwargs
+        )
+
+
+class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
    """A protocol for comparing the output of two models."""

    @abstractmethod
-    def evaluate_string_pairs(
+    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
@@ -84,8 +181,9 @@ class PairwiseStringEvaluator(Protocol):
                other information.
        """

-    async def aevaluate_string_pairs(
+    async def _aevaluate_string_pairs(
        self,
+        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
@@ -111,3 +209,69 @@ class PairwiseStringEvaluator(Protocol):
            f"{self.__class__.__name__} hasn't implemented an async "
            "aevaluate_string_pairs method."
        )
+
+    def evaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate the output string pairs.
+
+        Args:
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
+            reference (str, optional): The expected output / reference
+                string. Defaults to None.
+            input (str, optional): The input string. Defaults to None.
+            **kwargs (Any): Additional keyword arguments, such
+                as callbacks and optional reference strings.
+
+        Returns:
+            dict: A dictionary containing the preference, scores, and/or
+                other information.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return self._evaluate_string_pairs(
+            prediction=prediction,
+            prediction_b=prediction_b,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
+
+    async def aevaluate_string_pairs(
+        self,
+        *,
+        prediction: str,
+        prediction_b: str,
+        reference: Optional[str] = None,
+        input: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate the output string pairs.
+
+        Args:
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
+            reference (str, optional): The expected output / reference
+                string. Defaults to None.
+            input (str, optional): The input string. Defaults to None.
+            **kwargs (Any): Additional keyword arguments, such
+                as callbacks and optional reference strings.
+
+        Returns:
+            dict: A dictionary containing the preference, scores, and/or
+                other information.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return await self._aevaluate_string_pairs(
+            prediction=prediction,
+            prediction_b=prediction_b,
+            reference=reference,
+            input=input,
+            **kwargs,
+        )
--- a/tests/unit_tests/evaluation/agents/test_eval_chain.py
+++ b/tests/unit_tests/evaluation/agents/test_eval_chain.py
@@ -1,13 +1,15 @@
 """Test agent trajectory evaluation chain."""

-from typing import List, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import pytest
+from pydantic import Field

+from langchain.callbacks.manager import CallbackManagerForLLMRun
 from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
-from langchain.schema import AgentAction
+from langchain.schema import AgentAction, BaseMessage
 from langchain.tools.base import tool
-from tests.unit_tests.llms.fake_llm import FakeLLM
+from tests.unit_tests.llms.fake_chat_model import FakeChatModel


@pytest.fixture
@@ -30,10 +32,31 @@ def foo(bar: str) -> str:
    return bar


+class _FakeTrajectoryChatModel(FakeChatModel):
+    queries: Dict = Field(default_factory=dict)
+    sequential_responses: Optional[bool] = False
+    response_index: int = 0
+
+    def _call(
+        self,
+        messages: List[BaseMessage],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        if self.sequential_responses:
+            response = self.queries[list(self.queries.keys())[self.response_index]]
+            self.response_index = self.response_index + 1
+            return response
+        else:
+            prompt = messages[0].content
+            return self.queries[prompt]
+
+
 def test_trajectory_eval_chain(
    intermediate_steps: List[Tuple[AgentAction, str]]
 ) -> None:
-    llm = FakeLLM(
+    llm = _FakeTrajectoryChatModel(
        queries={
            "a": "Trajectory good\nScore: 5",
            "b": "Trajectory not good\nScore: 1",
@@ -61,7 +84,7 @@ def test_trajectory_eval_chain(
 def test_trajectory_eval_chain_no_tools(
    intermediate_steps: List[Tuple[AgentAction, str]]
 ) -> None:
-    llm = FakeLLM(
+    llm = _FakeTrajectoryChatModel(
        queries={
            "a": "Trajectory good\nScore: 5",
            "b": "Trajectory not good\nScore: 1",
@@ -85,7 +108,7 @@ def test_trajectory_eval_chain_no_tools(


 def test_old_api_works(intermediate_steps: List[Tuple[AgentAction, str]]) -> None:
-    llm = FakeLLM(
+    llm = _FakeTrajectoryChatModel(
        queries={
            "a": "Trajectory good\nScore: 5",
            "b": "Trajectory not good\nScore: 1",
--- a/tests/unit_tests/evaluation/comparison/test_eval_chain.py
+++ b/tests/unit_tests/evaluation/comparison/test_eval_chain.py
@@ -1,6 +1,8 @@
 """Test the comparison chains."""


+import pytest
+
 from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain
 from tests.unit_tests.llms.fake_llm import FakeLLM

@@ -30,10 +32,30 @@ def test_pairwise_string_comparison_chain() -> None:
    )
    assert res["value"] == "A"
    assert res["score"] == 1
-    res = chain.evaluate_string_pairs(
-        prediction="I like pie.",
-        prediction_b="I hate pie.",
-        input="What is your favorite food?",
-    )
+    with pytest.warns(UserWarning, match=chain._skip_reference_warning):
+        res = chain.evaluate_string_pairs(
+            prediction="I like pie.",
+            prediction_b="I hate pie.",
+            input="What is your favorite food?",
+            reference="I enjoy pie.",
+        )
    assert res["value"] == "B"
    assert res["score"] == 0
+
+
+def test_pairwise_string_comparison_chain_missing_ref() -> None:
+    llm = FakeLLM(
+        queries={
+            "a": "The values are the same.\n[[C]]",
+            "b": "A is clearly better than b.\n[[A]]",
+            "c": "B is clearly better than a.\n[[B]]",
+        },
+        sequential_responses=True,
+    )
+    chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)
+    with pytest.raises(ValueError):
+        chain.evaluate_string_pairs(
+            prediction="I like pie.",
+            prediction_b="I love pie.",
+            input="What is your favorite food?",
+        )
--- a/tests/unit_tests/evaluation/criteria/test_eval_chain.py
+++ b/tests/unit_tests/evaluation/criteria/test_eval_chain.py
@@ -1,6 +1,8 @@
 """Test the criteria eval chain."""


+import pytest
+
 from langchain.evaluation.criteria.eval_chain import (
    _SUPPORTED_CRITERIA,
    CriteriaEvalChain,
@@ -25,11 +27,25 @@ def test_criteria_eval_chain() -> None:
        ),
        criteria={"my criterion": "my criterion description"},
    )
-    result = chain.evaluate_strings(
-        prediction="my prediction", reference="my reference", input="my input"
-    )
+    with pytest.warns(UserWarning, match=chain._skip_reference_warning):
+        result = chain.evaluate_strings(
+            prediction="my prediction", reference="my reference", input="my input"
+        )
    assert result["reasoning"] == "The meaning of life"


+def test_criteria_eval_chain_missing_reference() -> None:
+    chain = CriteriaEvalChain.from_llm(
+        llm=FakeLLM(
+            queries={"text": "The meaning of life\nY"},
+            sequential_responses=True,
+        ),
+        requires_reference=True,
+        criteria={"my criterion": "my criterion description"},
+    )
+    with pytest.raises(ValueError):
+        chain.evaluate_strings(prediction="my prediction", input="my input")
+
+
 def test_implements_string_protocol() -> None:
-    assert isinstance(CriteriaEvalChain, StringEvaluator)
+    assert issubclass(CriteriaEvalChain, StringEvaluator)
--- a/tests/unit_tests/evaluation/qa/test_eval_chain.py
+++ b/tests/unit_tests/evaluation/qa/test_eval_chain.py
@@ -52,7 +52,7 @@ def test_context_eval_chain(chain_cls: Type[ContextQAEvalChain]) -> None:
 def test_implements_string_evaluator_protocol(
    chain_cls: Type[LLMChain],
 ) -> None:
-    assert isinstance(chain_cls, StringEvaluator)
+    assert issubclass(chain_cls, StringEvaluator)


@pytest.mark.parametrize("chain_cls", [QAEvalChain, ContextQAEvalChain, CotQAEvalChain])