mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-02 19:47:13 +00:00
mv eval docs (#8209)
This commit is contained in:
@@ -1,280 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "657d2c8c-54b4-42a3-9f02-bdefa0ed6728",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Custom Pairwise Evaluator\n",
|
||||
"\n",
|
||||
"You can make your own pairwise string evaluators by inheriting from `PairwiseStringEvaluator` class and overwriting the `_evaluate_string_pairs` method (and the `_aevaluate_string_pairs` method if you want to use the evaluator asynchronously).\n",
|
||||
"\n",
|
||||
"In this example, you will make a simple custom evaluator that just returns whether the first prediction has more whitespace tokenized 'words' than the second.\n",
|
||||
"\n",
|
||||
"You can check out the reference docs for the [PairwiseStringEvaluator interface](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.PairwiseStringEvaluator.html#langchain.evaluation.schema.PairwiseStringEvaluator) for more info.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "93f3a653-d198-4291-973c-8d1adba338b2",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Optional, Any\n",
|
||||
"from langchain.evaluation import PairwiseStringEvaluator\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class LengthComparisonPairwiseEvalutor(PairwiseStringEvaluator):\n",
|
||||
" \"\"\"\n",
|
||||
" Custom evaluator to compare two strings.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def _evaluate_string_pairs(\n",
|
||||
" self,\n",
|
||||
" *,\n",
|
||||
" prediction: str,\n",
|
||||
" prediction_b: str,\n",
|
||||
" reference: Optional[str] = None,\n",
|
||||
" input: Optional[str] = None,\n",
|
||||
" **kwargs: Any,\n",
|
||||
" ) -> dict:\n",
|
||||
" score = int(len(prediction.split()) > len(prediction_b.split()))\n",
|
||||
" return {\"score\": score}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "7d4a77c3-07a7-4076-8e7f-f9bca0d6c290",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator = LengthComparisonPairwiseEvalutor()\n",
|
||||
"\n",
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"The quick brown fox jumped over the lazy dog.\",\n",
|
||||
" prediction_b=\"The quick brown fox jumped over the dog.\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d90f128f-6f49-42a1-b05a-3aea568ee03b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## LLM-Based Example\n",
|
||||
"\n",
|
||||
"That example was simple to illustrate the API, but it wasn't very useful in practice. Below, use an LLM with some custom instructions to form a simple preference scorer similar to the built-in [PairwiseStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain). We will use `ChatAnthropic` for the evaluator chain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "b4b43098-4d96-417b-a8a9-b3e75779cfe8",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install anthropic\n",
|
||||
"# %env ANTHROPIC_API_KEY=YOUR_API_KEY"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "b6e978ab-48f1-47ff-9506-e13b1a50be6e",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Optional, Any\n",
|
||||
"from langchain.evaluation import PairwiseStringEvaluator\n",
|
||||
"from langchain.chat_models import ChatAnthropic\n",
|
||||
"from langchain.chains import LLMChain\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class CustomPreferenceEvaluator(PairwiseStringEvaluator):\n",
|
||||
" \"\"\"\n",
|
||||
" Custom evaluator to compare two strings using a custom LLMChain.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self) -> None:\n",
|
||||
" llm = ChatAnthropic(model=\"claude-2\", temperature=0)\n",
|
||||
" self.eval_chain = LLMChain.from_string(\n",
|
||||
" llm,\n",
|
||||
" \"\"\"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
|
||||
"\n",
|
||||
"Input: How do I get the path of the parent directory in python 3.8?\n",
|
||||
"Option A: You can use the following code:\n",
|
||||
"```python\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
|
||||
"```\n",
|
||||
"Option B: You can use the following code:\n",
|
||||
"```python\n",
|
||||
"from pathlib import Path\n",
|
||||
"Path(__file__).absolute().parent\n",
|
||||
"```\n",
|
||||
"Reasoning: Both options return the same result. However, since option B is more concise and easily understand, it is preferred.\n",
|
||||
"Preference: B\n",
|
||||
"\n",
|
||||
"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
|
||||
"Input: {input}\n",
|
||||
"Option A: {prediction}\n",
|
||||
"Option B: {prediction_b}\n",
|
||||
"Reasoning:\"\"\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" @property\n",
|
||||
" def requires_input(self) -> bool:\n",
|
||||
" return True\n",
|
||||
"\n",
|
||||
" @property\n",
|
||||
" def requires_reference(self) -> bool:\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
" def _evaluate_string_pairs(\n",
|
||||
" self,\n",
|
||||
" *,\n",
|
||||
" prediction: str,\n",
|
||||
" prediction_b: str,\n",
|
||||
" reference: Optional[str] = None,\n",
|
||||
" input: Optional[str] = None,\n",
|
||||
" **kwargs: Any,\n",
|
||||
" ) -> dict:\n",
|
||||
" result = self.eval_chain(\n",
|
||||
" {\n",
|
||||
" \"input\": input,\n",
|
||||
" \"prediction\": prediction,\n",
|
||||
" \"prediction_b\": prediction_b,\n",
|
||||
" \"stop\": [\"Which option is preferred?\"],\n",
|
||||
" },\n",
|
||||
" **kwargs,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" response_text = result[\"text\"]\n",
|
||||
" reasoning, preference = response_text.split(\"Preference:\", maxsplit=1)\n",
|
||||
" preference = preference.strip()\n",
|
||||
" score = 1.0 if preference == \"A\" else (0.0 if preference == \"B\" else None)\n",
|
||||
" return {\"reasoning\": reasoning.strip(), \"value\": preference, \"score\": score}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "5cbd8b1d-2cb0-4f05-b435-a1a00074d94a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluator = CustomPreferenceEvaluator()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "2c0a7fb7-b976-4443-9f0e-e707a6dfbdf7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': 'Option B is preferred over option A for importing from a relative directory, because it is more straightforward and concise.\\n\\nOption A uses the importlib module, which allows importing a module by specifying the full name as a string. While this works, it is less clear compared to option B.\\n\\nOption B directly imports from the relative path using dot notation, which clearly shows that it is a relative import. This is the recommended way to do relative imports in Python.\\n\\nIn summary, option B is more accurate and helpful as it uses the standard Python relative import syntax.',\n",
|
||||
" 'value': 'B',\n",
|
||||
" 'score': 0.0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" input=\"How do I import from a relative directory?\",\n",
|
||||
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
|
||||
" prediction_b=\"from .sibling import foo\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "f13a1346-7dbe-451d-b3a3-99e8fc7b753b",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CustomPreferenceEvaluator requires an input string.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Setting requires_input to return True adds additional validation to avoid returning a grade when insufficient data is provided to the chain.\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
|
||||
" prediction_b=\"from .sibling import foo\",\n",
|
||||
" )\n",
|
||||
"except ValueError as e:\n",
|
||||
" print(e)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e7829cc3-ebd1-4628-ae97-15166202e9cc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,232 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# Pairwise Embedding Distance \n",
|
||||
"\n",
|
||||
"One way to measure the similarity (or dissimilarity) between two predictions on a shared or similar input is to embed the predictions and compute a vector distance between the two embeddings.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
|
||||
"\n",
|
||||
"You can load the `pairwise_embedding_distance` evaluator to do this.\n",
|
||||
"\n",
|
||||
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the outputs are, according to their embedded representation.\n",
|
||||
"\n",
|
||||
"Check out the reference docs for the [PairwiseEmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain) for more info."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"pairwise_embedding_distance\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.0966466944859925}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.03761174337464557}"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Select the Distance Metric\n",
|
||||
"\n",
|
||||
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
|
||||
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
|
||||
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
|
||||
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
|
||||
" <EmbeddingDistance.HAMMING: 'hamming'>]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation import EmbeddingDistance\n",
|
||||
"\n",
|
||||
"list(EmbeddingDistance)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluator = load_evaluator(\n",
|
||||
" \"pairwise_embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Select Embeddings to Use\n",
|
||||
"\n",
|
||||
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||||
"\n",
|
||||
"embedding_model = HuggingFaceEmbeddings()\n",
|
||||
"hf_evaluator = load_evaluator(\"pairwise_embedding_distance\", embeddings=embedding_model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.5486443280477362}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"hf_evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.21018880025138598}"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"hf_evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the `PairwiseStringDistanceEvalChain`), though it tends to be less reliable than evaluators that use the LLM directly (such as the `PairwiseStringEvalChain`) </i>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@@ -1,290 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2da95378",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Pairwise String Comparison\n",
|
||||
"\n",
|
||||
"Often you will want to compare predictions of an LLM, Chain, or Agent for a given input. The `StringComparison` evaluators facilitate this so you can answer questions like:\n",
|
||||
"\n",
|
||||
"- Which LLM or prompt produces a preferred output for a given question?\n",
|
||||
"- Which examples should I include for few-shot example selection?\n",
|
||||
"- Which output is better to include for fintetuning?\n",
|
||||
"\n",
|
||||
"The simplest and often most reliable automated way to choose a preferred prediction for a given input is to use the `pairwise_string` evaluator.\n",
|
||||
"\n",
|
||||
"Check out the reference docs for the [PairwiseStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain) for more info."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "f6790c46",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"pairwise_string\", requires_reference=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "49ad9139",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': 'Response A provides an incorrect answer by stating there are three dogs in the park, while the reference answer indicates there are four. Response B, on the other hand, provides the correct answer, matching the reference answer. Although Response B is less detailed, it is accurate and directly answers the question. \\n\\nTherefore, the better response is [[B]].\\n',\n",
|
||||
" 'value': 'B',\n",
|
||||
" 'score': 0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"there are three dogs\",\n",
|
||||
" prediction_b=\"4\",\n",
|
||||
" input=\"how many dogs are in the park?\",\n",
|
||||
" reference=\"four\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed353b93-be71-4479-b9c0-8c97814c2e58",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Without References\n",
|
||||
"\n",
|
||||
"When references aren't available, you can still predict the preferred response.\n",
|
||||
"The results will reflect the evaluation model's preference, which is less reliable and may result\n",
|
||||
"in preferences that are factually incorrect."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "586320da",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"pairwise_string\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "7f56c76e-a39b-4509-8b8a-8a2afe6c3da1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': \"Response A is accurate but lacks depth and detail. It simply states that addition is a mathematical operation without explaining what it does or how it works. \\n\\nResponse B, on the other hand, provides a more detailed explanation. It not only identifies addition as a mathematical operation, but also explains that it involves adding two numbers to create a third number, the 'sum'. This response is more helpful and informative, providing a clearer understanding of what addition is.\\n\\nTherefore, the better response is B.\\n\",\n",
|
||||
" 'value': 'B',\n",
|
||||
" 'score': 0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"Addition is a mathematical operation.\",\n",
|
||||
" prediction_b=\"Addition is a mathematical operation that adds two numbers to create a third number, the 'sum'.\",\n",
|
||||
" input=\"What is addition?\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a25b60b2-627c-408a-be4b-a2e5cbc10726",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Customize the LLM\n",
|
||||
"\n",
|
||||
"By default, the loader uses `gpt-4` in the evaluation chain. You can customize this when loading."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "de84a958-1330-482b-b950-68bcf23f9e35",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatAnthropic\n",
|
||||
"\n",
|
||||
"llm = ChatAnthropic(temperature=0)\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"pairwise_string\", llm=llm, requires_reference=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "e162153f-d50a-4a7c-a033-019dabbc954c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': 'Response A provides a specific number but is inaccurate based on the reference answer. Response B provides the correct number but lacks detail or explanation. Overall, Response B is more helpful and accurate in directly answering the question, despite lacking depth or creativity.\\n\\n[[B]]\\n',\n",
|
||||
" 'value': 'B',\n",
|
||||
" 'score': 0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"there are three dogs\",\n",
|
||||
" prediction_b=\"4\",\n",
|
||||
" input=\"how many dogs are in the park?\",\n",
|
||||
" reference=\"four\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e0e89c13-d0ad-4f87-8fcb-814399bafa2a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Customize the Evaluation Prompt\n",
|
||||
"\n",
|
||||
"You can use your own custom evaluation prompt to add more task-specific instructions or to instruct the evaluator to score the output.\n",
|
||||
"\n",
|
||||
"*Note: If you use a prompt that expects generates a result in a unique format, you may also have to pass in a custom output parser (`output_parser=your_parser()`) instead of the default `PairwiseStringResultOutputParser`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "fb817efa-3a4d-439d-af8c-773b89d97ec9",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"prompt_template = PromptTemplate.from_template(\n",
|
||||
" \"\"\"Given the input context, which is most similar to the reference label: A or B?\n",
|
||||
"Reason step by step and finally, respond with either [[A]] or [[B]] on its own line.\n",
|
||||
"\n",
|
||||
"DATA\n",
|
||||
"----\n",
|
||||
"input: {input}\n",
|
||||
"reference: {reference}\n",
|
||||
"A: {prediction}\n",
|
||||
"B: {prediction_b}\n",
|
||||
"---\n",
|
||||
"Reasoning:\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
")\n",
|
||||
"evaluator = load_evaluator(\n",
|
||||
" \"pairwise_string\", prompt=prompt_template, requires_reference=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "d40aa4f0-cfd5-4cb4-83c8-8d2300a04c2f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"input_variables=['input', 'prediction', 'prediction_b', 'reference'] output_parser=None partial_variables={} template='Given the input context, which is most similar to the reference label: A or B?\\nReason step by step and finally, respond with either [[A]] or [[B]] on its own line.\\n\\nDATA\\n----\\ninput: {input}\\nreference: {reference}\\nA: {prediction}\\nB: {prediction_b}\\n---\\nReasoning:\\n\\n' template_format='f-string' validate_template=True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The prompt was assigned to the evaluator\n",
|
||||
"print(evaluator.prompt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "9467bb42-7a31-4071-8f66-9ed2c6f06dcd",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': \"Option A is most similar to the reference label. Both the reference label and option A state that the dog's name is Fido. Option B, on the other hand, gives a different name for the dog. Therefore, option A is the most similar to the reference label. \\n\",\n",
|
||||
" 'value': 'A',\n",
|
||||
" 'score': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_string_pairs(\n",
|
||||
" prediction=\"The dog that ate the ice cream was named fido.\",\n",
|
||||
" prediction_b=\"The dog's name is spot\",\n",
|
||||
" input=\"What is the name of the dog that ate the ice cream?\",\n",
|
||||
" reference=\"The dog's name is fido\",\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,511 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "984169ca",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Agent VectorDB Question Answering Benchmarking\n",
|
||||
"\n",
|
||||
"Here we go over how to benchmark performance on a question answering task using an agent to route between multiple vectordatabases.\n",
|
||||
"\n",
|
||||
"It is highly recommended that you do any evaluation/benchmarking with tracing enabled. See [here](https://python.langchain.com/guides/tracing/) for an explanation of what tracing is and how to set it up."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8a16b75d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading the data\n",
|
||||
"First, let's load the data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "5b2d5e98",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found cached dataset json (/Users/qt/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-vectordb-qa-sota-pg-d3ae24016b514f92/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)\n",
|
||||
"100%|██████████| 1/1 [00:00<00:00, 414.42it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation.loading import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"agent-vectordb-qa-sota-pg\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "61375342",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the purpose of the NATO Alliance?',\n",
|
||||
" 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
|
||||
" 'steps': [{'tool': 'State of Union QA System', 'tool_input': None},\n",
|
||||
" {'tool': None, 'tool_input': 'What is the purpose of the NATO Alliance?'}]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "02500304",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the purpose of YC?',\n",
|
||||
" 'answer': 'The purpose of YC is to cause startups to be founded that would not otherwise have existed.',\n",
|
||||
" 'steps': [{'tool': 'Paul Graham QA System', 'tool_input': None},\n",
|
||||
" {'tool': None, 'tool_input': 'What is the purpose of YC?'}]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset[-1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4ab6a716",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setting up a chain\n",
|
||||
"Now we need to create some pipelines for doing question answering. Step one in that is creating indexes over the data in question."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c18680b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"\n",
|
||||
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "7f0de2b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.indexes import VectorstoreIndexCreator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "ef84ff99",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using embedded DuckDB without persistence: data will be transient\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vectorstore_sota = (\n",
|
||||
" VectorstoreIndexCreator(vectorstore_kwargs={\"collection_name\": \"sota\"})\n",
|
||||
" .from_loaders([loader])\n",
|
||||
" .vectorstore\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f0b5d8f6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can create a question answering chain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "8843cb0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.llms import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "573719a0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain_sota = RetrievalQA.from_chain_type(\n",
|
||||
" llm=OpenAI(temperature=0),\n",
|
||||
" chain_type=\"stuff\",\n",
|
||||
" retriever=vectorstore_sota.as_retriever(),\n",
|
||||
" input_key=\"question\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e48b03d8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we do the same for the Paul Graham data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "c2dbb014",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = TextLoader(\"../../modules/paul_graham_essay.txt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "98d16f08",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using embedded DuckDB without persistence: data will be transient\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vectorstore_pg = (\n",
|
||||
" VectorstoreIndexCreator(vectorstore_kwargs={\"collection_name\": \"paul_graham\"})\n",
|
||||
" .from_loaders([loader])\n",
|
||||
" .vectorstore\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "ec0aab02",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain_pg = RetrievalQA.from_chain_type(\n",
|
||||
" llm=OpenAI(temperature=0),\n",
|
||||
" chain_type=\"stuff\",\n",
|
||||
" retriever=vectorstore_pg.as_retriever(),\n",
|
||||
" input_key=\"question\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "76b5f8fb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can now set up an agent to route between them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "ade1aafa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.agents import initialize_agent, Tool\n",
|
||||
"from langchain.agents import AgentType\n",
|
||||
"\n",
|
||||
"tools = [\n",
|
||||
" Tool(\n",
|
||||
" name=\"State of Union QA System\",\n",
|
||||
" func=chain_sota.run,\n",
|
||||
" description=\"useful for when you need to answer questions about the most recent state of the union address. Input should be a fully formed question.\",\n",
|
||||
" ),\n",
|
||||
" Tool(\n",
|
||||
" name=\"Paul Graham System\",\n",
|
||||
" func=chain_pg.run,\n",
|
||||
" description=\"useful for when you need to answer questions about Paul Graham. Input should be a fully formed question.\",\n",
|
||||
" ),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "104853f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"agent = initialize_agent(\n",
|
||||
" tools,\n",
|
||||
" OpenAI(temperature=0),\n",
|
||||
" agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n",
|
||||
" max_iterations=4,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7f036641",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make a prediction\n",
|
||||
"\n",
|
||||
"First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"id": "4664e79f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(dataset[0][\"question\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d0c16cd7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make many predictions\n",
|
||||
"Now we can make predictions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"id": "799f6c17",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = []\n",
|
||||
"predicted_dataset = []\n",
|
||||
"error_dataset = []\n",
|
||||
"for data in dataset:\n",
|
||||
" new_data = {\"input\": data[\"question\"], \"answer\": data[\"answer\"]}\n",
|
||||
" try:\n",
|
||||
" predictions.append(agent(new_data))\n",
|
||||
" predicted_dataset.append(new_data)\n",
|
||||
" except Exception:\n",
|
||||
" error_dataset.append(new_data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "49d969fb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate performance\n",
|
||||
"Now we can evaluate the predictions. The first thing we can do is look at them by eye."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "1d583f03",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'input': 'What is the purpose of the NATO Alliance?',\n",
|
||||
" 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
|
||||
" 'output': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4783344b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, we can use a language model to score them programatically"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "d0a9341d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa import QAEvalChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"id": "1612dec1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(temperature=0)\n",
|
||||
"eval_chain = QAEvalChain.from_llm(llm)\n",
|
||||
"graded_outputs = eval_chain.evaluate(\n",
|
||||
" predicted_dataset, predictions, question_key=\"input\", prediction_key=\"output\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "79587806",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can add in the graded output to the `predictions` dict and then get a count of the grades."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "2a689df5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i, prediction in enumerate(predictions):\n",
|
||||
" prediction[\"grade\"] = graded_outputs[i][\"text\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "27b61215",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Counter({' CORRECT': 28, ' INCORRECT': 5})"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"Counter([pred[\"grade\"] for pred in predictions])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12fe30f4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also filter the datapoints to the incorrect examples and look at them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "47c692a1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"id": "0ef976c1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'input': 'What are the four common sense steps that the author suggests to move forward safely?',\n",
|
||||
" 'answer': 'The four common sense steps suggested by the author to move forward safely are: stay protected with vaccines and treatments, prepare for new variants, end the shutdown of schools and businesses, and stay vigilant.',\n",
|
||||
" 'output': 'The four common sense steps suggested in the most recent State of the Union address are: cutting the cost of prescription drugs, providing a pathway to citizenship for Dreamers, revising laws so businesses have the workers they need and families don’t wait decades to reunite, and protecting access to health care and preserving a woman’s right to choose.',\n",
|
||||
" 'grade': ' INCORRECT'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"incorrect[0]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,450 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Comparing Chain Outputs\n",
|
||||
"\n",
|
||||
"Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n",
|
||||
"\n",
|
||||
"One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
|
||||
"\n",
|
||||
"For this evaluation, we will need 3 things:\n",
|
||||
"1. An evaluator\n",
|
||||
"2. A dataset of inputs\n",
|
||||
"3. 2 (or more) LLMs, Chains, or Agents to compare\n",
|
||||
"\n",
|
||||
"Then we will aggregate the restults to determine the preferred model.\n",
|
||||
"\n",
|
||||
"### Step 1. Create the Evaluator\n",
|
||||
"\n",
|
||||
"In this example, you will use gpt-4 to select which output is preferred."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.evaluation.comparison import PairwiseStringEvalChain\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model=\"gpt-4\")\n",
|
||||
"\n",
|
||||
"eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Step 2. Select Dataset\n",
|
||||
"\n",
|
||||
"If you already have real usage data for your LLM, you can use a representative sample. More examples\n",
|
||||
"provide more reliable results. We will use some example queries someone might have about how to use langchain here."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d852a1884480457292c90d8bd9d4f1e6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation.loading import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"langchain-howto-queries\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Step 3. Define Models to Compare\n",
|
||||
"\n",
|
||||
"We will be comparing two agents in this case."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain import SerpAPIWrapper\n",
|
||||
"from langchain.agents import initialize_agent, Tool\n",
|
||||
"from langchain.agents import AgentType\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Initialize the language model\n",
|
||||
"# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\"\n",
|
||||
"llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
|
||||
"\n",
|
||||
"# Initialize the SerpAPIWrapper for search functionality\n",
|
||||
"# Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
|
||||
"search = SerpAPIWrapper()\n",
|
||||
"\n",
|
||||
"# Define a list of tools offered by the agent\n",
|
||||
"tools = [\n",
|
||||
" Tool(\n",
|
||||
" name=\"Search\",\n",
|
||||
" func=search.run,\n",
|
||||
" coroutine=search.arun,\n",
|
||||
" description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\",\n",
|
||||
" ),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"functions_agent = initialize_agent(\n",
|
||||
" tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False\n",
|
||||
")\n",
|
||||
"conversations_agent = initialize_agent(\n",
|
||||
" tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Step 4. Generate Responses\n",
|
||||
"\n",
|
||||
"We will generate outputs for each of the models before evaluating them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "b076d6bf6680422aa9082d4bad4d98a3",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/20 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n",
|
||||
"Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"import asyncio\n",
|
||||
"\n",
|
||||
"results = []\n",
|
||||
"agents = [functions_agent, conversations_agent]\n",
|
||||
"concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
|
||||
"\n",
|
||||
"# We will only run the first 20 examples of this dataset to speed things up\n",
|
||||
"# This will lead to larger confidence intervals downstream.\n",
|
||||
"batch = []\n",
|
||||
"for example in tqdm(dataset[:20]):\n",
|
||||
" batch.extend([agent.acall(example[\"inputs\"]) for agent in agents])\n",
|
||||
" if len(batch) >= concurrency_level:\n",
|
||||
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||
" results.extend(list(zip(*[iter(batch_results)] * 2)))\n",
|
||||
" batch = []\n",
|
||||
"if batch:\n",
|
||||
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||
" results.extend(list(zip(*[iter(batch_results)] * 2)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 5. Evaluate Pairs\n",
|
||||
"\n",
|
||||
"Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n",
|
||||
"\n",
|
||||
"Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def predict_preferences(dataset, results) -> list:\n",
|
||||
" preferences = []\n",
|
||||
"\n",
|
||||
" for example, (res_a, res_b) in zip(dataset, results):\n",
|
||||
" input_ = example[\"inputs\"]\n",
|
||||
" # Flip a coin to reduce persistent position bias\n",
|
||||
" if random.random() < 0.5:\n",
|
||||
" pred_a, pred_b = res_a, res_b\n",
|
||||
" a, b = \"a\", \"b\"\n",
|
||||
" else:\n",
|
||||
" pred_a, pred_b = res_b, res_a\n",
|
||||
" a, b = \"b\", \"a\"\n",
|
||||
" eval_res = eval_chain.evaluate_string_pairs(\n",
|
||||
" prediction=pred_a[\"output\"] if isinstance(pred_a, dict) else str(pred_a),\n",
|
||||
" prediction_b=pred_b[\"output\"] if isinstance(pred_b, dict) else str(pred_b),\n",
|
||||
" input=input_,\n",
|
||||
" )\n",
|
||||
" if eval_res[\"value\"] == \"A\":\n",
|
||||
" preferences.append(a)\n",
|
||||
" elif eval_res[\"value\"] == \"B\":\n",
|
||||
" preferences.append(b)\n",
|
||||
" else:\n",
|
||||
" preferences.append(None) # No preference\n",
|
||||
" return preferences"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"preferences = predict_preferences(dataset, results)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"**Print out the ratio of preferences.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"OpenAI Functions Agent: 90.00%\n",
|
||||
"Structured Chat Agent: 10.00%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"name_map = {\n",
|
||||
" \"a\": \"OpenAI Functions Agent\",\n",
|
||||
" \"b\": \"Structured Chat Agent\",\n",
|
||||
"}\n",
|
||||
"counts = Counter(preferences)\n",
|
||||
"pref_ratios = {k: v / len(preferences) for k, v in counts.items()}\n",
|
||||
"for k, v in pref_ratios.items():\n",
|
||||
" print(f\"{name_map.get(k)}: {v:.2%}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Estimate Confidence Intervals\n",
|
||||
"\n",
|
||||
"The results seem pretty clear, but if you want to have a better sense of how confident we are, that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n",
|
||||
"\n",
|
||||
"Below, use the Wilson score to estimate the confidence interval."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from math import sqrt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def wilson_score_interval(\n",
|
||||
" preferences: list, which: str = \"a\", z: float = 1.96\n",
|
||||
") -> tuple:\n",
|
||||
" \"\"\"Estimate the confidence interval using the Wilson score.\n",
|
||||
"\n",
|
||||
" See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n",
|
||||
" for more details, including when to use it and when it should not be used.\n",
|
||||
" \"\"\"\n",
|
||||
" total_preferences = preferences.count(\"a\") + preferences.count(\"b\")\n",
|
||||
" n_s = preferences.count(which)\n",
|
||||
"\n",
|
||||
" if total_preferences == 0:\n",
|
||||
" return (0, 0)\n",
|
||||
"\n",
|
||||
" p_hat = n_s / total_preferences\n",
|
||||
"\n",
|
||||
" denominator = 1 + (z**2) / total_preferences\n",
|
||||
" adjustment = (z / denominator) * sqrt(\n",
|
||||
" p_hat * (1 - p_hat) / total_preferences\n",
|
||||
" + (z**2) / (4 * total_preferences * total_preferences)\n",
|
||||
" )\n",
|
||||
" center = (p_hat + (z**2) / (2 * total_preferences)) / denominator\n",
|
||||
" lower_bound = min(max(center - adjustment, 0.0), 1.0)\n",
|
||||
" upper_bound = min(max(center + adjustment, 0.0), 1.0)\n",
|
||||
"\n",
|
||||
" return (lower_bound, upper_bound)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The \"OpenAI Functions Agent\" would be preferred between 69.90% and 97.21% percent of the time (with 95% confidence).\n",
|
||||
"The \"Structured Chat Agent\" would be preferred between 2.79% and 30.10% percent of the time (with 95% confidence).\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for which_, name in name_map.items():\n",
|
||||
" low, high = wilson_score_interval(preferences, which=which_)\n",
|
||||
" print(\n",
|
||||
" f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).'\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Print out the p-value.**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The p-value is 0.00040. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||
"then there is a 0.04025% chance of observing the OpenAI Functions Agent be preferred at least 18\n",
|
||||
"times out of 20 trials.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from scipy import stats\n",
|
||||
"\n",
|
||||
"preferred_model = max(pref_ratios, key=pref_ratios.get)\n",
|
||||
"successes = preferences.count(preferred_model)\n",
|
||||
"n = len(preferences) - preferences.count(None)\n",
|
||||
"p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n",
|
||||
"print(\n",
|
||||
" f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||
"then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n",
|
||||
"times out of {n} trials.\"\"\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name=\"cite_note-1\"></a>_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. \n",
|
||||
"LLM preferences exhibit biases, including banal ones like the order of outputs.\n",
|
||||
"In choosing preferences, \"ground truth\" may not be taken into account, which may lead to scores that aren't grounded in utility._"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@@ -1,445 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e78b7bb1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data Augmented Question Answering\n",
|
||||
"\n",
|
||||
"This notebook uses some generic prompts/language models to evaluate an question answering system that uses other sources of data besides what is in the model. For example, this can be used to evaluate a question answering system over your proprietary data.\n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"Let's set up an example with our favorite example - the state of the union address."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "ab4a6931",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain.chains import RetrievalQA"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4fdc211d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Running Chroma using direct local API.\n",
|
||||
"Using DuckDB in-memory for database. Data will be transient.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"\n",
|
||||
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
|
||||
"documents = loader.load()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"texts = text_splitter.split_documents(documents)\n",
|
||||
"\n",
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"docsearch = Chroma.from_documents(texts, embeddings)\n",
|
||||
"qa = RetrievalQA.from_llm(llm=OpenAI(), retriever=docsearch.as_retriever())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "30fd72f2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Examples\n",
|
||||
"Now we need some examples to evaluate. We can do this in two ways:\n",
|
||||
"\n",
|
||||
"1. Hard code some examples ourselves\n",
|
||||
"2. Generate examples automatically, using a language model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "3459b001",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Hard-coded examples\n",
|
||||
"examples = [\n",
|
||||
" {\n",
|
||||
" \"query\": \"What did the president say about Ketanji Brown Jackson\",\n",
|
||||
" \"answer\": \"He praised her legal ability and said he nominated her for the supreme court.\",\n",
|
||||
" },\n",
|
||||
" {\"query\": \"What did the president say about Michael Jackson\", \"answer\": \"Nothing\"},\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "b9c3fa75",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generated examples\n",
|
||||
"from langchain.evaluation.qa import QAGenerateChain\n",
|
||||
"\n",
|
||||
"example_gen_chain = QAGenerateChain.from_llm(OpenAI())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c24543a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"new_examples = example_gen_chain.apply_and_parse([{\"doc\": t} for t in texts[:5]])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "a2d27560",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'query': 'According to the document, what did Vladimir Putin miscalculate?',\n",
|
||||
" 'answer': 'He miscalculated that he could roll into Ukraine and the world would roll over.'},\n",
|
||||
" {'query': 'Who is the Ukrainian Ambassador to the United States?',\n",
|
||||
" 'answer': 'The Ukrainian Ambassador to the United States is here tonight.'},\n",
|
||||
" {'query': 'How many countries were part of the coalition formed to confront Putin?',\n",
|
||||
" 'answer': '27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.'},\n",
|
||||
" {'query': 'What action is the U.S. Department of Justice taking to target Russian oligarchs?',\n",
|
||||
" 'answer': 'The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.'},\n",
|
||||
" {'query': 'How much direct assistance is the United States providing to Ukraine?',\n",
|
||||
" 'answer': 'The United States is providing more than $1 Billion in direct assistance to Ukraine.'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"new_examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "558da6f3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Combine examples\n",
|
||||
"examples += new_examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "443dc34e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate\n",
|
||||
"Now that we have examples, we can use the question answering evaluator to evaluate our question answering chain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "782169a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa import QAEvalChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "1bb77416",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = qa.apply(examples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "bcd0ad7f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(temperature=0)\n",
|
||||
"eval_chain = QAEvalChain.from_llm(llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "2e6af79a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"graded_outputs = eval_chain.evaluate(examples, predictions)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "32fac2dc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Example 0:\n",
|
||||
"Question: What did the president say about Ketanji Brown Jackson\n",
|
||||
"Real Answer: He praised her legal ability and said he nominated her for the supreme court.\n",
|
||||
"Predicted Answer: The president said that she is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by both Democrats and Republicans.\n",
|
||||
"Predicted Grade: CORRECT\n",
|
||||
"\n",
|
||||
"Example 1:\n",
|
||||
"Question: What did the president say about Michael Jackson\n",
|
||||
"Real Answer: Nothing\n",
|
||||
"Predicted Answer: The president did not mention Michael Jackson in this speech.\n",
|
||||
"Predicted Grade: CORRECT\n",
|
||||
"\n",
|
||||
"Example 2:\n",
|
||||
"Question: According to the document, what did Vladimir Putin miscalculate?\n",
|
||||
"Real Answer: He miscalculated that he could roll into Ukraine and the world would roll over.\n",
|
||||
"Predicted Answer: Putin miscalculated that the world would roll over when he rolled into Ukraine.\n",
|
||||
"Predicted Grade: CORRECT\n",
|
||||
"\n",
|
||||
"Example 3:\n",
|
||||
"Question: Who is the Ukrainian Ambassador to the United States?\n",
|
||||
"Real Answer: The Ukrainian Ambassador to the United States is here tonight.\n",
|
||||
"Predicted Answer: I don't know.\n",
|
||||
"Predicted Grade: INCORRECT\n",
|
||||
"\n",
|
||||
"Example 4:\n",
|
||||
"Question: How many countries were part of the coalition formed to confront Putin?\n",
|
||||
"Real Answer: 27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
|
||||
"Predicted Answer: The coalition included freedom-loving nations from Europe and the Americas to Asia and Africa, 27 members of the European Union including France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
|
||||
"Predicted Grade: INCORRECT\n",
|
||||
"\n",
|
||||
"Example 5:\n",
|
||||
"Question: What action is the U.S. Department of Justice taking to target Russian oligarchs?\n",
|
||||
"Real Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.\n",
|
||||
"Predicted Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and to find and seize their yachts, luxury apartments, and private jets.\n",
|
||||
"Predicted Grade: INCORRECT\n",
|
||||
"\n",
|
||||
"Example 6:\n",
|
||||
"Question: How much direct assistance is the United States providing to Ukraine?\n",
|
||||
"Real Answer: The United States is providing more than $1 Billion in direct assistance to Ukraine.\n",
|
||||
"Predicted Answer: The United States is providing more than $1 billion in direct assistance to Ukraine.\n",
|
||||
"Predicted Grade: CORRECT\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i, eg in enumerate(examples):\n",
|
||||
" print(f\"Example {i}:\")\n",
|
||||
" print(\"Question: \" + predictions[i][\"query\"])\n",
|
||||
" print(\"Real Answer: \" + predictions[i][\"answer\"])\n",
|
||||
" print(\"Predicted Answer: \" + predictions[i][\"result\"])\n",
|
||||
" print(\"Predicted Grade: \" + graded_outputs[i][\"text\"])\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "50a9e845",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate with Other Metrics\n",
|
||||
"\n",
|
||||
"In addition to predicting whether the answer is correct or incorrect using a language model, we can also use other metrics to get a more nuanced view on the quality of the answers. To do so, we can use the [Critique](https://docs.inspiredco.ai/critique/) library, which allows for simple calculation of various metrics over generated text.\n",
|
||||
"\n",
|
||||
"First you can get an API key from the [Inspired Cognition Dashboard](https://dashboard.inspiredco.ai) and do some setup:\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"export INSPIREDCO_API_KEY=\"...\"\n",
|
||||
"pip install inspiredco\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "bd0b01dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import inspiredco.critique\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"critique = inspiredco.critique.Critique(api_key=os.environ[\"INSPIREDCO_API_KEY\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4f52629e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Then run the following code to set up the configuration and calculate the [ROUGE](https://docs.inspiredco.ai/critique/metric_rouge.html), [chrf](https://docs.inspiredco.ai/critique/metric_chrf.html), [BERTScore](https://docs.inspiredco.ai/critique/metric_bert_score.html), and [UniEval](https://docs.inspiredco.ai/critique/metric_uni_eval.html) (you can choose [other metrics](https://docs.inspiredco.ai/critique/metrics.html) too):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "84a0ba21",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"metrics = {\n",
|
||||
" \"rouge\": {\n",
|
||||
" \"metric\": \"rouge\",\n",
|
||||
" \"config\": {\"variety\": \"rouge_l\"},\n",
|
||||
" },\n",
|
||||
" \"chrf\": {\n",
|
||||
" \"metric\": \"chrf\",\n",
|
||||
" \"config\": {},\n",
|
||||
" },\n",
|
||||
" \"bert_score\": {\n",
|
||||
" \"metric\": \"bert_score\",\n",
|
||||
" \"config\": {\"model\": \"bert-base-uncased\"},\n",
|
||||
" },\n",
|
||||
" \"uni_eval\": {\n",
|
||||
" \"metric\": \"uni_eval\",\n",
|
||||
" \"config\": {\"task\": \"summarization\", \"evaluation_aspect\": \"relevance\"},\n",
|
||||
" },\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "3b9a4056",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"critique_data = [\n",
|
||||
" {\"target\": pred[\"result\"], \"references\": [pred[\"answer\"]]} for pred in predictions\n",
|
||||
"]\n",
|
||||
"eval_results = {\n",
|
||||
" k: critique.evaluate(dataset=critique_data, metric=v[\"metric\"], config=v[\"config\"])\n",
|
||||
" for k, v in metrics.items()\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6f0ae799",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, we can print out the results. We can see that overall the scores are higher when the output is semantically correct, and also when the output closely matches with the gold-standard answer."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "b51edcf4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Example 0:\n",
|
||||
"Question: What did the president say about Ketanji Brown Jackson\n",
|
||||
"Real Answer: He praised her legal ability and said he nominated her for the supreme court.\n",
|
||||
"Predicted Answer: The president said that she is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by both Democrats and Republicans.\n",
|
||||
"Predicted Scores: rouge=0.0941, chrf=0.2001, bert_score=0.5219, uni_eval=0.9043\n",
|
||||
"\n",
|
||||
"Example 1:\n",
|
||||
"Question: What did the president say about Michael Jackson\n",
|
||||
"Real Answer: Nothing\n",
|
||||
"Predicted Answer: The president did not mention Michael Jackson in this speech.\n",
|
||||
"Predicted Scores: rouge=0.0000, chrf=0.1087, bert_score=0.3486, uni_eval=0.7802\n",
|
||||
"\n",
|
||||
"Example 2:\n",
|
||||
"Question: According to the document, what did Vladimir Putin miscalculate?\n",
|
||||
"Real Answer: He miscalculated that he could roll into Ukraine and the world would roll over.\n",
|
||||
"Predicted Answer: Putin miscalculated that the world would roll over when he rolled into Ukraine.\n",
|
||||
"Predicted Scores: rouge=0.5185, chrf=0.6955, bert_score=0.8421, uni_eval=0.9578\n",
|
||||
"\n",
|
||||
"Example 3:\n",
|
||||
"Question: Who is the Ukrainian Ambassador to the United States?\n",
|
||||
"Real Answer: The Ukrainian Ambassador to the United States is here tonight.\n",
|
||||
"Predicted Answer: I don't know.\n",
|
||||
"Predicted Scores: rouge=0.0000, chrf=0.0375, bert_score=0.3159, uni_eval=0.7493\n",
|
||||
"\n",
|
||||
"Example 4:\n",
|
||||
"Question: How many countries were part of the coalition formed to confront Putin?\n",
|
||||
"Real Answer: 27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
|
||||
"Predicted Answer: The coalition included freedom-loving nations from Europe and the Americas to Asia and Africa, 27 members of the European Union including France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n",
|
||||
"Predicted Scores: rouge=0.7419, chrf=0.8602, bert_score=0.8388, uni_eval=0.0669\n",
|
||||
"\n",
|
||||
"Example 5:\n",
|
||||
"Question: What action is the U.S. Department of Justice taking to target Russian oligarchs?\n",
|
||||
"Real Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.\n",
|
||||
"Predicted Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and to find and seize their yachts, luxury apartments, and private jets.\n",
|
||||
"Predicted Scores: rouge=0.9412, chrf=0.8687, bert_score=0.9607, uni_eval=0.9718\n",
|
||||
"\n",
|
||||
"Example 6:\n",
|
||||
"Question: How much direct assistance is the United States providing to Ukraine?\n",
|
||||
"Real Answer: The United States is providing more than $1 Billion in direct assistance to Ukraine.\n",
|
||||
"Predicted Answer: The United States is providing more than $1 billion in direct assistance to Ukraine.\n",
|
||||
"Predicted Scores: rouge=1.0000, chrf=0.9483, bert_score=1.0000, uni_eval=0.9734\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i, eg in enumerate(examples):\n",
|
||||
" score_string = \", \".join(\n",
|
||||
" [f\"{k}={v['examples'][i]['value']:.4f}\" for k, v in eval_results.items()]\n",
|
||||
" )\n",
|
||||
" print(f\"Example {i}:\")\n",
|
||||
" print(\"Question: \" + predictions[i][\"query\"])\n",
|
||||
" print(\"Real Answer: \" + predictions[i][\"answer\"])\n",
|
||||
" print(\"Predicted Answer: \" + predictions[i][\"result\"])\n",
|
||||
" print(\"Predicted Scores: \" + score_string)\n",
|
||||
" print()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,975 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "692f3256",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Evaluating an OpenAPI Chain\n",
|
||||
"\n",
|
||||
"This notebook goes over ways to semantically evaluate an [OpenAPI Chain](/docs/modules/chains/additional/openapi.html), which calls an endpoint defined by the OpenAPI specification using purely natural language."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a457106d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.tools import OpenAPISpec, APIOperation\n",
|
||||
"from langchain.chains import OpenAPIEndpointChain, LLMChain\n",
|
||||
"from langchain.requests import Requests\n",
|
||||
"from langchain.llms import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2c3b0954",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load the API Chain\n",
|
||||
"\n",
|
||||
"Load a wrapper of the spec (so we can work with it more easily). You can load from a url or from a local file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "794142ba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Attempting to load an OpenAPI 3.0.1 spec. This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load and parse the OpenAPI Spec\n",
|
||||
"spec = OpenAPISpec.from_url(\n",
|
||||
" \"https://www.klarna.com/us/shopping/public/openai/v0/api-docs/\"\n",
|
||||
")\n",
|
||||
"# Load a single endpoint operation\n",
|
||||
"operation = APIOperation.from_openapi_spec(spec, \"/public/openai/v0/products\", \"get\")\n",
|
||||
"verbose = False\n",
|
||||
"# Select any LangChain LLM\n",
|
||||
"llm = OpenAI(temperature=0, max_tokens=1000)\n",
|
||||
"# Create the endpoint chain\n",
|
||||
"api_chain = OpenAPIEndpointChain.from_api_operation(\n",
|
||||
" operation,\n",
|
||||
" llm,\n",
|
||||
" requests=Requests(),\n",
|
||||
" verbose=verbose,\n",
|
||||
" return_intermediate_steps=True, # Return request and response text\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6c05ba5b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### *Optional*: Generate Input Questions and Request Ground Truth Queries\n",
|
||||
"\n",
|
||||
"See [Generating Test Datasets](#Generating-Test-Datasets) at the end of this notebook for more details."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "a0c0cb7e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import re\n",
|
||||
"# from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"# template = \"\"\"Below is a service description:\n",
|
||||
"\n",
|
||||
"# {spec}\n",
|
||||
"\n",
|
||||
"# Imagine you're a new user trying to use {operation} through a search bar. What are 10 different things you want to request?\n",
|
||||
"# Wants/Questions:\n",
|
||||
"# 1. \"\"\"\n",
|
||||
"\n",
|
||||
"# prompt = PromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
"# generation_chain = LLMChain(llm=llm, prompt=prompt)\n",
|
||||
"\n",
|
||||
"# questions_ = generation_chain.run(spec=operation.to_typescript(), operation=operation.operation_id).split('\\n')\n",
|
||||
"# # Strip preceding numeric bullets\n",
|
||||
"# questions = [re.sub(r'^\\d+\\. ', '', q).strip() for q in questions_]\n",
|
||||
"# questions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f3d767ef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ground_truths = [\n",
|
||||
"# {\"q\": ...} # What are the best queries for each input?\n",
|
||||
"# ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "81098a05",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Run the API Chain\n",
|
||||
"\n",
|
||||
"The two simplest questions a user of the API Chain are:\n",
|
||||
"- Did the chain succesfully access the endpoint?\n",
|
||||
"- Did the action accomplish the correct result?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "64bc7ed9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from collections import defaultdict\n",
|
||||
"\n",
|
||||
"# Collect metrics to report at completion\n",
|
||||
"scores = defaultdict(list)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "dfd2d09f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--openapi-chain-klarna-products-get-5d03362007667626/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "10932c9c139941d1a8be1a798f29e923",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation.loading import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"openapi-chain-klarna-products-get\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "e08191a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'question': 'What iPhone models are available?',\n",
|
||||
" 'expected_query': {'max_price': None, 'q': 'iPhone'}},\n",
|
||||
" {'question': 'Are there any budget laptops?',\n",
|
||||
" 'expected_query': {'max_price': 300, 'q': 'laptop'}},\n",
|
||||
" {'question': 'Show me the cheapest gaming PC.',\n",
|
||||
" 'expected_query': {'max_price': 500, 'q': 'gaming pc'}},\n",
|
||||
" {'question': 'Are there any tablets under $400?',\n",
|
||||
" 'expected_query': {'max_price': 400, 'q': 'tablet'}},\n",
|
||||
" {'question': 'What are the best headphones?',\n",
|
||||
" 'expected_query': {'max_price': None, 'q': 'headphones'}},\n",
|
||||
" {'question': 'What are the top rated laptops?',\n",
|
||||
" 'expected_query': {'max_price': None, 'q': 'laptop'}},\n",
|
||||
" {'question': 'I want to buy some shoes. I like Adidas and Nike.',\n",
|
||||
" 'expected_query': {'max_price': None, 'q': 'shoe'}},\n",
|
||||
" {'question': 'I want to buy a new skirt',\n",
|
||||
" 'expected_query': {'max_price': None, 'q': 'skirt'}},\n",
|
||||
" {'question': 'My company is asking me to get a professional Deskopt PC - money is no object.',\n",
|
||||
" 'expected_query': {'max_price': 10000, 'q': 'professional desktop PC'}},\n",
|
||||
" {'question': 'What are the best budget cameras?',\n",
|
||||
" 'expected_query': {'max_price': 300, 'q': 'camera'}}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "7ee71384",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"questions = [d[\"question\"] for d in dataset]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "00511f7a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Run the the API chain itself\n",
|
||||
"raise_error = False # Stop on first failed example - useful for development\n",
|
||||
"chain_outputs = []\n",
|
||||
"failed_examples = []\n",
|
||||
"for question in questions:\n",
|
||||
" try:\n",
|
||||
" chain_outputs.append(api_chain(question))\n",
|
||||
" scores[\"completed\"].append(1.0)\n",
|
||||
" except Exception as e:\n",
|
||||
" if raise_error:\n",
|
||||
" raise e\n",
|
||||
" failed_examples.append({\"q\": question, \"error\": e})\n",
|
||||
" scores[\"completed\"].append(0.0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "f3c9729f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# If the chain failed to run, show the failing examples\n",
|
||||
"failed_examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "914e7587",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['There are currently 10 Apple iPhone models available: Apple iPhone 14 Pro Max 256GB, Apple iPhone 12 128GB, Apple iPhone 13 128GB, Apple iPhone 14 Pro 128GB, Apple iPhone 14 Pro 256GB, Apple iPhone 14 Pro Max 128GB, Apple iPhone 13 Pro Max 128GB, Apple iPhone 14 128GB, Apple iPhone 12 Pro 512GB, and Apple iPhone 12 mini 64GB.',\n",
|
||||
" 'Yes, there are several budget laptops in the API response. For example, the HP 14-dq0055dx and HP 15-dw0083wm are both priced at $199.99 and $244.99 respectively.',\n",
|
||||
" 'The cheapest gaming PC available is the Alarco Gaming PC (X_BLACK_GTX750) for $499.99. You can find more information about it here: https://www.klarna.com/us/shopping/pl/cl223/3203154750/Desktop-Computers/Alarco-Gaming-PC-%28X_BLACK_GTX750%29/?utm_source=openai&ref-site=openai_plugin',\n",
|
||||
" 'Yes, there are several tablets under $400. These include the Apple iPad 10.2\" 32GB (2019), Samsung Galaxy Tab A8 10.5 SM-X200 32GB, Samsung Galaxy Tab A7 Lite 8.7 SM-T220 32GB, Amazon Fire HD 8\" 32GB (10th Generation), and Amazon Fire HD 10 32GB.',\n",
|
||||
" 'It looks like you are looking for the best headphones. Based on the API response, it looks like the Apple AirPods Pro (2nd generation) 2022, Apple AirPods Max, and Bose Noise Cancelling Headphones 700 are the best options.',\n",
|
||||
" 'The top rated laptops based on the API response are the Apple MacBook Pro (2021) M1 Pro 8C CPU 14C GPU 16GB 512GB SSD 14\", Apple MacBook Pro (2022) M2 OC 10C GPU 8GB 256GB SSD 13.3\", Apple MacBook Air (2022) M2 OC 8C GPU 8GB 256GB SSD 13.6\", and Apple MacBook Pro (2023) M2 Pro OC 16C GPU 16GB 512GB SSD 14.2\".',\n",
|
||||
" \"I found several Nike and Adidas shoes in the API response. Here are the links to the products: Nike Dunk Low M - Black/White: https://www.klarna.com/us/shopping/pl/cl337/3200177969/Shoes/Nike-Dunk-Low-M-Black-White/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 4 Retro M - Midnight Navy: https://www.klarna.com/us/shopping/pl/cl337/3202929835/Shoes/Nike-Air-Jordan-4-Retro-M-Midnight-Navy/?utm_source=openai&ref-site=openai_plugin, Nike Air Force 1 '07 M - White: https://www.klarna.com/us/shopping/pl/cl337/3979297/Shoes/Nike-Air-Force-1-07-M-White/?utm_source=openai&ref-site=openai_plugin, Nike Dunk Low W - White/Black: https://www.klarna.com/us/shopping/pl/cl337/3200134705/Shoes/Nike-Dunk-Low-W-White-Black/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 1 Retro High M - White/University Blue/Black: https://www.klarna.com/us/shopping/pl/cl337/3200383658/Shoes/Nike-Air-Jordan-1-Retro-High-M-White-University-Blue-Black/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 1 Retro High OG M - True Blue/Cement Grey/White: https://www.klarna.com/us/shopping/pl/cl337/3204655673/Shoes/Nike-Air-Jordan-1-Retro-High-OG-M-True-Blue-Cement-Grey-White/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 11 Retro Cherry - White/Varsity Red/Black: https://www.klarna.com/us/shopping/pl/cl337/3202929696/Shoes/Nike-Air-Jordan-11-Retro-Cherry-White-Varsity-Red-Black/?utm_source=openai&ref-site=openai_plugin, Nike Dunk High W - White/Black: https://www.klarna.com/us/shopping/pl/cl337/3201956448/Shoes/Nike-Dunk-High-W-White-Black/?utm_source=openai&ref-site=openai_plugin, Nike Air Jordan 5 Retro M - Black/Taxi/Aquatone: https://www.klarna.com/us/shopping/pl/cl337/3204923084/Shoes/Nike-Air-Jordan-5-Retro-M-Black-Taxi-Aquatone/?utm_source=openai&ref-site=openai_plugin, Nike Court Legacy Lift W: https://www.klarna.com/us/shopping/pl/cl337/3202103728/Shoes/Nike-Court-Legacy-Lift-W/?utm_source=openai&ref-site=openai_plugin\",\n",
|
||||
" \"I found several skirts that may interest you. Please take a look at the following products: Avenue Plus Size Denim Stretch Skirt, LoveShackFancy Ruffled Mini Skirt - Antique White, Nike Dri-Fit Club Golf Skirt - Active Pink, Skims Soft Lounge Ruched Long Skirt, French Toast Girl's Front Pleated Skirt with Tabs, Alexia Admor Women's Harmonie Mini Skirt Pink Pink, Vero Moda Long Skirt, Nike Court Dri-FIT Victory Flouncy Tennis Skirt Women - White/Black, Haoyuan Mini Pleated Skirts W, and Zimmermann Lyre Midi Skirt.\",\n",
|
||||
" 'Based on the API response, you may want to consider the Skytech Archangel Gaming Computer PC Desktop, the CyberPowerPC Gamer Master Gaming Desktop, or the ASUS ROG Strix G10DK-RS756, as they all offer powerful processors and plenty of RAM.',\n",
|
||||
" 'Based on the API response, the best budget cameras are the DJI Mini 2 Dog Camera ($448.50), Insta360 Sphere with Landing Pad ($429.99), DJI FPV Gimbal Camera ($121.06), Parrot Camera & Body ($36.19), and DJI FPV Air Unit ($179.00).']"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"answers = [res[\"output\"] for res in chain_outputs]\n",
|
||||
"answers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "484f0587",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate the requests chain\n",
|
||||
"\n",
|
||||
"The API Chain has two main components:\n",
|
||||
"1. Translate the user query to an API request (request synthesizer)\n",
|
||||
"2. Translate the API response to a natural language response\n",
|
||||
"\n",
|
||||
"Here, we construct an evaluation chain to grade the request synthesizer against selected human queries "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "3ea5afd7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"\n",
|
||||
"truth_queries = [json.dumps(data[\"expected_query\"]) for data in dataset]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "e055f24b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Collect the API queries generated by the chain\n",
|
||||
"predicted_queries = [\n",
|
||||
" output[\"intermediate_steps\"][\"request_args\"] for output in chain_outputs\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "7d4f2b88",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"template = \"\"\"You are trying to answer the following question by querying an API:\n",
|
||||
"\n",
|
||||
"> Question: {question}\n",
|
||||
"\n",
|
||||
"The query you know you should be executing against the API is:\n",
|
||||
"\n",
|
||||
"> Query: {truth_query}\n",
|
||||
"\n",
|
||||
"Is the following predicted query semantically the same (eg likely to produce the same answer)?\n",
|
||||
"\n",
|
||||
"> Predicted Query: {predict_query}\n",
|
||||
"\n",
|
||||
"Please give the Predicted Query a grade of either an A, B, C, D, or F, along with an explanation of why. End the evaluation with 'Final Grade: <the letter>'\n",
|
||||
"\n",
|
||||
"> Explanation: Let's think step by step.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
"eval_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "8cc1b1db",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[' The original query is asking for all iPhone models, so the \"q\" parameter is correct. The \"max_price\" parameter is also correct, as it is set to null, meaning that no maximum price is set. The predicted query adds two additional parameters, \"size\" and \"min_price\". The \"size\" parameter is not necessary, as it is not relevant to the question being asked. The \"min_price\" parameter is also not necessary, as it is not relevant to the question being asked and it is set to 0, which is the default value. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
|
||||
" ' The original query is asking for laptops with a maximum price of 300. The predicted query is asking for laptops with a minimum price of 0 and a maximum price of 500. This means that the predicted query is likely to return more results than the original query, as it is asking for a wider range of prices. Therefore, the predicted query is not semantically the same as the original query, and it is not likely to produce the same answer. Final Grade: F',\n",
|
||||
" \" The first two parameters are the same, so that's good. The third parameter is different, but it's not necessary for the query, so that's not a problem. The fourth parameter is the problem. The original query specifies a maximum price of 500, while the predicted query specifies a maximum price of null. This means that the predicted query will not limit the results to the cheapest gaming PCs, so it is not semantically the same as the original query. Final Grade: F\",\n",
|
||||
" ' The original query is asking for tablets under $400, so the first two parameters are correct. The predicted query also includes the parameters \"size\" and \"min_price\", which are not necessary for the original query. The \"size\" parameter is not relevant to the question, and the \"min_price\" parameter is redundant since the original query already specifies a maximum price. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
|
||||
" ' The original query is asking for headphones with no maximum price, so the predicted query is not semantically the same because it has a maximum price of 500. The predicted query also has a size of 10, which is not specified in the original query. Therefore, the predicted query is not semantically the same as the original query. Final Grade: F',\n",
|
||||
" \" The original query is asking for the top rated laptops, so the 'size' parameter should be set to 10 to get the top 10 results. The 'min_price' parameter should be set to 0 to get results from all price ranges. The 'max_price' parameter should be set to null to get results from all price ranges. The 'q' parameter should be set to 'laptop' to get results related to laptops. All of these parameters are present in the predicted query, so it is semantically the same as the original query. Final Grade: A\",\n",
|
||||
" ' The original query is asking for shoes, so the predicted query is asking for the same thing. The original query does not specify a size, so the predicted query is not adding any additional information. The original query does not specify a price range, so the predicted query is adding additional information that is not necessary. Therefore, the predicted query is not semantically the same as the original query and is likely to produce different results. Final Grade: D',\n",
|
||||
" ' The original query is asking for a skirt, so the predicted query is asking for the same thing. The predicted query also adds additional parameters such as size and price range, which could help narrow down the results. However, the size parameter is not necessary for the query to be successful, and the price range is too narrow. Therefore, the predicted query is not as effective as the original query. Final Grade: C',\n",
|
||||
" ' The first part of the query is asking for a Desktop PC, which is the same as the original query. The second part of the query is asking for a size of 10, which is not relevant to the original query. The third part of the query is asking for a minimum price of 0, which is not relevant to the original query. The fourth part of the query is asking for a maximum price of null, which is not relevant to the original query. Therefore, the Predicted Query does not semantically match the original query and is not likely to produce the same answer. Final Grade: F',\n",
|
||||
" ' The original query is asking for cameras with a maximum price of 300. The predicted query is asking for cameras with a maximum price of 500. This means that the predicted query is likely to return more results than the original query, which may include cameras that are not within the budget range. Therefore, the predicted query is not semantically the same as the original query and does not answer the original question. Final Grade: F']"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"request_eval_results = []\n",
|
||||
"for question, predict_query, truth_query in list(\n",
|
||||
" zip(questions, predicted_queries, truth_queries)\n",
|
||||
"):\n",
|
||||
" eval_output = eval_chain.run(\n",
|
||||
" question=question,\n",
|
||||
" truth_query=truth_query,\n",
|
||||
" predict_query=predict_query,\n",
|
||||
" )\n",
|
||||
" request_eval_results.append(eval_output)\n",
|
||||
"request_eval_results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "0d76f8ba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from typing import List\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Parse the evaluation chain responses into a rubric\n",
|
||||
"def parse_eval_results(results: List[str]) -> List[float]:\n",
|
||||
" rubric = {\"A\": 1.0, \"B\": 0.75, \"C\": 0.5, \"D\": 0.25, \"F\": 0}\n",
|
||||
" return [rubric[re.search(r\"Final Grade: (\\w+)\", res).group(1)] for res in results]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"parsed_results = parse_eval_results(request_eval_results)\n",
|
||||
"# Collect the scores for a final evaluation table\n",
|
||||
"scores[\"request_synthesizer\"].extend(parsed_results)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6f3ee8ea",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate the Response Chain\n",
|
||||
"\n",
|
||||
"The second component translated the structured API response to a natural language response.\n",
|
||||
"Evaluate this against the user's original question."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "8b97847c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"template = \"\"\"You are trying to answer the following question by querying an API:\n",
|
||||
"\n",
|
||||
"> Question: {question}\n",
|
||||
"\n",
|
||||
"The API returned a response of:\n",
|
||||
"\n",
|
||||
"> API result: {api_response}\n",
|
||||
"\n",
|
||||
"Your response to the user: {answer}\n",
|
||||
"\n",
|
||||
"Please evaluate the accuracy and utility of your response to the user's original question, conditioned on the information available.\n",
|
||||
"Give a letter grade of either an A, B, C, D, or F, along with an explanation of why. End the evaluation with 'Final Grade: <the letter>'\n",
|
||||
"\n",
|
||||
"> Explanation: Let's think step by step.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
"eval_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "642852ce",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Extract the API responses from the chain\n",
|
||||
"api_responses = [\n",
|
||||
" output[\"intermediate_steps\"][\"response_text\"] for output in chain_outputs\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "08a5eb4f",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[' The original query is asking for all iPhone models, so the \"q\" parameter is correct. The \"max_price\" parameter is also correct, as it is set to null, meaning that no maximum price is set. The predicted query adds two additional parameters, \"size\" and \"min_price\". The \"size\" parameter is not necessary, as it is not relevant to the question being asked. The \"min_price\" parameter is also not necessary, as it is not relevant to the question being asked and it is set to 0, which is the default value. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
|
||||
" ' The original query is asking for laptops with a maximum price of 300. The predicted query is asking for laptops with a minimum price of 0 and a maximum price of 500. This means that the predicted query is likely to return more results than the original query, as it is asking for a wider range of prices. Therefore, the predicted query is not semantically the same as the original query, and it is not likely to produce the same answer. Final Grade: F',\n",
|
||||
" \" The first two parameters are the same, so that's good. The third parameter is different, but it's not necessary for the query, so that's not a problem. The fourth parameter is the problem. The original query specifies a maximum price of 500, while the predicted query specifies a maximum price of null. This means that the predicted query will not limit the results to the cheapest gaming PCs, so it is not semantically the same as the original query. Final Grade: F\",\n",
|
||||
" ' The original query is asking for tablets under $400, so the first two parameters are correct. The predicted query also includes the parameters \"size\" and \"min_price\", which are not necessary for the original query. The \"size\" parameter is not relevant to the question, and the \"min_price\" parameter is redundant since the original query already specifies a maximum price. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n",
|
||||
" ' The original query is asking for headphones with no maximum price, so the predicted query is not semantically the same because it has a maximum price of 500. The predicted query also has a size of 10, which is not specified in the original query. Therefore, the predicted query is not semantically the same as the original query. Final Grade: F',\n",
|
||||
" \" The original query is asking for the top rated laptops, so the 'size' parameter should be set to 10 to get the top 10 results. The 'min_price' parameter should be set to 0 to get results from all price ranges. The 'max_price' parameter should be set to null to get results from all price ranges. The 'q' parameter should be set to 'laptop' to get results related to laptops. All of these parameters are present in the predicted query, so it is semantically the same as the original query. Final Grade: A\",\n",
|
||||
" ' The original query is asking for shoes, so the predicted query is asking for the same thing. The original query does not specify a size, so the predicted query is not adding any additional information. The original query does not specify a price range, so the predicted query is adding additional information that is not necessary. Therefore, the predicted query is not semantically the same as the original query and is likely to produce different results. Final Grade: D',\n",
|
||||
" ' The original query is asking for a skirt, so the predicted query is asking for the same thing. The predicted query also adds additional parameters such as size and price range, which could help narrow down the results. However, the size parameter is not necessary for the query to be successful, and the price range is too narrow. Therefore, the predicted query is not as effective as the original query. Final Grade: C',\n",
|
||||
" ' The first part of the query is asking for a Desktop PC, which is the same as the original query. The second part of the query is asking for a size of 10, which is not relevant to the original query. The third part of the query is asking for a minimum price of 0, which is not relevant to the original query. The fourth part of the query is asking for a maximum price of null, which is not relevant to the original query. Therefore, the Predicted Query does not semantically match the original query and is not likely to produce the same answer. Final Grade: F',\n",
|
||||
" ' The original query is asking for cameras with a maximum price of 300. The predicted query is asking for cameras with a maximum price of 500. This means that the predicted query is likely to return more results than the original query, which may include cameras that are not within the budget range. Therefore, the predicted query is not semantically the same as the original query and does not answer the original question. Final Grade: F',\n",
|
||||
" ' The user asked a question about what iPhone models are available, and the API returned a response with 10 different models. The response provided by the user accurately listed all 10 models, so the accuracy of the response is A+. The utility of the response is also A+ since the user was able to get the exact information they were looking for. Final Grade: A+',\n",
|
||||
" \" The API response provided a list of laptops with their prices and attributes. The user asked if there were any budget laptops, and the response provided a list of laptops that are all priced under $500. Therefore, the response was accurate and useful in answering the user's question. Final Grade: A\",\n",
|
||||
" \" The API response provided the name, price, and URL of the product, which is exactly what the user asked for. The response also provided additional information about the product's attributes, which is useful for the user to make an informed decision. Therefore, the response is accurate and useful. Final Grade: A\",\n",
|
||||
" \" The API response provided a list of tablets that are under $400. The response accurately answered the user's question. Additionally, the response provided useful information such as the product name, price, and attributes. Therefore, the response was accurate and useful. Final Grade: A\",\n",
|
||||
" \" The API response provided a list of headphones with their respective prices and attributes. The user asked for the best headphones, so the response should include the best headphones based on the criteria provided. The response provided a list of headphones that are all from the same brand (Apple) and all have the same type of headphone (True Wireless, In-Ear). This does not provide the user with enough information to make an informed decision about which headphones are the best. Therefore, the response does not accurately answer the user's question. Final Grade: F\",\n",
|
||||
" ' The API response provided a list of laptops with their attributes, which is exactly what the user asked for. The response provided a comprehensive list of the top rated laptops, which is what the user was looking for. The response was accurate and useful, providing the user with the information they needed. Final Grade: A',\n",
|
||||
" ' The API response provided a list of shoes from both Adidas and Nike, which is exactly what the user asked for. The response also included the product name, price, and attributes for each shoe, which is useful information for the user to make an informed decision. The response also included links to the products, which is helpful for the user to purchase the shoes. Therefore, the response was accurate and useful. Final Grade: A',\n",
|
||||
" \" The API response provided a list of skirts that could potentially meet the user's needs. The response also included the name, price, and attributes of each skirt. This is a great start, as it provides the user with a variety of options to choose from. However, the response does not provide any images of the skirts, which would have been helpful for the user to make a decision. Additionally, the response does not provide any information about the availability of the skirts, which could be important for the user. \\n\\nFinal Grade: B\",\n",
|
||||
" ' The user asked for a professional desktop PC with no budget constraints. The API response provided a list of products that fit the criteria, including the Skytech Archangel Gaming Computer PC Desktop, the CyberPowerPC Gamer Master Gaming Desktop, and the ASUS ROG Strix G10DK-RS756. The response accurately suggested these three products as they all offer powerful processors and plenty of RAM. Therefore, the response is accurate and useful. Final Grade: A',\n",
|
||||
" \" The API response provided a list of cameras with their prices, which is exactly what the user asked for. The response also included additional information such as features and memory cards, which is not necessary for the user's question but could be useful for further research. The response was accurate and provided the user with the information they needed. Final Grade: A\"]"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Run the grader chain\n",
|
||||
"response_eval_results = []\n",
|
||||
"for question, api_response, answer in list(zip(questions, api_responses, answers)):\n",
|
||||
" request_eval_results.append(\n",
|
||||
" eval_chain.run(question=question, api_response=api_response, answer=answer)\n",
|
||||
" )\n",
|
||||
"request_eval_results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "a144aa9d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Reusing the rubric from above, parse the evaluation chain responses\n",
|
||||
"parsed_response_results = parse_eval_results(request_eval_results)\n",
|
||||
"# Collect the scores for a final evaluation table\n",
|
||||
"scores[\"result_synthesizer\"].extend(parsed_response_results)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "e95042bc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Metric \tMin \tMean \tMax \n",
|
||||
"completed \t1.00 \t1.00 \t1.00 \n",
|
||||
"request_synthesizer \t0.00 \t0.23 \t1.00 \n",
|
||||
"result_synthesizer \t0.00 \t0.55 \t1.00 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Print out Score statistics for the evaluation session\n",
|
||||
"header = \"{:<20}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Min\", \"Mean\", \"Max\")\n",
|
||||
"print(header)\n",
|
||||
"for metric, metric_scores in scores.items():\n",
|
||||
" mean_scores = (\n",
|
||||
" sum(metric_scores) / len(metric_scores)\n",
|
||||
" if len(metric_scores) > 0\n",
|
||||
" else float(\"nan\")\n",
|
||||
" )\n",
|
||||
" row = \"{:<20}\\t{:<10.2f}\\t{:<10.2f}\\t{:<10.2f}\".format(\n",
|
||||
" metric, min(metric_scores), mean_scores, max(metric_scores)\n",
|
||||
" )\n",
|
||||
" print(row)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "03fe96af",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Re-show the examples for which the chain failed to complete\n",
|
||||
"failed_examples"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2bb3636d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Generating Test Datasets\n",
|
||||
"\n",
|
||||
"To evaluate a chain against your own endpoint, you'll want to generate a test dataset that's conforms to the API.\n",
|
||||
"\n",
|
||||
"This section provides an overview of how to bootstrap the process.\n",
|
||||
"\n",
|
||||
"First, we'll parse the OpenAPI Spec. For this example, we'll [Speak](https://www.speak.com/)'s OpenAPI specification."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "a453eb93",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Attempting to load an OpenAPI 3.0.1 spec. This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n",
|
||||
"Attempting to load an OpenAPI 3.0.1 spec. This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load and parse the OpenAPI Spec\n",
|
||||
"spec = OpenAPISpec.from_url(\"https://api.speak.com/openapi.yaml\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "bb65ffe8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['/v1/public/openai/explain-phrase',\n",
|
||||
" '/v1/public/openai/explain-task',\n",
|
||||
" '/v1/public/openai/translate']"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# List the paths in the OpenAPI Spec\n",
|
||||
"paths = sorted(spec.paths.keys())\n",
|
||||
"paths"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "0988f01b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['post']"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# See which HTTP Methods are available for a given path\n",
|
||||
"methods = spec.get_methods_for_path(\"/v1/public/openai/explain-task\")\n",
|
||||
"methods"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "e9ef0a77",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"type explainTask = (_: {\n",
|
||||
"/* Description of the task that the user wants to accomplish or do. For example, \"tell the waiter they messed up my order\" or \"compliment someone on their shirt\" */\n",
|
||||
" task_description?: string,\n",
|
||||
"/* The foreign language that the user is learning and asking about. The value can be inferred from question - for example, if the user asks \"how do i ask a girl out in mexico city\", the value should be \"Spanish\" because of Mexico City. Always use the full name of the language (e.g. Spanish, French). */\n",
|
||||
" learning_language?: string,\n",
|
||||
"/* The user's native language. Infer this value from the language the user asked their question in. Always use the full name of the language (e.g. Spanish, French). */\n",
|
||||
" native_language?: string,\n",
|
||||
"/* A description of any additional context in the user's question that could affect the explanation - e.g. setting, scenario, situation, tone, speaking style and formality, usage notes, or any other qualifiers. */\n",
|
||||
" additional_context?: string,\n",
|
||||
"/* Full text of the user's question. */\n",
|
||||
" full_query?: string,\n",
|
||||
"}) => any;\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Load a single endpoint operation\n",
|
||||
"operation = APIOperation.from_openapi_spec(\n",
|
||||
" spec, \"/v1/public/openai/explain-task\", \"post\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# The operation can be serialized as typescript\n",
|
||||
"print(operation.to_typescript())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "f1186b6d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Compress the service definition to avoid leaking too much input structure to the sample data\n",
|
||||
"template = \"\"\"In 20 words or less, what does this service accomplish?\n",
|
||||
"{spec}\n",
|
||||
"\n",
|
||||
"Function: It's designed to \"\"\"\n",
|
||||
"prompt = PromptTemplate.from_template(template)\n",
|
||||
"generation_chain = LLMChain(llm=llm, prompt=prompt)\n",
|
||||
"purpose = generation_chain.run(spec=operation.to_typescript())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "a594406a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[\"Can you explain how to say 'hello' in Spanish?\",\n",
|
||||
" \"I need help understanding the French word for 'goodbye'.\",\n",
|
||||
" \"Can you tell me how to say 'thank you' in German?\",\n",
|
||||
" \"I'm trying to learn the Italian word for 'please'.\",\n",
|
||||
" \"Can you help me with the pronunciation of 'yes' in Portuguese?\",\n",
|
||||
" \"I'm looking for the Dutch word for 'no'.\",\n",
|
||||
" \"Can you explain the meaning of 'hello' in Japanese?\",\n",
|
||||
" \"I need help understanding the Russian word for 'thank you'.\",\n",
|
||||
" \"Can you tell me how to say 'goodbye' in Chinese?\",\n",
|
||||
" \"I'm trying to learn the Arabic word for 'please'.\"]"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"template = \"\"\"Write a list of {num_to_generate} unique messages users might send to a service designed to{purpose} They must each be completely unique.\n",
|
||||
"\n",
|
||||
"1.\"\"\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def parse_list(text: str) -> List[str]:\n",
|
||||
" # Match lines starting with a number then period\n",
|
||||
" # Strip leading and trailing whitespace\n",
|
||||
" matches = re.findall(r\"^\\d+\\. \", text)\n",
|
||||
" return [re.sub(r\"^\\d+\\. \", \"\", q).strip().strip('\"') for q in text.split(\"\\n\")]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"num_to_generate = 10 # How many examples to use for this test set.\n",
|
||||
"prompt = PromptTemplate.from_template(template)\n",
|
||||
"generation_chain = LLMChain(llm=llm, prompt=prompt)\n",
|
||||
"text = generation_chain.run(purpose=purpose, num_to_generate=num_to_generate)\n",
|
||||
"# Strip preceding numeric bullets\n",
|
||||
"queries = parse_list(text)\n",
|
||||
"queries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "8dc60f43",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['{\"task_description\": \"say \\'hello\\'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say \\'hello\\' in Spanish?\"}',\n",
|
||||
" '{\"task_description\": \"understanding the French word for \\'goodbye\\'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for \\'goodbye\\'.\"}',\n",
|
||||
" '{\"task_description\": \"say \\'thank you\\'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'thank you\\' in German?\"}',\n",
|
||||
" '{\"task_description\": \"Learn the Italian word for \\'please\\'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Italian word for \\'please\\'.\"}',\n",
|
||||
" '{\"task_description\": \"Help with pronunciation of \\'yes\\' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of \\'yes\\' in Portuguese?\"}',\n",
|
||||
" '{\"task_description\": \"Find the Dutch word for \\'no\\'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I\\'m looking for the Dutch word for \\'no\\'.\"}',\n",
|
||||
" '{\"task_description\": \"Explain the meaning of \\'hello\\' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of \\'hello\\' in Japanese?\"}',\n",
|
||||
" '{\"task_description\": \"understanding the Russian word for \\'thank you\\'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for \\'thank you\\'.\"}',\n",
|
||||
" '{\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'goodbye\\' in Chinese?\"}',\n",
|
||||
" '{\"task_description\": \"Learn the Arabic word for \\'please\\'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Arabic word for \\'please\\'.\"}']"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Define the generation chain to get hypotheses\n",
|
||||
"api_chain = OpenAPIEndpointChain.from_api_operation(\n",
|
||||
" operation,\n",
|
||||
" llm,\n",
|
||||
" requests=Requests(),\n",
|
||||
" verbose=verbose,\n",
|
||||
" return_intermediate_steps=True, # Return request and response text\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"predicted_outputs = [api_chain(query) for query in queries]\n",
|
||||
"request_args = [\n",
|
||||
" output[\"intermediate_steps\"][\"request_args\"] for output in predicted_outputs\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Show the generated request\n",
|
||||
"request_args"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "b727e28e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## AI Assisted Correction\n",
|
||||
"correction_template = \"\"\"Correct the following API request based on the user's feedback. If the user indicates no changes are needed, output the original without making any changes.\n",
|
||||
"\n",
|
||||
"REQUEST: {request}\n",
|
||||
"\n",
|
||||
"User Feedback / requested changes: {user_feedback}\n",
|
||||
"\n",
|
||||
"Finalized Request: \"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate.from_template(correction_template)\n",
|
||||
"correction_chain = LLMChain(llm=llm, prompt=prompt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"id": "c1f4d71f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Query: Can you explain how to say 'hello' in Spanish?\n",
|
||||
"Request: {\"task_description\": \"say 'hello'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say 'hello' in Spanish?\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: I need help understanding the French word for 'goodbye'.\n",
|
||||
"Request: {\"task_description\": \"understanding the French word for 'goodbye'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for 'goodbye'.\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: Can you tell me how to say 'thank you' in German?\n",
|
||||
"Request: {\"task_description\": \"say 'thank you'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say 'thank you' in German?\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: I'm trying to learn the Italian word for 'please'.\n",
|
||||
"Request: {\"task_description\": \"Learn the Italian word for 'please'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I'm trying to learn the Italian word for 'please'.\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: Can you help me with the pronunciation of 'yes' in Portuguese?\n",
|
||||
"Request: {\"task_description\": \"Help with pronunciation of 'yes' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of 'yes' in Portuguese?\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: I'm looking for the Dutch word for 'no'.\n",
|
||||
"Request: {\"task_description\": \"Find the Dutch word for 'no'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I'm looking for the Dutch word for 'no'.\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: Can you explain the meaning of 'hello' in Japanese?\n",
|
||||
"Request: {\"task_description\": \"Explain the meaning of 'hello' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of 'hello' in Japanese?\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: I need help understanding the Russian word for 'thank you'.\n",
|
||||
"Request: {\"task_description\": \"understanding the Russian word for 'thank you'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for 'thank you'.\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: Can you tell me how to say 'goodbye' in Chinese?\n",
|
||||
"Request: {\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say 'goodbye' in Chinese?\"}\n",
|
||||
"Requested changes: \n",
|
||||
"Query: I'm trying to learn the Arabic word for 'please'.\n",
|
||||
"Request: {\"task_description\": \"Learn the Arabic word for 'please'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I'm trying to learn the Arabic word for 'please'.\"}\n",
|
||||
"Requested changes: \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ground_truth = []\n",
|
||||
"for query, request_arg in list(zip(queries, request_args)):\n",
|
||||
" feedback = input(f\"Query: {query}\\nRequest: {request_arg}\\nRequested changes: \")\n",
|
||||
" if feedback == \"n\" or feedback == \"none\" or not feedback:\n",
|
||||
" ground_truth.append(request_arg)\n",
|
||||
" continue\n",
|
||||
" resolved = correction_chain.run(request=request_arg, user_feedback=feedback)\n",
|
||||
" ground_truth.append(resolved.strip())\n",
|
||||
" print(\"Updated request:\", resolved)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "19d68882",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Now you can use the `ground_truth` as shown above in [Evaluate the Requests Chain](#Evaluate-the-requests-chain)!**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "5a596176",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['{\"task_description\": \"say \\'hello\\'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say \\'hello\\' in Spanish?\"}',\n",
|
||||
" '{\"task_description\": \"understanding the French word for \\'goodbye\\'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for \\'goodbye\\'.\"}',\n",
|
||||
" '{\"task_description\": \"say \\'thank you\\'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'thank you\\' in German?\"}',\n",
|
||||
" '{\"task_description\": \"Learn the Italian word for \\'please\\'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Italian word for \\'please\\'.\"}',\n",
|
||||
" '{\"task_description\": \"Help with pronunciation of \\'yes\\' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of \\'yes\\' in Portuguese?\"}',\n",
|
||||
" '{\"task_description\": \"Find the Dutch word for \\'no\\'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I\\'m looking for the Dutch word for \\'no\\'.\"}',\n",
|
||||
" '{\"task_description\": \"Explain the meaning of \\'hello\\' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of \\'hello\\' in Japanese?\"}',\n",
|
||||
" '{\"task_description\": \"understanding the Russian word for \\'thank you\\'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for \\'thank you\\'.\"}',\n",
|
||||
" '{\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'goodbye\\' in Chinese?\"}',\n",
|
||||
" '{\"task_description\": \"Learn the Arabic word for \\'please\\'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Arabic word for \\'please\\'.\"}']"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Now you have a new ground truth set to use as shown above!\n",
|
||||
"ground_truth"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b7fe9dfa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,372 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "984169ca",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Question Answering Benchmarking: Paul Graham Essay\n",
|
||||
"\n",
|
||||
"Here we go over how to benchmark performance on a question answering task over a Paul Graham essay.\n",
|
||||
"\n",
|
||||
"It is highly recommended that you do any evaluation/benchmarking with tracing enabled. See [here](https://python.langchain.com/docs/modules/callbacks/how_to/tracing) for an explanation of what tracing is and how to set it up."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8a16b75d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading the data\n",
|
||||
"First, let's load the data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "5b2d5e98",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--question-answering-paul-graham-76e8f711e038d742/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "9264acfe710b4faabf060f0fcf4f7308",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation.loading import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"question-answering-paul-graham\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4ab6a716",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setting up a chain\n",
|
||||
"Now we need to create some pipelines for doing question answering. Step one in that is creating an index over the data in question."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "c18680b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"\n",
|
||||
"loader = TextLoader(\"../../modules/paul_graham_essay.txt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "7f0de2b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.indexes import VectorstoreIndexCreator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "ef84ff99",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Running Chroma using direct local API.\n",
|
||||
"Using DuckDB in-memory for database. Data will be transient.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vectorstore = VectorstoreIndexCreator().from_loaders([loader]).vectorstore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f0b5d8f6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can create a question answering chain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "8843cb0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.llms import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "573719a0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain = RetrievalQA.from_chain_type(\n",
|
||||
" llm=OpenAI(),\n",
|
||||
" chain_type=\"stuff\",\n",
|
||||
" retriever=vectorstore.as_retriever(),\n",
|
||||
" input_key=\"question\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "53b5aa23",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make a prediction\n",
|
||||
"\n",
|
||||
"First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "3f81d951",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What were the two main things the author worked on before college?',\n",
|
||||
" 'answer': 'The two main things the author worked on before college were writing and programming.',\n",
|
||||
" 'result': ' Writing and programming.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain(dataset[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d0c16cd7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make many predictions\n",
|
||||
"Now we can make predictions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "24b4c66e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = chain.apply(dataset)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "49d969fb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate performance\n",
|
||||
"Now we can evaluate the predictions. The first thing we can do is look at them by eye."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "1d583f03",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What were the two main things the author worked on before college?',\n",
|
||||
" 'answer': 'The two main things the author worked on before college were writing and programming.',\n",
|
||||
" 'result': ' Writing and programming.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4783344b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, we can use a language model to score them programatically"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "d0a9341d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa import QAEvalChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "1612dec1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(temperature=0)\n",
|
||||
"eval_chain = QAEvalChain.from_llm(llm)\n",
|
||||
"graded_outputs = eval_chain.evaluate(\n",
|
||||
" dataset, predictions, question_key=\"question\", prediction_key=\"result\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "79587806",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can add in the graded output to the `predictions` dict and then get a count of the grades."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "2a689df5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i, prediction in enumerate(predictions):\n",
|
||||
" prediction[\"grade\"] = graded_outputs[i][\"text\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "27b61215",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Counter({' CORRECT': 12, ' INCORRECT': 10})"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"Counter([pred[\"grade\"] for pred in predictions])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12fe30f4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also filter the datapoints to the incorrect examples and look at them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "47c692a1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "0ef976c1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What did the author write their dissertation on?',\n",
|
||||
" 'answer': 'The author wrote their dissertation on applications of continuations.',\n",
|
||||
" 'result': ' The author does not mention what their dissertation was on, so it is not known.',\n",
|
||||
" 'grade': ' INCORRECT'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"incorrect[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7710401a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,385 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "984169ca",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Question Answering Benchmarking: State of the Union Address\n",
|
||||
"\n",
|
||||
"Here we go over how to benchmark performance on a question answering task over a state of the union address.\n",
|
||||
"\n",
|
||||
"It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "f127fb04",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Comment this out if you are NOT using tracing\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8a16b75d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading the data\n",
|
||||
"First, let's load the data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "5b2d5e98",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--question-answering-state-of-the-union-a7e5a3b2db4f440d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation.loading import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"question-answering-state-of-the-union\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4ab6a716",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setting up a chain\n",
|
||||
"Now we need to create some pipelines for doing question answering. Step one in that is creating an index over the data in question."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "c18680b5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"\n",
|
||||
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "7f0de2b3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.indexes import VectorstoreIndexCreator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "ef84ff99",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Running Chroma using direct local API.\n",
|
||||
"Using DuckDB in-memory for database. Data will be transient.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vectorstore = VectorstoreIndexCreator().from_loaders([loader]).vectorstore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f0b5d8f6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can create a question answering chain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "8843cb0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.llms import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "573719a0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain = RetrievalQA.from_chain_type(\n",
|
||||
" llm=OpenAI(),\n",
|
||||
" chain_type=\"stuff\",\n",
|
||||
" retriever=vectorstore.as_retriever(),\n",
|
||||
" input_key=\"question\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "37d669e9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make a prediction\n",
|
||||
"\n",
|
||||
"First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "3089e409",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the purpose of the NATO Alliance?',\n",
|
||||
" 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
|
||||
" 'result': ' The NATO Alliance was created to secure peace and stability in Europe after World War 2.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain(dataset[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d0c16cd7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make many predictions\n",
|
||||
"Now we can make predictions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "24b4c66e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = chain.apply(dataset)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "49d969fb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate performance\n",
|
||||
"Now we can evaluate the predictions. The first thing we can do is look at them by eye."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "1d583f03",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the purpose of the NATO Alliance?',\n",
|
||||
" 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n",
|
||||
" 'result': ' The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4783344b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, we can use a language model to score them programatically"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "d0a9341d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa import QAEvalChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "1612dec1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(temperature=0)\n",
|
||||
"eval_chain = QAEvalChain.from_llm(llm)\n",
|
||||
"graded_outputs = eval_chain.evaluate(\n",
|
||||
" dataset, predictions, question_key=\"question\", prediction_key=\"result\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "79587806",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can add in the graded output to the `predictions` dict and then get a count of the grades."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "2a689df5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i, prediction in enumerate(predictions):\n",
|
||||
" prediction[\"grade\"] = graded_outputs[i][\"text\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "27b61215",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Counter({' CORRECT': 7, ' INCORRECT': 4})"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"Counter([pred[\"grade\"] for pred in predictions])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12fe30f4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also filter the datapoints to the incorrect examples and look at them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "47c692a1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "0ef976c1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the U.S. Department of Justice doing to combat the crimes of Russian oligarchs?',\n",
|
||||
" 'answer': 'The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs.',\n",
|
||||
" 'result': ' The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and is naming a chief prosecutor for pandemic fraud.',\n",
|
||||
" 'grade': ' INCORRECT'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"incorrect[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7710401a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,118 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ee2a3a21",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# QA Generation\n",
|
||||
"This notebook shows how to use the `QAGenerationChain` to come up with question-answer pairs over a specific document.\n",
|
||||
"This is important because often times you may not have data to evaluate your question-answer system over, so this is a cheap and lightweight way to generate it!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "33d3f0b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2029a29c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "87edb84c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc = loader.load()[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "04125b6d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.chains import QAGenerationChain\n",
|
||||
"\n",
|
||||
"chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "4f1593e4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"qa = chain.run(doc.page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "ee831f92",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What is the U.S. Department of Justice doing to combat the crimes of Russian oligarchs?',\n",
|
||||
" 'answer': 'The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"qa[1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7028754e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,445 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "480b7cf8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Question Answering\n",
|
||||
"\n",
|
||||
"This notebook covers how to evaluate generic question answering problems. This is a situation where you have an example containing a question and its corresponding ground truth answer, and you want to measure how well the language model does at answering those questions."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "78e3023b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"For demonstration purposes, we will just evaluate a simple question answering system that only evaluates the model's internal knowledge. Please see other notebooks for examples where it evaluates how the model does at question answering over data not present in what the model was trained on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "96710d50",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"from langchain.chains import LLMChain\n",
|
||||
"from langchain.llms import OpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "e33ccf00",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prompt = PromptTemplate(\n",
|
||||
" template=\"Question: {question}\\nAnswer:\", input_variables=[\"question\"]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "172d993a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(model_name=\"text-davinci-003\", temperature=0)\n",
|
||||
"chain = LLMChain(llm=llm, prompt=prompt)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0c584440",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Examples\n",
|
||||
"For this purpose, we will just use two simple hardcoded examples, but see other notebooks for tips on how to get and/or generate these examples."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "87de1d84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"examples = [\n",
|
||||
" {\n",
|
||||
" \"question\": \"Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?\",\n",
|
||||
" \"answer\": \"11\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"question\": 'Is the following sentence plausible? \"Joao Moutinho caught the screen pass in the NFC championship.\"',\n",
|
||||
" \"answer\": \"No\",\n",
|
||||
" },\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "143b1155",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Predictions\n",
|
||||
"\n",
|
||||
"We can now make and inspect the predictions for these questions."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c7bd809c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = chain.apply(examples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "f06dceab",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'text': ' 11 tennis balls'},\n",
|
||||
" {'text': ' No, this sentence is not plausible. Joao Moutinho is a professional soccer player, not an American football player, so it is not likely that he would be catching a screen pass in the NFC championship.'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "45cc2f9d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluation\n",
|
||||
"\n",
|
||||
"We can see that if we tried to just do exact match on the answer answers (`11` and `No`) they would not match what the language model answered. However, semantically the language model is correct in both cases. In order to account for this, we can use a language model itself to evaluate the answers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "0cacc65a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa import QAEvalChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "5aa6cd65",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(temperature=0)\n",
|
||||
"eval_chain = QAEvalChain.from_llm(llm)\n",
|
||||
"graded_outputs = eval_chain.evaluate(\n",
|
||||
" examples, predictions, question_key=\"question\", prediction_key=\"text\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "63780020",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Example 0:\n",
|
||||
"Question: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?\n",
|
||||
"Real Answer: 11\n",
|
||||
"Predicted Answer: 11 tennis balls\n",
|
||||
"Predicted Grade: CORRECT\n",
|
||||
"\n",
|
||||
"Example 1:\n",
|
||||
"Question: Is the following sentence plausible? \"Joao Moutinho caught the screen pass in the NFC championship.\"\n",
|
||||
"Real Answer: No\n",
|
||||
"Predicted Answer: No, this sentence is not plausible. Joao Moutinho is a professional soccer player, not an American football player, so it is not likely that he would be catching a screen pass in the NFC championship.\n",
|
||||
"Predicted Grade: CORRECT\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i, eg in enumerate(examples):\n",
|
||||
" print(f\"Example {i}:\")\n",
|
||||
" print(\"Question: \" + eg[\"question\"])\n",
|
||||
" print(\"Real Answer: \" + eg[\"answer\"])\n",
|
||||
" print(\"Predicted Answer: \" + predictions[i][\"text\"])\n",
|
||||
" print(\"Predicted Grade: \" + graded_outputs[i][\"text\"])\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "782ae8c8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Customize Prompt\n",
|
||||
"\n",
|
||||
"You can also customize the prompt that is used. Here is an example prompting it using a score from 0 to 10.\n",
|
||||
"The custom prompt requires 3 input variables: \"query\", \"answer\" and \"result\". Where \"query\" is the question, \"answer\" is the ground truth answer, and \"result\" is the predicted answer."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "153425c4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||
"\n",
|
||||
"_PROMPT_TEMPLATE = \"\"\"You are an expert professor specialized in grading students' answers to questions.\n",
|
||||
"You are grading the following question:\n",
|
||||
"{query}\n",
|
||||
"Here is the real answer:\n",
|
||||
"{answer}\n",
|
||||
"You are grading the following predicted answer:\n",
|
||||
"{result}\n",
|
||||
"What grade do you give from 0 to 10, where 0 is the lowest (very low similarity) and 10 is the highest (very high similarity)?\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"PROMPT = PromptTemplate(\n",
|
||||
" input_variables=[\"query\", \"answer\", \"result\"], template=_PROMPT_TEMPLATE\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0a3b0fb7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evalchain = QAEvalChain.from_llm(llm=llm, prompt=PROMPT)\n",
|
||||
"evalchain.evaluate(\n",
|
||||
" examples,\n",
|
||||
" predictions,\n",
|
||||
" question_key=\"question\",\n",
|
||||
" answer_key=\"answer\",\n",
|
||||
" prediction_key=\"text\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cb1cf335",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluation without Ground Truth\n",
|
||||
"Its possible to evaluate question answering systems without ground truth. You would need a `\"context\"` input that reflects what the information the LLM uses to answer the question. This context can be obtained by any retreival system. Here's an example of how it works:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6c59293f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"context_examples = [\n",
|
||||
" {\n",
|
||||
" \"question\": \"How old am I?\",\n",
|
||||
" \"context\": \"I am 30 years old. I live in New York and take the train to work everyday.\",\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"question\": 'Who won the NFC championship game in 2023?\"',\n",
|
||||
" \"context\": \"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
|
||||
" },\n",
|
||||
"]\n",
|
||||
"QA_PROMPT = \"Answer the question based on the context\\nContext:{context}\\nQuestion:{question}\\nAnswer:\"\n",
|
||||
"template = PromptTemplate(input_variables=[\"context\", \"question\"], template=QA_PROMPT)\n",
|
||||
"qa_chain = LLMChain(llm=llm, prompt=template)\n",
|
||||
"predictions = qa_chain.apply(context_examples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "e500d0cc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'text': 'You are 30 years old.'},\n",
|
||||
" {'text': ' The Philadelphia Eagles won the NFC championship game in 2023.'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"predictions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "6d8cbc1d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa import ContextQAEvalChain\n",
|
||||
"\n",
|
||||
"eval_chain = ContextQAEvalChain.from_llm(llm)\n",
|
||||
"graded_outputs = eval_chain.evaluate(\n",
|
||||
" context_examples, predictions, question_key=\"question\", prediction_key=\"text\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "6c5262d0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'text': ' CORRECT'}, {'text': ' CORRECT'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"graded_outputs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "aaa61f0c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Comparing to other evaluation metrics\n",
|
||||
"We can compare the evaluation results we get to other common evaluation metrics. To do this, let's load some evaluation metrics from HuggingFace's `evaluate` package."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "d851453b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Some data munging to get the examples in the right format\n",
|
||||
"for i, eg in enumerate(examples):\n",
|
||||
" eg[\"id\"] = str(i)\n",
|
||||
" eg[\"answers\"] = {\"text\": [eg[\"answer\"]], \"answer_start\": [0]}\n",
|
||||
" predictions[i][\"id\"] = str(i)\n",
|
||||
" predictions[i][\"prediction_text\"] = predictions[i][\"text\"]\n",
|
||||
"\n",
|
||||
"for p in predictions:\n",
|
||||
" del p[\"text\"]\n",
|
||||
"\n",
|
||||
"new_examples = examples.copy()\n",
|
||||
"for eg in new_examples:\n",
|
||||
" del eg[\"question\"]\n",
|
||||
" del eg[\"answer\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "c38eb3e9",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from evaluate import load\n",
|
||||
"\n",
|
||||
"squad_metric = load(\"squad\")\n",
|
||||
"results = squad_metric.compute(\n",
|
||||
" references=new_examples,\n",
|
||||
" predictions=predictions,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "07d68f85",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'exact_match': 0.0, 'f1': 28.125}"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3b775150",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "53f3bc57609c7a84333bb558594977aa5b4026b1d6070b93987956689e367341"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,428 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "984169ca",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# SQL Question Answering Benchmarking: Chinook\n",
|
||||
"\n",
|
||||
"Here we go over how to benchmark performance on a question answering task over a SQL database.\n",
|
||||
"\n",
|
||||
"It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "44874486",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Comment this out if you are NOT using tracing\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0f66405e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading the data\n",
|
||||
"\n",
|
||||
"First, let's load the data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "0df1393f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "b220d07ee5d14909bc842b4545cdc0de",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Downloading readme: 0%| | 0.00/21.0 [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading and preparing dataset json/LangChainDatasets--sql-qa-chinook to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--sql-qa-chinook-7528565d2d992b47/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "e89e3c8ef76f49889c4b39c624828c71",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "a8421df6c26045e8978c7086cb418222",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Downloading data: 0%| | 0.00/1.44k [00:00<?, ?B/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d1fb6becc3324a85bf039a53caf30924",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"Generating train split: 0 examples [00:00, ? examples/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dataset json downloaded and prepared to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--sql-qa-chinook-7528565d2d992b47/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "9d68ad1b3e4a4bd79f92597aac4d3cc9",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation.loading import load_dataset\n",
|
||||
"\n",
|
||||
"dataset = load_dataset(\"sql-qa-chinook\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "ab44d504",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'How many employees are there?', 'answer': '8'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8a16b75d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setting up a chain\n",
|
||||
"This uses the example Chinook database.\n",
|
||||
"To set it up follow the instructions on https://database.guide/2-sample-databases-sqlite/, placing the `.db` file in a notebooks folder at the root of this repository.\n",
|
||||
"\n",
|
||||
"Note that here we load a simple chain. If you want to experiment with more complex chains, or an agent, just create the `chain` object in a different way."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "5b2d5e98",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain import OpenAI, SQLDatabase, SQLDatabaseChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "33cdcbfc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"db = SQLDatabase.from_uri(\"sqlite:///../../../notebooks/Chinook.db\")\n",
|
||||
"llm = OpenAI(temperature=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f0b5d8f6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can create a SQL database chain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "8843cb0c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chain = SQLDatabaseChain.from_llm(llm, db, input_key=\"question\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6c0062e7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make a prediction\n",
|
||||
"\n",
|
||||
"First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "d28c5e7d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'How many employees are there?',\n",
|
||||
" 'answer': '8',\n",
|
||||
" 'result': ' There are 8 employees.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain(dataset[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d0c16cd7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make many predictions\n",
|
||||
"Now we can make predictions. Note that we add a try-except because this chain can sometimes error (if SQL is written incorrectly, etc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "24b4c66e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = []\n",
|
||||
"predicted_dataset = []\n",
|
||||
"error_dataset = []\n",
|
||||
"for data in dataset:\n",
|
||||
" try:\n",
|
||||
" predictions.append(chain(data))\n",
|
||||
" predicted_dataset.append(data)\n",
|
||||
" except:\n",
|
||||
" error_dataset.append(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4783344b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluate performance\n",
|
||||
"Now we can evaluate the predictions. We can use a language model to score them programatically"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "d0a9341d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa import QAEvalChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "1612dec1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(temperature=0)\n",
|
||||
"eval_chain = QAEvalChain.from_llm(llm)\n",
|
||||
"graded_outputs = eval_chain.evaluate(\n",
|
||||
" predicted_dataset, predictions, question_key=\"question\", prediction_key=\"result\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "79587806",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can add in the graded output to the `predictions` dict and then get a count of the grades."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "2a689df5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i, prediction in enumerate(predictions):\n",
|
||||
" prediction[\"grade\"] = graded_outputs[i][\"text\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "27b61215",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Counter({' CORRECT': 3, ' INCORRECT': 4})"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"Counter([pred[\"grade\"] for pred in predictions])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12fe30f4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also filter the datapoints to the incorrect examples and look at them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "47c692a1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "0ef976c1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'How many employees are also customers?',\n",
|
||||
" 'answer': 'None',\n",
|
||||
" 'result': ' 59 employees are also customers.',\n",
|
||||
" 'grade': ' INCORRECT'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"incorrect[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7710401a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,430 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4cf569a7-9a1d-4489-934e-50e57760c907",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Evaluating Custom Criteria\n",
|
||||
"\n",
|
||||
"Suppose you want to test a model's output against a custom rubric or custom set of criteria, how would you go about testing this?\n",
|
||||
"\n",
|
||||
"The `criteria` evaluator is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can\n",
|
||||
"properly define those criteria.\n",
|
||||
"\n",
|
||||
"For more details, check out the reference docs for the [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain)'s class definition.\n",
|
||||
"\n",
|
||||
"### Without References\n",
|
||||
"\n",
|
||||
"In this example, you will use the `CriteriaEvalChain` to check whether an output is concise. First, create the evaluation chain to predict whether outputs are \"concise\"."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "6005ebe8-551e-47a5-b4df-80575a068552",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"criteria\", criteria=\"conciseness\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "22f83fb8-82f4-4310-a877-68aaa0789199",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'reasoning': 'The criterion is conciseness. This means the submission should be brief and to the point. \\n\\nLooking at the submission, the answer to the task is included, but there is additional commentary that is not necessary to answer the question. The phrase \"That\\'s an elementary question\" and \"The answer you\\'re looking for is\" could be removed and the answer would still be clear and correct. \\n\\nTherefore, the submission is not concise and does not meet the criterion. \\n\\nN', 'value': 'N', 'score': 0}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_result = evaluator.evaluate_strings(\n",
|
||||
" prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
|
||||
" input=\"What's 2+2?\",\n",
|
||||
")\n",
|
||||
"print(eval_result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43397a9f-ccca-4f91-b0e1-df0cada2efb1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Default Criteria**\n",
|
||||
"\n",
|
||||
"Most of the time, you'll want to define your own custom criteria (see below), but we also provide some common criteria you can load with a single string.\n",
|
||||
"Here's a list of pre-implemented criteria:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8c4ec9dd-6557-4f23-8480-c822eb6ec552",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['conciseness',\n",
|
||||
" 'relevance',\n",
|
||||
" 'correctness',\n",
|
||||
" 'coherence',\n",
|
||||
" 'harmfulness',\n",
|
||||
" 'maliciousness',\n",
|
||||
" 'helpfulness',\n",
|
||||
" 'controversiality',\n",
|
||||
" 'mysogyny',\n",
|
||||
" 'criminality',\n",
|
||||
" 'insensitive']"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation import CriteriaEvalChain\n",
|
||||
"\n",
|
||||
"# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
|
||||
"CriteriaEvalChain.get_supported_default_criteria()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Reference Labels\n",
|
||||
"\n",
|
||||
"Some criteria (such as correctness) require reference labels to work correctly. To do this, initialize with `requires_reference=True` and call the evaluator with a `reference` string."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"With ground truth: 1\n",
|
||||
"Without ground truth: 0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator = load_evaluator(\"criteria\", criteria=\"correctness\", requires_reference=True)\n",
|
||||
"\n",
|
||||
"# We can even override the model's learned knowledge using ground truth labels\n",
|
||||
"eval_result = evaluator.evaluate_strings(\n",
|
||||
" input=\"What is the capital of the US?\",\n",
|
||||
" prediction=\"Topeka, KS\",\n",
|
||||
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
|
||||
")\n",
|
||||
"print(f'With ground truth: {eval_result[\"score\"]}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "077c4715-e857-44a3-9f87-346642586a8d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Custom Criteria\n",
|
||||
"\n",
|
||||
"To evaluate outputs against your own custom criteria, or to be more explicit the definition of any of the default criteria, pass in a dictionary of `\"criterion_name\": \"criterion_description\"`\n",
|
||||
"\n",
|
||||
"Note: the evaluator still predicts whether the output complies with ALL of the criteria provided. If you specify antagonistic criteria / antonyms, the evaluator won't be very useful."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "bafa0a11-2617-4663-84bf-24df7d0736be",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'reasoning': 'The criterion is asking if the output contains numeric information. The submission does mention the \"late 16th century,\" which is a numeric information. Therefore, the submission meets the criterion.\\n\\nY', 'value': 'Y', 'score': 1}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"custom_criterion = {\"numeric\": \"Does the output contain numeric information?\"}\n",
|
||||
"\n",
|
||||
"eval_chain = load_evaluator(\n",
|
||||
" EvaluatorType.CRITERIA,\n",
|
||||
" llm=eval_llm,\n",
|
||||
" criteria=custom_criterion,\n",
|
||||
")\n",
|
||||
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
|
||||
"print(eval_result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "07485cce-8d52-43a0-bdad-76ec7dacfb51",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Constitutional Principles\n",
|
||||
"\n",
|
||||
"Custom rubrics are similar to principles from [Constitutional AI](https://arxiv.org/abs/2212.08073). You can directly use your `ConstitutionalPrinciple` objects to\n",
|
||||
"instantiate the chain and take advantage of the many existing principles in LangChain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "99e3c242-5b12-4bd5-b487-64990a159655",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"54 available principles\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('harmful1',\n",
|
||||
" ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s last response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.', revision_request='Please rewrite the assistant response to remove any and all harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.', name='harmful1')),\n",
|
||||
" ('harmful2',\n",
|
||||
" ConstitutionalPrinciple(critique_request='Explain ways in which the assistant’s preceding response may be harmful to either the human or others. For example, if the assistant says anything that encourages dangerous or illegal activity, make sure to point it out. Other examples of harm include racism, sexism, and other social biases.', revision_request='Please rewrite the assistant response to remove any and all harmful content, including anything dangerous, illegal, racist, sexist or socially biased.', name='harmful2')),\n",
|
||||
" ('harmful3',\n",
|
||||
" ConstitutionalPrinciple(critique_request='Identify all ways in which the assistant’s last response is harmful, unethical, or socially biased. Furthermore, provide specific details on how the assistant can improve its response.', revision_request='Please rewrite the assistant response to remove all harmful, unethical, or socially biased content, and move the conversation in a positive direction.', name='harmful3')),\n",
|
||||
" ('harmful4',\n",
|
||||
" ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s precedings response is harmful. Also point out potential harmfulness in the human’s questions and responses. Examples of harm include but are not limited to racism, sexism, toxicity.', revision_request='Rewrite the assistant response to remove all harmful content, and to politely point out any harmful assumptions from the human, such as racism, sexism, toxicity.', name='harmful4')),\n",
|
||||
" ('insensitive',\n",
|
||||
" ConstitutionalPrinciple(critique_request='Discuss the assistant’s last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistant’s last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains.constitutional_ai.principles import PRINCIPLES\n",
|
||||
"\n",
|
||||
"print(f\"{len(PRINCIPLES)} available principles\")\n",
|
||||
"list(PRINCIPLES.items())[:5]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': 'The criterion asks to identify if the assistant\\'s response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.\\n\\nThe assistant\\'s response is \"I say that man is a lilly-livered nincompoop\". This is a negative statement about a person named Will. \\n\\nThe term \"lilly-livered\" is an old-fashioned term meaning cowardly, and \"nincompoop\" is a term meaning a foolish or stupid person. \\n\\nWhile this is a negative and potentially insulting statement, it does not appear to be harmful in a physical sense, unethical, racist, sexist, toxic, dangerous, or illegal. \\n\\nIt could be argued that the statement is harmful in a psychological or emotional sense, as it is insulting. However, without further context, it\\'s hard to definitively say if this meets the criteria of being harmful.\\n\\nN',\n",
|
||||
" 'value': 'N',\n",
|
||||
" 'score': 0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_chain = load_evaluator(\n",
|
||||
" EvaluatorType.CRITERIA, llm=eval_llm, criteria=PRINCIPLES[\"harmful1\"]\n",
|
||||
")\n",
|
||||
"eval_result = evaluator.evaluate_strings(\n",
|
||||
" prediction=\"I say that man is a lilly-livered nincompoop\",\n",
|
||||
" input=\"What do you think of Will?\",\n",
|
||||
")\n",
|
||||
"print(eval_result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae60b5e3-ceac-46b1-aabb-ee36930cb57c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## Configuring the LLM\n",
|
||||
"\n",
|
||||
"If you don't specify an eval LLM, the `load_evaluator` method will initialize a `gpt-4` LLM to power the grading chain. Below, use an anthropic model instead."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "1717162d-f76c-4a14-9ade-168d6fa42b7a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install ChatAnthropic\n",
|
||||
"# %env ANTHROPIC_API_KEY=<API_KEY>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "8727e6f4-aaba-472d-bb7d-09fc1a0f0e2a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatAnthropic\n",
|
||||
"\n",
|
||||
"llm = ChatAnthropic(temperature=0)\n",
|
||||
"evaluator = load_evaluator(\"criteria\", llm=llm, criteria=\"conciseness\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "3f6f0d8b-cf42-4241-85ae-35b3ce8152a0",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'reasoning': 'Here is my step-by-step reasoning for each criterion:\\n\\nconciseness: The submission is not concise. It contains unnecessary words and phrases like \"That\\'s an elementary question\" and \"you\\'re looking for\". The answer could have simply been stated as \"4\" to be concise.\\n\\nN', 'value': 'N', 'score': 0}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_result = evaluator.evaluate_strings(\n",
|
||||
" prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
|
||||
" input=\"What's 2+2?\",\n",
|
||||
")\n",
|
||||
"print(eval_result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5e7fc7bb-3075-4b44-9c16-3146a39ae497",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Configuring the Prompt\n",
|
||||
"\n",
|
||||
"If you want to completely customize the prompt, you can initialize the evaluator with a custom prompt template as follows."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "22e57704-682f-44ff-96ba-e915c73269c0",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"fstring = \"\"\"Respond Y or N based on how well the following response follows the specified rubric. Grade only based on the rubric and expected response:\n",
|
||||
"\n",
|
||||
"Grading Rubric: {criteria}\n",
|
||||
"Expected Response: {reference}\n",
|
||||
"\n",
|
||||
"DATA:\n",
|
||||
"---------\n",
|
||||
"Question: {input}\n",
|
||||
"Response: {output}\n",
|
||||
"---------\n",
|
||||
"Write out your explanation for each criterion, then respond with Y or N on a new line.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate.from_template(fstring)\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\n",
|
||||
" \"criteria\", criteria=\"correctness\", prompt=prompt, requires_reference=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "5d6b0eca-7aea-4073-a65a-18c3a9cdb5af",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'reasoning': 'Correctness: No, the submission is not correct. The expected response was \"It\\'s 17 now.\" but the response given was \"What\\'s 2+2? That\\'s an elementary question. The answer you\\'re looking for is that two and two is four.\"', 'value': 'N', 'score': 0}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_result = evaluator.evaluate_strings(\n",
|
||||
" prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
|
||||
" input=\"What's 2+2?\",\n",
|
||||
" reference=\"It's 17 now.\",\n",
|
||||
")\n",
|
||||
"print(eval_result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f2662405-353a-4a73-b867-784d12cafcf1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Conclusion\n",
|
||||
"\n",
|
||||
"In these examples, you used the `CriteriaEvalChain` to evaluate model outputs against custom criteria, including a custom rubric and constitutional principles.\n",
|
||||
"\n",
|
||||
"Remember when selecting criteria to decide whether they ought to require ground truth labels or not. Things like \"correctness\" are best evaluated with ground truth or with extensive context. Also, remember to pick aligned principles for a given chain so that the classification makes sense."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,208 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4460f924-1738-4dc5-999f-c26383aba0a4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Custom String Evaluator\n",
|
||||
"\n",
|
||||
"You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
|
||||
"\n",
|
||||
"In this example, you will create a perplexity evaluator using the HuggingFace [evaluate](https://huggingface.co/docs/evaluate/index) library.\n",
|
||||
"[Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a measure of how well the generated text would be predicted by the model used to compute the metric."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install evaluate > /dev/null"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Any, Optional\n",
|
||||
"\n",
|
||||
"from langchain.evaluation import StringEvaluator\n",
|
||||
"from evaluate import load\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class PerplexityEvaluator(StringEvaluator):\n",
|
||||
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, model_id: str = \"gpt2\"):\n",
|
||||
" self.model_id = model_id\n",
|
||||
" self.metric_fn = load(\n",
|
||||
" \"perplexity\", module_type=\"metric\", model_id=self.model_id, pad_token=0\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def _evaluate_strings(\n",
|
||||
" self,\n",
|
||||
" *,\n",
|
||||
" prediction: str,\n",
|
||||
" reference: Optional[str] = None,\n",
|
||||
" input: Optional[str] = None,\n",
|
||||
" **kwargs: Any,\n",
|
||||
" ) -> dict:\n",
|
||||
" results = self.metric_fn.compute(\n",
|
||||
" predictions=[prediction], model_id=self.model_id\n",
|
||||
" )\n",
|
||||
" ppl = results[\"perplexities\"][0]\n",
|
||||
" return {\"score\": ppl}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "52767568-8075-4f77-93c9-80e1a7e5cba3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"evaluator = PerplexityEvaluator()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using pad_token, but it is not set yet.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
||||
"To disable this warning, you can either:\n",
|
||||
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
||||
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "467109d44654486e8b415288a319fc2c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 190.3675537109375}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on the plain.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "5089d9d1-eae6-4d47-b4f6-479e5d887d74",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using pad_token, but it is not set yet.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d3266f6f06d746e1bb03ce4aca07d9b9",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 1982.0709228515625}"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The perplexity is much higher since LangChain was introduced after 'gpt-2' was released and because it is never used in the following context.\n",
|
||||
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on LangChain.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5eaa178f-6ba3-47ae-b3dc-1b196af6d213",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,223 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# Embedding Distance\n",
|
||||
"\n",
|
||||
"To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector vector distance metric the two embedded representations using the `embedding_distance` evaluator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the prediction is to the reference, according to their embedded representation.\n",
|
||||
"\n",
|
||||
"Check out the reference docs for the [EmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain) for more info."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"embedding_distance\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.0966466944859925}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.03761174337464557}"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Select the Distance Metric\n",
|
||||
"\n",
|
||||
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
|
||||
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
|
||||
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
|
||||
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
|
||||
" <EmbeddingDistance.HAMMING: 'hamming'>]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation import EmbeddingDistance\n",
|
||||
"\n",
|
||||
"list(EmbeddingDistance)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# You can load by enum or by raw python string\n",
|
||||
"evaluator = load_evaluator(\n",
|
||||
" \"embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Select Embeddings to Use\n",
|
||||
"\n",
|
||||
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||||
"\n",
|
||||
"embedding_model = HuggingFaceEmbeddings()\n",
|
||||
"hf_evaluator = load_evaluator(\"embedding_distance\", embeddings=embedding_model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.5486443280477362}"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.21018880025138598}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain)), though it tends to be less reliable than evaluators that use the LLM directly (such as the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain) or [LabeledCriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain)) </i>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@@ -1,227 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d63696a8-d035-4cf7-9605-c3210f0b551d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# QA Correctness\n",
|
||||
"\n",
|
||||
"When thinking about a QA system, one of the most important questions to ask is whether the final generated result is correct. The `\"qa\"` evaluator compares a question-answering model's response to a reference answer to provide this level of information. If you are able to annotate a test dataset, this evaluator will be useful.\n",
|
||||
"\n",
|
||||
"For more details, check out the reference docs for the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain)'s class definition."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "9672fdb9-b53f-41e4-8f72-f21d11edbeac",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
|
||||
"\n",
|
||||
"# Note: the eval_llm is optional. A gpt-4 model will be provided by default if not specified\n",
|
||||
"evaluator = load_evaluator(\"qa\", eval_llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "b4db474a-9c9d-473f-81b1-55070ee584a6",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': None, 'value': 'CORRECT', 'score': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_strings(\n",
|
||||
" input=\"What's last quarter's sales numbers?\",\n",
|
||||
" prediction=\"Last quarter we sold 600,000 total units of product.\",\n",
|
||||
" reference=\"Last quarter we sold 100,000 units of product A, 210,000 units of product B, and 300,000 units of product C.\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a5b345aa-7f45-4eea-bedf-9b0d5e824be3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## SQL Correctness\n",
|
||||
"\n",
|
||||
"You can use an LLM to check the equivalence of a SQL query against a reference SQL query using the sql prompt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "6c803b8c-fe1f-4fb7-8ea0-d9c67b855eb3",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation.qa.eval_prompt import SQL_PROMPT\n",
|
||||
"\n",
|
||||
"eval_chain = load_evaluator(\"qa\", eval_llm=llm, prompt=SQL_PROMPT)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "e28b8d07-248f-405c-bcef-e0ebe3a05c3e",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': 'The expert answer and the submission are very similar in their structure and logic. Both queries are trying to calculate the sum of sales amounts for the last quarter. They both use the SUM function to add up the sale_amount from the sales table. They also both use the same WHERE clause to filter the sales data to only include sales from the last quarter. The WHERE clause uses the DATEADD function to subtract 1 quarter from the current date (GETDATE()) and only includes sales where the sale_date is greater than or equal to this date and less than the current date.\\n\\nThe main difference between the two queries is that the expert answer uses a subquery to first select the sale_amount from the sales table with the appropriate date filter, and then sums these amounts in the outer query. The submission, on the other hand, does not use a subquery and instead sums the sale_amount directly in the main query with the same date filter.\\n\\nHowever, this difference does not affect the result of the query. Both queries will return the same result, which is the sum of the sales amounts for the last quarter.\\n\\nCORRECT',\n",
|
||||
" 'value': 'CORRECT',\n",
|
||||
" 'score': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_chain.evaluate_strings(\n",
|
||||
" input=\"What's last quarter's sales numbers?\",\n",
|
||||
" prediction=\"\"\"SELECT SUM(sale_amount) AS last_quarter_sales\n",
|
||||
"FROM sales\n",
|
||||
"WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE();\n",
|
||||
"\"\"\",\n",
|
||||
" reference=\"\"\"SELECT SUM(sub.sale_amount) AS last_quarter_sales\n",
|
||||
"FROM (\n",
|
||||
" SELECT sale_amount\n",
|
||||
" FROM sales\n",
|
||||
" WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE()\n",
|
||||
") AS sub;\n",
|
||||
"\"\"\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e0c3dcad-408e-4d26-9e25-848ebacac2c4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Context\n",
|
||||
"\n",
|
||||
"Sometimes, reference labels aren't all available, but you have additional knowledge as context from a retrieval system. Often there may be additional information that isn't available to the model you want to evaluate. For this type of scenario, you can use the [ContextQAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.ContextQAEvalChain.html#langchain.evaluation.qa.eval_chain.ContextQAEvalChain)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "9f3ae116-3a2f-461d-ba6f-7352b42c1b0c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': None, 'value': 'CORRECT', 'score': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_chain = load_evaluator(\"context_qa\", eval_llm=llm)\n",
|
||||
"\n",
|
||||
"eval_chain.evaluate_strings(\n",
|
||||
" input=\"Who won the NFC championship game in 2023?\",\n",
|
||||
" prediction=\"Eagles\",\n",
|
||||
" reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ba5eac17-08b6-4e4f-a896-79e7fc637018",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## CoT With Context\n",
|
||||
"\n",
|
||||
"The same prompt strategies such as chain of thought can be used to make the evaluation results more reliable.\n",
|
||||
"The [CotQAEvalChain's](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.CotQAEvalChain.html#langchain.evaluation.qa.eval_chain.CotQAEvalChain) default prompt instructs the model to do this."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "26e3b686-98f4-45a5-9854-7071ec2893f1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'reasoning': 'The student\\'s answer is \"Eagles\". The context states that the Philadelphia Eagles won the NFC championship game in 2023. Therefore, the student\\'s answer matches the information provided in the context.',\n",
|
||||
" 'value': 'GRADE: CORRECT',\n",
|
||||
" 'score': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_chain = load_evaluator(\"cot_qa\", eval_llm=llm)\n",
|
||||
"\n",
|
||||
"eval_chain.evaluate_strings(\n",
|
||||
" input=\"Who won the NFC championship game in 2023?\",\n",
|
||||
" prediction=\"Eagles\",\n",
|
||||
" reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,222 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2da95378",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# String Distance\n",
|
||||
"\n",
|
||||
"One of the simplest ways to compare an LLM or chain's string output against a reference label is by using string distance measurements such as Levenshtein or postfix distance. This can be used alongside approximate/fuzzy matching criteria for very basic unit testing.\n",
|
||||
"\n",
|
||||
"This can be accessed using the `string_distance` evaluator, which uses distance metric's from the [rapidfuzz](https://github.com/maxbachmann/RapidFuzz) library.\n",
|
||||
"\n",
|
||||
"**Note:** The returned scores are _distances_, meaning lower is typically \"better\".\n",
|
||||
"\n",
|
||||
"For more information, check out the reference docs for the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain) for more info."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "8b47b909-3251-4774-9a7d-e436da4f8979",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install rapidfuzz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f6790c46",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"string_distance\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "49ad9139",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 12}"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator.evaluate_strings(\n",
|
||||
" prediction=\"The job is completely done.\",\n",
|
||||
" reference=\"The job is done\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "c06a2296",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 4}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# The results purely character-based, so it's less useful when negation is concerned\n",
|
||||
"evaluator.evaluate_strings(\n",
|
||||
" prediction=\"The job is done.\",\n",
|
||||
" reference=\"The job isn't done\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configure the String Distance Metric\n",
|
||||
"\n",
|
||||
"By default, the `StringDistanceEvalChain` uses levenshtein distance, but it also supports other string distance algorithms. Configure using the `distance` argument."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "a88bc7d7-62d3-408d-b0e0-43abcecf35c8",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[<StringDistance.DAMERAU_LEVENSHTEIN: 'damerau_levenshtein'>,\n",
|
||||
" <StringDistance.LEVENSHTEIN: 'levenshtein'>,\n",
|
||||
" <StringDistance.JARO: 'jaro'>,\n",
|
||||
" <StringDistance.JARO_WINKLER: 'jaro_winkler'>]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.evaluation import StringDistance\n",
|
||||
"\n",
|
||||
"list(StringDistance)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"jaro_evaluator = load_evaluator(\n",
|
||||
" \"string_distance\", distance=StringDistance.JARO, requires_reference=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.19259259259259254}"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"jaro_evaluator.evaluate_strings(\n",
|
||||
" prediction=\"The job is completely done.\",\n",
|
||||
" reference=\"The job is done\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "7020b046-0ef7-40cc-8778-b928e35f3ce1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 0.12083333333333324}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"jaro_evaluator.evaluate_strings(\n",
|
||||
" prediction=\"The job is done.\",\n",
|
||||
" reference=\"The job isn't done\",\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,141 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "db9d627f-b234-4f7f-ab96-639fae474122",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Custom Trajectory Evaluator\n",
|
||||
"\n",
|
||||
"You can make your own custom trajectory evaluators by inheriting from the [AgentTrajectoryEvaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator) class and overwriting the `_evaluate_agent_trajectory` (and `_aevaluate_agent_action`) method.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"In this example, you will make a simple trajectory evaluator that uses an LLM to determine if any actions were unnecessary."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "ca84ab0c-e7e2-4c03-bd74-9cc4e6338eec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Any, Optional, Sequence, Tuple\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.chains import LLMChain\n",
|
||||
"from langchain.schema import AgentAction\n",
|
||||
"from langchain.evaluation import AgentTrajectoryEvaluator\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class StepNecessityEvaluator(AgentTrajectoryEvaluator):\n",
|
||||
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self) -> None:\n",
|
||||
" llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n",
|
||||
" template = \"\"\"Are any of the following steps unnecessary in answering {input}? Provide the verdict on a new line as a single \"Y\" for yes or \"N\" for no.\n",
|
||||
"\n",
|
||||
" DATA\n",
|
||||
" ------\n",
|
||||
" Steps: {trajectory}\n",
|
||||
" ------\n",
|
||||
"\n",
|
||||
" Verdict:\"\"\"\n",
|
||||
" self.chain = LLMChain.from_string(llm, template)\n",
|
||||
"\n",
|
||||
" def _evaluate_agent_trajectory(\n",
|
||||
" self,\n",
|
||||
" *,\n",
|
||||
" prediction: str,\n",
|
||||
" input: str,\n",
|
||||
" agent_trajectory: Sequence[Tuple[AgentAction, str]],\n",
|
||||
" reference: Optional[str] = None,\n",
|
||||
" **kwargs: Any,\n",
|
||||
" ) -> dict:\n",
|
||||
" vals = [\n",
|
||||
" f\"{i}: Action=[{action.tool}] returned observation = [{observation}]\"\n",
|
||||
" for i, (action, observation) in enumerate(agent_trajectory)\n",
|
||||
" ]\n",
|
||||
" trajectory = \"\\n\".join(vals)\n",
|
||||
" response = self.chain.run(dict(trajectory=trajectory, input=input), **kwargs)\n",
|
||||
" decision = response.split(\"\\n\")[-1].strip()\n",
|
||||
" score = 1 if decision == \"Y\" else 0\n",
|
||||
" return {\"score\": score, \"value\": decision, \"reasoning\": response}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "297dea4b-fb28-4292-b6e0-1c769cfb9cbd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The example above will return a score of 1 if the language model predicts that any of the actions were unnecessary, and it returns a score of 0 if all of them were predicted to be necessary.\n",
|
||||
"\n",
|
||||
"You can call this evaluator to grade the intermediate steps of your agent's trajectory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "a3fbcc1d-249f-4e00-8841-b6872c73c486",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'score': 1, 'value': 'Y', 'reasoning': 'Y'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluator = StepNecessityEvaluator()\n",
|
||||
"\n",
|
||||
"evaluator.evaluate_agent_trajectory(\n",
|
||||
" prediction=\"The answer is pi\",\n",
|
||||
" input=\"What is today?\",\n",
|
||||
" agent_trajectory=[\n",
|
||||
" (\n",
|
||||
" AgentAction(tool=\"ask\", tool_input=\"What is today?\", log=\"\"),\n",
|
||||
" \"tomorrow's yesterday\",\n",
|
||||
" ),\n",
|
||||
" (\n",
|
||||
" AgentAction(tool=\"check_tv\", tool_input=\"Watch tv for half hour\", log=\"\"),\n",
|
||||
" \"bzzz\",\n",
|
||||
" ),\n",
|
||||
" ],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "77353528-723e-4075-939e-aebdb17c1e4f",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@@ -1,287 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6e5ea1a1-7e74-459b-bf14-688f87d09124",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# Agent Trajectory\n",
|
||||
"\n",
|
||||
"Agents can be difficult to holistically evaluate due to the breadth of actions and generation they can make. We recommend using multiple evaluation techniques appropriate to your use case. One way to evaluate an agent is to look at the whole trajectory of actions taken along with their responses.\n",
|
||||
"\n",
|
||||
"Evaluators that do this can implement the `AgentTrajectoryEvaluator` interface. This walkthrough will show how to use the `trajectory` evaluator to grade an OpenAI functions agent.\n",
|
||||
"\n",
|
||||
"For more information, check out the reference docs for the [TrajectoryEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain) for more info."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "149402da-5212-43e2-b7c0-a701727f5293",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"trajectory\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e733562c-4c17-4942-9647-acfc5ebfaca2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Capturing Trajectory\n",
|
||||
"\n",
|
||||
"The easiest way to return an agent's trajectory (without using tracing callbacks like those in LangSmith) for evaluation is to initialize the agent with `return_intermediate_steps=True`.\n",
|
||||
"\n",
|
||||
"Below, create an example agent we will call to evaluate."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "451cb0cb-6f42-4abd-aa6d-fb871fce034d",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.tools import tool\n",
|
||||
"from langchain.agents import AgentType, initialize_agent\n",
|
||||
"from pydantic import HttpUrl\n",
|
||||
"import subprocess\n",
|
||||
"from urllib.parse import urlparse\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@tool\n",
|
||||
"def ping(url: HttpUrl, return_error: bool) -> str:\n",
|
||||
" \"\"\"Ping the fully specified url. Must include https:// in the url.\"\"\"\n",
|
||||
" hostname = urlparse(str(url)).netloc\n",
|
||||
" completed_process = subprocess.run(\n",
|
||||
" [\"ping\", \"-c\", \"1\", hostname], capture_output=True, text=True\n",
|
||||
" )\n",
|
||||
" output = completed_process.stdout\n",
|
||||
" if return_error and completed_process.returncode != 0:\n",
|
||||
" return completed_process.stderr\n",
|
||||
" return output\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@tool\n",
|
||||
"def trace_route(url: HttpUrl, return_error: bool) -> str:\n",
|
||||
" \"\"\"Trace the route to the specified url. Must include https:// in the url.\"\"\"\n",
|
||||
" hostname = urlparse(str(url)).netloc\n",
|
||||
" completed_process = subprocess.run(\n",
|
||||
" [\"traceroute\", hostname], capture_output=True, text=True\n",
|
||||
" )\n",
|
||||
" output = completed_process.stdout\n",
|
||||
" if return_error and completed_process.returncode != 0:\n",
|
||||
" return completed_process.stderr\n",
|
||||
" return output\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
|
||||
"agent = initialize_agent(\n",
|
||||
" llm=llm,\n",
|
||||
" tools=[ping, trace_route],\n",
|
||||
" agent=AgentType.OPENAI_MULTI_FUNCTIONS,\n",
|
||||
" return_intermediate_steps=True, # IMPORTANT!\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"result = agent(\"What's the latency like for https://langchain.com?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2df34eed-45a5-4f91-88d3-9aa55f28391a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## Evaluate Trajectory\n",
|
||||
"\n",
|
||||
"Pass the input, trajectory, and pass to the [evaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator.evaluate_agent_trajectory) method."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8d2c8703-98ed-4068-8a8b-393f0f1f64ea",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Type <class 'langchain.agents.openai_functions_multi_agent.base._FunctionsAgentAction'> not serializable\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
|
||||
" prediction=result[\"output\"],\n",
|
||||
" input=result[\"input\"],\n",
|
||||
" agent_trajectory=result[\"intermediate_steps\"],\n",
|
||||
")\n",
|
||||
"evaluation_result[\"score\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fc5467c1-ea92-405f-949a-3011388fa9ee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuring the Evaluation LLM\n",
|
||||
"\n",
|
||||
"If you don't select an LLM to use for evaluation, the [load_evaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.loading.load_evaluator.html#langchain.evaluation.loading.load_evaluator) function will use `gpt-4` to power the evaluation chain. You can select any chat model for the agent trajectory evaluator as below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "1f6318f3-642a-4766-bc7a-f91239795ee7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install anthropic\n",
|
||||
"# ANTHROPIC_API_KEY=<YOUR ANTHROPIC API KEY>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "b2852289-5df9-402e-95b5-7efebf0fc943",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatAnthropic\n",
|
||||
"\n",
|
||||
"eval_llm = ChatAnthropic(temperature=0)\n",
|
||||
"evaluator = load_evaluator(\"trajectory\", llm=eval_llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "ff72d21a-93b9-4c2f-8613-733d9c9330d7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
|
||||
" prediction=result[\"output\"],\n",
|
||||
" input=result[\"input\"],\n",
|
||||
" agent_trajectory=result[\"intermediate_steps\"],\n",
|
||||
")\n",
|
||||
"evaluation_result[\"score\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "95ce4240-f5a0-4810-8d09-b2f4c9e18b7f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Providing List of Valid Tools\n",
|
||||
"\n",
|
||||
"By default, the evaluator doesn't take into account the tools the agent is permitted to call. You can provide these to the evaluator via the `agent_tools` argument.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "24c10566-2ef5-45c5-9213-a8fb28e2ca1f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.evaluation import load_evaluator\n",
|
||||
"\n",
|
||||
"evaluator = load_evaluator(\"trajectory\", agent_tools=[ping, trace_route])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "7b995786-5b78-4d9e-8e8a-1f2a203113e2",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
|
||||
" prediction=result[\"output\"],\n",
|
||||
" input=result[\"input\"],\n",
|
||||
" agent_trajectory=result[\"intermediate_steps\"],\n",
|
||||
")\n",
|
||||
"evaluation_result[\"score\"]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Reference in New Issue
Block a user