diff --git a/docs/docs_skeleton/docs/modules/evaluation/comparison/index.mdx b/docs/docs_skeleton/docs/modules/evaluation/comparison/index.mdx new file mode 100644 index 00000000000..be6fdf58a9d --- /dev/null +++ b/docs/docs_skeleton/docs/modules/evaluation/comparison/index.mdx @@ -0,0 +1,8 @@ +--- +sidebar_position: 3 +--- +# Comparison + +import DocCardList from "@theme/DocCardList"; + + \ No newline at end of file diff --git a/docs/docs_skeleton/docs/modules/evaluation/how_to/custom.mdx b/docs/docs_skeleton/docs/modules/evaluation/how_to/custom.mdx deleted file mode 100644 index a49b1420df3..00000000000 --- a/docs/docs_skeleton/docs/modules/evaluation/how_to/custom.mdx +++ /dev/null @@ -1,3 +0,0 @@ -# Creating a Custom Eval Chain - - diff --git a/docs/docs_skeleton/docs/modules/evaluation/how_to/custom_evaluator.mdx b/docs/docs_skeleton/docs/modules/evaluation/how_to/custom_evaluator.mdx new file mode 100644 index 00000000000..2a3fe89359d --- /dev/null +++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/custom_evaluator.mdx @@ -0,0 +1,4 @@ +--- +sidebar_position: 3 +--- +# Custom Evaluator \ No newline at end of file diff --git a/docs/docs_skeleton/docs/modules/evaluation/how_to/generating_examples.mdx b/docs/docs_skeleton/docs/modules/evaluation/how_to/generating_examples.mdx new file mode 100644 index 00000000000..342aa5acf2c --- /dev/null +++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/generating_examples.mdx @@ -0,0 +1,6 @@ +--- +sidebar_position: 2 +--- + +# Generating Examples + diff --git a/docs/docs_skeleton/docs/modules/evaluation/how_to/index.mdx b/docs/docs_skeleton/docs/modules/evaluation/how_to/index.mdx new file mode 100644 index 00000000000..eade7c88945 --- /dev/null +++ b/docs/docs_skeleton/docs/modules/evaluation/how_to/index.mdx @@ -0,0 +1,8 @@ +--- +sidebar_position: 5 +--- +# How To + +import DocCardList from "@theme/DocCardList"; + + diff --git a/docs/docs_skeleton/docs/modules/evaluation/string/index.mdx b/docs/docs_skeleton/docs/modules/evaluation/string/index.mdx new file mode 100644 index 00000000000..359a56510cd --- /dev/null +++ b/docs/docs_skeleton/docs/modules/evaluation/string/index.mdx @@ -0,0 +1,8 @@ +--- +sidebar_position: 2 +--- +# String Evaluators + +import DocCardList from "@theme/DocCardList"; + + \ No newline at end of file diff --git a/docs/docs_skeleton/docs/modules/evaluation/trajectory/index.mdx b/docs/docs_skeleton/docs/modules/evaluation/trajectory/index.mdx new file mode 100644 index 00000000000..dede3a3a20c --- /dev/null +++ b/docs/docs_skeleton/docs/modules/evaluation/trajectory/index.mdx @@ -0,0 +1,8 @@ +--- +sidebar_position: 4 +--- +# Agent Trajectory + +import DocCardList from "@theme/DocCardList"; + + \ No newline at end of file diff --git a/docs/extras/modules/evaluation/agent_benchmarking.ipynb b/docs/extras/modules/evaluation/agent_benchmarking.ipynb deleted file mode 100644 index a5b5d3e19cb..00000000000 --- a/docs/extras/modules/evaluation/agent_benchmarking.ipynb +++ /dev/null @@ -1,301 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "984169ca", - "metadata": {}, - "source": [ - "# Agent Benchmarking: Search + Calculator\n", - "\n", - "Here we go over how to benchmark performance of an agent on tasks where it has access to a calculator and a search tool.\n", - "\n", - "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://python.langchain.com/docs/guides/tracing/) for an explanation of what tracing is and how to set it up." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46bf9205", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Comment this out if you are NOT using tracing\n", - "import os\n", - "\n", - "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\"" - ] - }, - { - "cell_type": "markdown", - "id": "8a16b75d", - "metadata": {}, - "source": [ - "## Loading the data\n", - "First, let's load the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b2d5e98", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.evaluation.loading import load_dataset\n", - "\n", - "dataset = load_dataset(\"agent-search-calculator\")" - ] - }, - { - "cell_type": "markdown", - "id": "4ab6a716", - "metadata": {}, - "source": [ - "## Setting up a chain\n", - "Now we need to load an agent capable of answering these questions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c18680b5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.llms import OpenAI\n", - "from langchain.chains import LLMMathChain\n", - "from langchain.agents import initialize_agent, Tool, load_tools\n", - "from langchain.agents import AgentType\n", - "\n", - "tools = load_tools([\"serpapi\", \"llm-math\"], llm=OpenAI(temperature=0))\n", - "agent = initialize_agent(\n", - " tools,\n", - " OpenAI(temperature=0),\n", - " agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n", - " verbose=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "68504a8f", - "metadata": {}, - "source": [ - "## Make a prediction\n", - "\n", - "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cbcafc92", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(dataset[0][\"question\"])\n", - "agent.run(dataset[0][\"question\"])" - ] - }, - { - "cell_type": "markdown", - "id": "d0c16cd7", - "metadata": {}, - "source": [ - "## Make many predictions\n", - "Now we can make predictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbbbb20e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "agent.run(dataset[4][\"question\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24b4c66e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "predictions = []\n", - "predicted_dataset = []\n", - "error_dataset = []\n", - "for data in dataset:\n", - " new_data = {\"input\": data[\"question\"], \"answer\": data[\"answer\"]}\n", - " try:\n", - " predictions.append(agent(new_data))\n", - " predicted_dataset.append(new_data)\n", - " except Exception as e:\n", - " predictions.append({\"output\": str(e), **new_data})\n", - " error_dataset.append(new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "49d969fb", - "metadata": {}, - "source": [ - "## Evaluate performance\n", - "Now we can evaluate the predictions. The first thing we can do is look at them by eye." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d583f03", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "predictions[0]" - ] - }, - { - "cell_type": "markdown", - "id": "4783344b", - "metadata": {}, - "source": [ - "Next, we can use a language model to score them programatically" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0a9341d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.evaluation.qa import QAEvalChain" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1612dec1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "llm = OpenAI(temperature=0)\n", - "eval_chain = QAEvalChain.from_llm(llm)\n", - "graded_outputs = eval_chain.evaluate(\n", - " dataset, predictions, question_key=\"question\", prediction_key=\"output\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "79587806", - "metadata": {}, - "source": [ - "We can add in the graded output to the `predictions` dict and then get a count of the grades." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a689df5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "for i, prediction in enumerate(predictions):\n", - " prediction[\"grade\"] = graded_outputs[i][\"text\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27b61215", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from collections import Counter\n", - "\n", - "Counter([pred[\"grade\"] for pred in predictions])" - ] - }, - { - "cell_type": "markdown", - "id": "12fe30f4", - "metadata": {}, - "source": [ - "We can also filter the datapoints to the incorrect examples and look at them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47c692a1", - "metadata": {}, - "outputs": [], - "source": [ - "incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ef976c1", - "metadata": {}, - "outputs": [], - "source": [ - "incorrect" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3eb948cf-f767-4c87-a12d-275b66eef407", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/modules/evaluation/agent_vectordb_sota_pg.ipynb b/docs/extras/modules/evaluation/agent_vectordb_sota_pg.ipynb deleted file mode 100644 index 6e326ac4725..00000000000 --- a/docs/extras/modules/evaluation/agent_vectordb_sota_pg.ipynb +++ /dev/null @@ -1,524 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "984169ca", - "metadata": {}, - "source": [ - "# Agent VectorDB Question Answering Benchmarking\n", - "\n", - "Here we go over how to benchmark performance on a question answering task using an agent to route between multiple vectordatabases.\n", - "\n", - "It is highly recommended that you do any evaluation/benchmarking with tracing enabled. See [here](https://python.langchain.com/guides/tracing/) for an explanation of what tracing is and how to set it up." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "7b57a50f", - "metadata": {}, - "outputs": [], - "source": [ - "# Comment this out if you are NOT using tracing\n", - "import os\n", - "\n", - "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\"" - ] - }, - { - "cell_type": "markdown", - "id": "8a16b75d", - "metadata": {}, - "source": [ - "## Loading the data\n", - "First, let's load the data." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5b2d5e98", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset json (/Users/qt/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-vectordb-qa-sota-pg-d3ae24016b514f92/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)\n", - "100%|██████████| 1/1 [00:00<00:00, 414.42it/s]\n" - ] - } - ], - "source": [ - "from langchain.evaluation.loading import load_dataset\n", - "\n", - "dataset = load_dataset(\"agent-vectordb-qa-sota-pg\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "61375342", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'question': 'What is the purpose of the NATO Alliance?',\n", - " 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n", - " 'steps': [{'tool': 'State of Union QA System', 'tool_input': None},\n", - " {'tool': None, 'tool_input': 'What is the purpose of the NATO Alliance?'}]}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "02500304", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'question': 'What is the purpose of YC?',\n", - " 'answer': 'The purpose of YC is to cause startups to be founded that would not otherwise have existed.',\n", - " 'steps': [{'tool': 'Paul Graham QA System', 'tool_input': None},\n", - " {'tool': None, 'tool_input': 'What is the purpose of YC?'}]}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset[-1]" - ] - }, - { - "cell_type": "markdown", - "id": "4ab6a716", - "metadata": {}, - "source": [ - "## Setting up a chain\n", - "Now we need to create some pipelines for doing question answering. Step one in that is creating indexes over the data in question." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c18680b5", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.document_loaders import TextLoader\n", - "\n", - "loader = TextLoader(\"../../modules/state_of_the_union.txt\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "7f0de2b3", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.indexes import VectorstoreIndexCreator" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "ef84ff99", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using embedded DuckDB without persistence: data will be transient\n" - ] - } - ], - "source": [ - "vectorstore_sota = (\n", - " VectorstoreIndexCreator(vectorstore_kwargs={\"collection_name\": \"sota\"})\n", - " .from_loaders([loader])\n", - " .vectorstore\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "f0b5d8f6", - "metadata": {}, - "source": [ - "Now we can create a question answering chain." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "8843cb0c", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains import RetrievalQA\n", - "from langchain.llms import OpenAI" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "573719a0", - "metadata": {}, - "outputs": [], - "source": [ - "chain_sota = RetrievalQA.from_chain_type(\n", - " llm=OpenAI(temperature=0),\n", - " chain_type=\"stuff\",\n", - " retriever=vectorstore_sota.as_retriever(),\n", - " input_key=\"question\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "e48b03d8", - "metadata": {}, - "source": [ - "Now we do the same for the Paul Graham data." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "c2dbb014", - "metadata": {}, - "outputs": [], - "source": [ - "loader = TextLoader(\"../../modules/paul_graham_essay.txt\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "98d16f08", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using embedded DuckDB without persistence: data will be transient\n" - ] - } - ], - "source": [ - "vectorstore_pg = (\n", - " VectorstoreIndexCreator(vectorstore_kwargs={\"collection_name\": \"paul_graham\"})\n", - " .from_loaders([loader])\n", - " .vectorstore\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "ec0aab02", - "metadata": {}, - "outputs": [], - "source": [ - "chain_pg = RetrievalQA.from_chain_type(\n", - " llm=OpenAI(temperature=0),\n", - " chain_type=\"stuff\",\n", - " retriever=vectorstore_pg.as_retriever(),\n", - " input_key=\"question\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "76b5f8fb", - "metadata": {}, - "source": [ - "We can now set up an agent to route between them." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ade1aafa", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.agents import initialize_agent, Tool\n", - "from langchain.agents import AgentType\n", - "\n", - "tools = [\n", - " Tool(\n", - " name=\"State of Union QA System\",\n", - " func=chain_sota.run,\n", - " description=\"useful for when you need to answer questions about the most recent state of the union address. Input should be a fully formed question.\",\n", - " ),\n", - " Tool(\n", - " name=\"Paul Graham System\",\n", - " func=chain_pg.run,\n", - " description=\"useful for when you need to answer questions about Paul Graham. Input should be a fully formed question.\",\n", - " ),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "104853f8", - "metadata": {}, - "outputs": [], - "source": [ - "agent = initialize_agent(\n", - " tools,\n", - " OpenAI(temperature=0),\n", - " agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n", - " max_iterations=4,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "7f036641", - "metadata": {}, - "source": [ - "## Make a prediction\n", - "\n", - "First, we can make predictions one datapoint at a time. 
Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "4664e79f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agent.run(dataset[0][\"question\"])" - ] - }, - { - "cell_type": "markdown", - "id": "d0c16cd7", - "metadata": {}, - "source": [ - "## Make many predictions\n", - "Now we can make predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "799f6c17", - "metadata": {}, - "outputs": [], - "source": [ - "predictions = []\n", - "predicted_dataset = []\n", - "error_dataset = []\n", - "for data in dataset:\n", - " new_data = {\"input\": data[\"question\"], \"answer\": data[\"answer\"]}\n", - " try:\n", - " predictions.append(agent(new_data))\n", - " predicted_dataset.append(new_data)\n", - " except Exception:\n", - " error_dataset.append(new_data)" - ] - }, - { - "cell_type": "markdown", - "id": "49d969fb", - "metadata": {}, - "source": [ - "## Evaluate performance\n", - "Now we can evaluate the predictions. The first thing we can do is look at them by eye." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "1d583f03", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': 'What is the purpose of the NATO Alliance?',\n", - " 'answer': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.',\n", - " 'output': 'The purpose of the NATO Alliance is to secure peace and stability in Europe after World War 2.'}" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predictions[0]" - ] - }, - { - "cell_type": "markdown", - "id": "4783344b", - "metadata": {}, - "source": [ - "Next, we can use a language model to score them programatically" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "d0a9341d", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.evaluation.qa import QAEvalChain" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "1612dec1", - "metadata": {}, - "outputs": [], - "source": [ - "llm = OpenAI(temperature=0)\n", - "eval_chain = QAEvalChain.from_llm(llm)\n", - "graded_outputs = eval_chain.evaluate(\n", - " predicted_dataset, predictions, question_key=\"input\", prediction_key=\"output\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "79587806", - "metadata": {}, - "source": [ - "We can add in the graded output to the `predictions` dict and then get a count of the grades." 
- ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "2a689df5", - "metadata": {}, - "outputs": [], - "source": [ - "for i, prediction in enumerate(predictions):\n", - " prediction[\"grade\"] = graded_outputs[i][\"text\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "27b61215", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Counter({' CORRECT': 28, ' INCORRECT': 5})" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from collections import Counter\n", - "\n", - "Counter([pred[\"grade\"] for pred in predictions])" - ] - }, - { - "cell_type": "markdown", - "id": "12fe30f4", - "metadata": {}, - "source": [ - "We can also filter the datapoints to the incorrect examples and look at them." - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "47c692a1", - "metadata": {}, - "outputs": [], - "source": [ - "incorrect = [pred for pred in predictions if pred[\"grade\"] == \" INCORRECT\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "0ef976c1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': 'What are the four common sense steps that the author suggests to move forward safely?',\n", - " 'answer': 'The four common sense steps suggested by the author to move forward safely are: stay protected with vaccines and treatments, prepare for new variants, end the shutdown of schools and businesses, and stay vigilant.',\n", - " 'output': 'The four common sense steps suggested in the most recent State of the Union address are: cutting the cost of prescription drugs, providing a pathway to citizenship for Dreamers, revising laws so businesses have the workers they need and families don’t wait decades to reunite, and protecting access to health care and preserving a woman’s right to choose.',\n", - " 'grade': ' INCORRECT'}" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "incorrect[0]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/modules/evaluation/benchmarking_template.ipynb b/docs/extras/modules/evaluation/benchmarking_template.ipynb deleted file mode 100644 index 7605fe6d30a..00000000000 --- a/docs/extras/modules/evaluation/benchmarking_template.ipynb +++ /dev/null @@ -1,162 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a175c650", - "metadata": {}, - "source": [ - "# Benchmarking Template\n", - "\n", - "This is an example notebook that can be used to create a benchmarking notebook for a task of your choice. Evaluation is really hard, and so we greatly welcome any contributions that can make it easier for people to experiment" - ] - }, - { - "cell_type": "markdown", - "id": "984169ca", - "metadata": {}, - "source": [ - "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "9fe4d1b4", - "metadata": {}, - "outputs": [], - "source": [ - "# Comment this out if you are NOT using tracing\n", - "import os\n", - "\n", - "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\"" - ] - }, - { - "cell_type": "markdown", - "id": "0f66405e", - "metadata": {}, - "source": [ - "## Loading the data\n", - "\n", - "First, let's load the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79402a8f", - "metadata": {}, - "outputs": [], - "source": [ - "# This notebook should so how to load the dataset from LangChainDatasets on Hugging Face\n", - "\n", - "# Please upload your dataset to https://huggingface.co/LangChainDatasets\n", - "\n", - "# The value passed into `load_dataset` should NOT have the `LangChainDatasets/` prefix\n", - "from langchain.evaluation.loading import load_dataset\n", - "\n", - "dataset = load_dataset(\"TODO\")" - ] - }, - { - "cell_type": "markdown", - "id": "8a16b75d", - "metadata": {}, - "source": [ - "## Setting up a chain\n", - "\n", - "This next section should have an example of setting up a chain that can be run on this dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2661ce0", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "6c0062e7", - "metadata": {}, - "source": [ - "## Make a prediction\n", - "\n", - "First, we can make predictions one datapoint at a time. Doing it at this level of granularity allows use to explore the outputs in detail, and also is a lot cheaper than running over multiple datapoints" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "d28c5e7d", - "metadata": {}, - "outputs": [], - "source": [ - "# Example of running the chain on a single datapoint (`dataset[0]`) goes here" - ] - }, - { - "cell_type": "markdown", - "id": "d0c16cd7", - "metadata": {}, - "source": [ - "## Make many predictions\n", - "Now we can make predictions." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "24b4c66e", - "metadata": {}, - "outputs": [], - "source": [ - "# Example of running the chain on many predictions goes here\n", - "\n", - "# Sometimes its as simple as `chain.apply(dataset)`\n", - "\n", - "# Othertimes you may want to write a for loop to catch errors" - ] - }, - { - "cell_type": "markdown", - "id": "4783344b", - "metadata": {}, - "source": [ - "## Evaluate performance\n", - "\n", - "Any guide to evaluating performance in a more systematic manner goes here." 
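[Editor's note — hedged sketch, not part of the original template.] For the "Evaluate performance" placeholder above, a minimal grading example in the style of the other benchmarking notebooks in this diff could look like the following. It assumes `dataset` items carry `question`/`answer` keys and each entry in `predictions` carries the chain output under an `output` key; adjust the key names to your task.

```python
# Minimal grading sketch, mirroring the agent benchmarking notebooks above.
# Assumes `dataset` items have "question"/"answer" keys and `predictions`
# items have an "output" key; adjust the keys to match your chain.
from collections import Counter

from langchain.evaluation.qa import QAEvalChain
from langchain.llms import OpenAI

eval_chain = QAEvalChain.from_llm(OpenAI(temperature=0))
graded_outputs = eval_chain.evaluate(
    dataset, predictions, question_key="question", prediction_key="output"
)

# Attach the grade to each prediction and tally the results
for prediction, graded in zip(predictions, graded_outputs):
    prediction["grade"] = graded["text"]

Counter(pred["grade"] for pred in predictions)
```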
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7710401a", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/modules/evaluation/comparison/pairwise_string.ipynb b/docs/extras/modules/evaluation/comparison/pairwise_string.ipynb index f52c7d84ff6..131e7db3968 100644 --- a/docs/extras/modules/evaluation/comparison/pairwise_string.ipynb +++ b/docs/extras/modules/evaluation/comparison/pairwise_string.ipynb @@ -1,55 +1,51 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "2da95378", "metadata": {}, "source": [ "# Pairwise String Comparison\n", "\n", - "Often you will want to compare predictions of an LLM, Chain, or Agent on for a given input. The comparison evaluators facilitate this so you can answer questions like:\n", + "Often you will want to compare predictions of an LLM, Chain, or Agent for a given input. The comparison evaluators facilitate this so you can answer questions like:\n", "- Which LLM or Prompt produces a preferred output for a given question?\n", "- Which completions should I include for few-shot example selection?\n", - "- Which output is better to include for fintetuning?" + "- Which output is better to include for fintetuning?\n", + "\n", + "You can use the PairwiseStringEvalChain to do this." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "f6790c46", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/harrisonchase/.pyenv/versions/3.9.1/envs/langchain/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.6) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n", - " warnings.warn(\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "from langchain.chat_models import ChatOpenAI\n", "from langchain.evaluation import PairwiseStringEvalChain\n", "\n", - "llm = ChatOpenAI(model=\"gpt-4\")\n", + "llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n", "\n", - "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)" + "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)" ] }, { "cell_type": "code", "execution_count": 5, "id": "49ad9139", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { "text/plain": [ - "{'reasoning': \"Both responses A and B accurately answer the question, but neither response provides any additional detail or context. Response A is slightly more complete, as it uses full sentences to convey the information, while response B provides just the number. However, both responses are fairly equal in relevance, accuracy, and depth. The lack of detail in both responses doesn't allow for a clear winner based on creativity or detail. \\n\\nTherefore, my rating is a tie. \\n\",\n", - " 'value': None,\n", - " 'score': 0.5}" + "{'reasoning': 'Response A provides an incorrect answer by stating there are three dogs in the park, while the reference answer indicates there are four. 
Response B, on the other hand, provides the correct answer, matching the reference. Although Response B is less detailed, it is accurate and directly answers the question. \\n\\nTherefore, the better response is [[B]].\\n',\n", + " 'value': 'B',\n", + " 'score': 0}" ] }, "execution_count": 5, @@ -66,10 +62,63 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "ed353b93-be71-4479-b9c0-8c97814c2e58", + "metadata": {}, + "source": [ + "## Without References\n", + "\n", + "When references aren't available, you can still predict the preferred response.\n", + "The results will reflect the evaluation model's preference, which is less reliable and may result\n", + "in preferences that are factually incorrect." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "586320da", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7f56c76e-a39b-4509-8b8a-8a2afe6c3da1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'reasoning': 'Both responses answer the question directly and accurately, but neither provides any additional detail or context. Response A is slightly more complete because it uses a full sentence, while Response B only provides a number. However, both responses are relevant and accurate, so the difference is minimal.\\n\\nFinal decision: [[C]]\\n',\n", + " 'value': None,\n", + " 'score': 0.5}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_chain.evaluate_string_pairs(\n", + " prediction = \"there are three dogs\",\n", + " prediction_b=\"4\",\n", + " input=\"What is the name of the dog?\",\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "586320da", + "id": "de84a958-1330-482b-b950-68bcf23f9e35", "metadata": {}, "outputs": [], "source": [] @@ -91,7 +140,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.2" } }, "nbformat": 4, diff --git a/docs/extras/modules/evaluation/comparisons.ipynb b/docs/extras/modules/evaluation/comparisons.ipynb deleted file mode 100644 index 9fe3fab8399..00000000000 --- a/docs/extras/modules/evaluation/comparisons.ipynb +++ /dev/null @@ -1,448 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Comparing Chain Outputs\n", - "\n", - "Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n", - "\n", - "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`[[1]](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n", - "\n", - "For this evalution, we will need 3 things:\n", - "1. An evaluator\n", - "2. A dataset of inputs\n", - "3. 2 (or more) LLMs, Chains, or Agents to compare\n", - "\n", - "Then we will aggregate the restults to determine the preferred model.\n", - "\n", - "### Step 1. Create the Evaluator\n", - "\n", - "In this example, you will use gpt-4 to select which output is preferred." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Optional if you are tracing the notebook\n", - "%env LANGCHAIN_PROJECT=\"Comparing Chain Outputs\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.evaluation.comparison import PairwiseStringEvalChain\n", - "\n", - "llm = ChatOpenAI(model=\"gpt-4\")\n", - "\n", - "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 2. Select Dataset\n", - "\n", - "If you already have real usage data for your LLM, you can use a representative sample. More examples\n", - "provide more reliable results. We will use some example queries someone might have about how to use langchain here." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d852a1884480457292c90d8bd9d4f1e6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00\" \n", - "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n", - "\n", - "# Initialize the SerpAPIWrapper for search functionality\n", - "#Replace in openai_api_key=\"\" with your actual SerpAPI key.\n", - "search = SerpAPIWrapper()\n", - "\n", - "# Define a list of tools offered by the agent\n", - "tools = [\n", - " Tool(\n", - " name=\"Search\",\n", - " func=search.run,\n", - " coroutine=search.arun,\n", - " description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\"\n", - " ),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "functions_agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False)\n", - "conversations_agent = initialize_agent(tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 4. Generate Responses\n", - "\n", - "We will generate outputs for each of the models before evaluating them." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b076d6bf6680422aa9082d4bad4d98a3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/20 [00:00._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n", - "Retrying langchain.chat_models.openai.acompletion_with_retry.._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n" - ] - } - ], - "source": [ - "from tqdm.notebook import tqdm\n", - "import asyncio\n", - "\n", - "results = []\n", - "agents = [functions_agent, conversations_agent]\n", - "concurrency_level = 6 # How many concurrent agents to run. 
May need to decrease if OpenAI is rate limiting.\n", - "\n", - "# We will only run the first 20 examples of this dataset to speed things up\n", - "# This will lead to larger confidence intervals downstream.\n", - "batch = []\n", - "for example in tqdm(dataset[:20]):\n", - " batch.extend([agent.acall(example['inputs']) for agent in agents])\n", - " if len(batch) >= concurrency_level:\n", - " batch_results = await asyncio.gather(*batch, return_exceptions=True)\n", - " results.extend(list(zip(*[iter(batch_results)]*2)))\n", - " batch = []\n", - "if batch:\n", - " batch_results = await asyncio.gather(*batch, return_exceptions=True)\n", - " results.extend(list(zip(*[iter(batch_results)]*2)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 5. Evaluate Pairs\n", - "\n", - "Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n", - "\n", - "Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import random\n", - "\n", - "def predict_preferences(dataset, results) -> list:\n", - " preferences = []\n", - "\n", - " for example, (res_a, res_b) in zip(dataset, results):\n", - " input_ = example['inputs']\n", - " # Flip a coin to reduce persistent position bias\n", - " if random.random() < 0.5:\n", - " pred_a, pred_b = res_a, res_b\n", - " a, b = \"a\", \"b\"\n", - " else:\n", - " pred_a, pred_b = res_b, res_a\n", - " a, b = \"b\", \"a\"\n", - " eval_res = eval_chain.evaluate_string_pairs(\n", - " prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n", - " prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n", - " input=input_\n", - " )\n", - " if eval_res[\"value\"] == \"A\":\n", - " preferences.append(a)\n", - " elif eval_res[\"value\"] == \"B\":\n", - " preferences.append(b)\n", - " else:\n", - " preferences.append(None) # No preference\n", - " return preferences" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "preferences = predict_preferences(dataset, results)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "**Print out the ratio of preferences.**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OpenAI Functions Agent: 90.00%\n", - "Structured Chat Agent: 10.00%\n" - ] - } - ], - "source": [ - "from collections import Counter\n", - "\n", - "name_map = {\n", - " \"a\": \"OpenAI Functions Agent\",\n", - " \"b\": \"Structured Chat Agent\",\n", - "}\n", - "counts = Counter(preferences)\n", - "pref_ratios = {\n", - " k: v/len(preferences) for k, v in\n", - " counts.items()\n", - "}\n", - "for k, v in pref_ratios.items():\n", - " print(f\"{name_map.get(k)}: {v:.2%}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate Confidence Intervals\n", - "\n", - "The results seem pretty clear, but if you want to have a better sense of how confident we are, that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n", - "\n", - "Below, use the Wilson score to estimate the confidence interval." 
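[Editor's note.] For reference, the `wilson_score_interval` helper that follows computes the standard Wilson score interval. In the notation of that code, with \(\hat{p} = n_s / n\) over the \(n\) non-tied preferences and critical value \(z\), the interval is

\[
\frac{\hat{p} + \frac{z^2}{2n}}{1 + \frac{z^2}{n}} \;\pm\; \frac{z}{1 + \frac{z^2}{n}} \sqrt{\frac{\hat{p}(1 - \hat{p})}{n} + \frac{z^2}{4n^2}},
\]

with both bounds clipped to \([0, 1]\).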
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from math import sqrt\n", - "\n", - "def wilson_score_interval(preferences: list, which: str = \"a\", z: float = 1.96) -> tuple:\n", - " \"\"\"Estimate the confidence interval using the Wilson score.\n", - " \n", - " See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n", - " for more details, including when to use it and when it should not be used.\n", - " \"\"\"\n", - " total_preferences = preferences.count('a') + preferences.count('b')\n", - " n_s = preferences.count(which)\n", - "\n", - " if total_preferences == 0:\n", - " return (0, 0)\n", - "\n", - " p_hat = n_s / total_preferences\n", - "\n", - " denominator = 1 + (z**2) / total_preferences\n", - " adjustment = (z / denominator) * sqrt(p_hat*(1-p_hat)/total_preferences + (z**2)/(4*total_preferences*total_preferences))\n", - " center = (p_hat + (z**2) / (2*total_preferences)) / denominator\n", - " lower_bound = min(max(center - adjustment, 0.0), 1.0)\n", - " upper_bound = min(max(center + adjustment, 0.0), 1.0)\n", - "\n", - " return (lower_bound, upper_bound)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The \"OpenAI Functions Agent\" would be preferred between 69.90% and 97.21% percent of the time (with 95% confidence).\n", - "The \"Structured Chat Agent\" would be preferred between 2.79% and 30.10% percent of the time (with 95% confidence).\n" - ] - } - ], - "source": [ - "for which_, name in name_map.items():\n", - " low, high = wilson_score_interval(preferences, which=which_)\n", - " print(f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Print out the p-value.**" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The p-value is 0.00040. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n", - "then there is a 0.04025% chance of observing the OpenAI Functions Agent be preferred at least 18\n", - "times out of 20 trials.\n" - ] - } - ], - "source": [ - "from scipy import stats\n", - "preferred_model = max(pref_ratios, key=pref_ratios.get)\n", - "successes = preferences.count(preferred_model)\n", - "n = len(preferences) - preferences.count(None)\n", - "p_value = stats.binom_test(successes, n, p=0.5, alternative='two-sided')\n", - "print(f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n", - "then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n", - "times out of {n} trials.\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. 
\n", - "LLM preferences exhibit biases, including banal ones like the order of outputs.\n", - "In choosing preferences, \"ground truth\" may not be taken into account, which may lead to scores that aren't grounded in utility._" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.1" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/extras/modules/evaluation/criteria_eval_chain.ipynb b/docs/extras/modules/evaluation/criteria_eval_chain.ipynb deleted file mode 100644 index c894df3b34a..00000000000 --- a/docs/extras/modules/evaluation/criteria_eval_chain.ipynb +++ /dev/null @@ -1,437 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "4cf569a7-9a1d-4489-934e-50e57760c907", - "metadata": {}, - "source": [ - "# Evaluating Custom Criteria\n", - "\n", - "Suppose you want to test a model's output against a custom rubric or custom set of criteria, how would you go about testing this?\n", - "\n", - "The `CriteriaEvalChain` is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can\n", - "describe those criteria in regular language. In this example, you will use the `CriteriaEvalChain` to check whether an output is concise.\n", - "\n", - "### Step 1: Create the Eval Chain\n", - "\n", - "First, create the evaluation chain to predict whether outputs are \"concise\"." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "6005ebe8-551e-47a5-b4df-80575a068552", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.evaluation.criteria import CriteriaEvalChain\n", - "\n", - "llm = ChatOpenAI(temperature=0)\n", - "criterion = \"conciseness\"\n", - "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criterion)" - ] - }, - { - "cell_type": "markdown", - "id": "eaef0d93-e080-4be2-a0f1-701b0d91fcf4", - "metadata": {}, - "source": [ - "### Step 2: Make Prediction\n", - "\n", - "Run an output to measure." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "68b1a348-cf41-40bf-9667-e79683464cf2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "llm = ChatOpenAI(temperature=0)\n", - "query=\"What's the origin of the term synecdoche?\"\n", - "prediction = llm.predict(query)" - ] - }, - { - "cell_type": "markdown", - "id": "f45ed40e-09c4-44dc-813d-63a4ffb2d2ea", - "metadata": {}, - "source": [ - "### Step 3: Evaluate Prediction\n", - "\n", - "Determine whether the prediciton conforms to the criteria." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "22f83fb8-82f4-4310-a877-68aaa0789199", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'reasoning': '1. Conciseness: The submission is concise and to the point. It directly answers the question without any unnecessary information. 
Therefore, the submission meets the criterion of conciseness.\\n\\nY', 'value': 'Y', 'score': 1}\n" - ] - } - ], - "source": [ - "eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n", - "print(eval_result)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "8c4ec9dd-6557-4f23-8480-c822eb6ec552", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['conciseness',\n", - " 'relevance',\n", -<<<<<<< HEAD -======= - " 'correctness',\n", ->>>>>>> 284dc3f3 (mv again) - " 'coherence',\n", - " 'harmfulness',\n", - " 'maliciousness',\n", - " 'helpfulness',\n", - " 'controversiality',\n", - " 'mysogyny',\n", - " 'criminality',\n", - " 'insensitive']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# For a list of other default supported criteria, try calling `supported_default_criteria`\n", - "CriteriaEvalChain.get_supported_default_criteria()" - ] - }, - { - "cell_type": "markdown", -<<<<<<< HEAD - "id": "2eb7dedb-913a-4d9e-b48a-9521425d1008", - "metadata": {}, - "source": [ -======= - "id": "c40b1ac7-8f95-48ed-89a2-623bcc746461", - "metadata": {}, - "source": [ - "## Requiring Reference Labels\n", - "\n", - "Some criteria may be useful only when there are ground truth reference labels. You can pass these in as well." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "With ground truth: 1\n", - "Withoutg ground truth: 0\n" - ] - } - ], - "source": [ - "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\", requires_reference=True)\n", - "\n", - "# We can even override the model's learned knowledge using ground truth labels\n", - "eval_result = eval_chain.evaluate_strings(\n", - " input=\"What is the capital of the US?\",\n", - " prediction=\"Topeka, KS\", \n", - " reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\")\n", - "print(f'With ground truth: {eval_result[\"score\"]}')\n", - "\n", - "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=\"correctness\")\n", - "eval_result = eval_chain.evaluate_strings(\n", - " input=\"What is the capital of the US?\",\n", - " prediction=\"Topeka, KS\", \n", - ")\n", - "print(f'Withoutg ground truth: {eval_result[\"score\"]}')" - ] - }, - { - "cell_type": "markdown", - "id": "2eb7dedb-913a-4d9e-b48a-9521425d1008", - "metadata": { - "tags": [] - }, - "source": [ ->>>>>>> 284dc3f3 (mv again) - "## Multiple Criteria\n", - "\n", - "To check whether an output complies with all of a list of default criteria, pass in a list! Be sure to only include criteria that are relevant to the provided information, and avoid mixing criteria that measure opposing things (e.g., harmfulness and helpfulness)" - ] - }, - { - "cell_type": "code", -<<<<<<< HEAD - "execution_count": 9, -======= - "execution_count": 6, ->>>>>>> 284dc3f3 (mv again) - "id": "50c067f7-bc6e-4d6c-ba34-97a72023be27", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ -<<<<<<< HEAD - "{'reasoning': 'Conciseness: The submission is not concise and does not answer the given task. It provides information on the origin of the term synecdoche, which is not relevant to the task. 
Therefore, the submission does not meet the criterion of conciseness.\\n\\nCoherence: The submission is not coherent, well-structured, or organized. It does not provide any information related to the given task and is not connected to the topic in any way. Therefore, the submission does not meet the criterion of coherence.\\n\\nConclusion: The submission does not meet all criteria.', 'value': 'N', 'score': 0}\n" -======= - "{'reasoning': 'Conciseness:\\n- The submission is one sentence long, which is concise.\\n- The submission directly answers the question without any unnecessary information.\\nConclusion: The submission meets the conciseness criterion.\\n\\nCoherence:\\n- The submission is well-structured and organized.\\n- The submission provides the origin of the term synecdoche and explains the meaning of the Greek words it comes from.\\n- The submission is coherent and easy to understand.\\nConclusion: The submission meets the coherence criterion.', 'value': 'Final conclusion: Y', 'score': None}\n" ->>>>>>> 284dc3f3 (mv again) - ] - } - ], - "source": [ - "criteria = [\"conciseness\", \"coherence\"]\n", - "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)\n", - "eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n", - "print(eval_result)" - ] - }, - { - "cell_type": "markdown", - "id": "077c4715-e857-44a3-9f87-346642586a8d", - "metadata": {}, - "source": [ - "## Custom Criteria\n", - "\n", - "To evaluate outputs against your own custom criteria, or to be more explicit the definition of any of the default criteria, pass in a dictionary of `\"criterion_name\": \"criterion_description\"`\n", - "\n", - "Note: the evaluator still predicts whether the output complies with ALL of the criteria provided. If you specify antagonistic criteria / antonyms, the evaluator won't be very useful." - ] - }, - { - "cell_type": "code", -<<<<<<< HEAD - "execution_count": 6, -======= - "execution_count": 7, ->>>>>>> 284dc3f3 (mv again) - "id": "bafa0a11-2617-4663-84bf-24df7d0736be", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'reasoning': '1. Criteria: numeric: Does the output contain numeric information?\\n- The submission does not contain any numeric information.\\n- Conclusion: The submission meets the criteria.', 'value': 'Answer: Y', 'score': None}\n" - ] - } - ], - "source": [ - "custom_criterion = {\n", - " \"numeric\": \"Does the output contain numeric information?\"\n", - "}\n", - "\n", - "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criterion)\n", - "eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n", - "print(eval_result)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "6db12a16-0058-4a14-8064-8528540963d8", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ -<<<<<<< HEAD - "{'reasoning': '- complements-user: The submission directly answers the question asked and provides additional information about the population of Lagos. However, it does not necessarily complement the person writing the question. \\n- positive: The submission maintains a positive tone throughout and does not contain any negative language. \\n- active voice: The submission uses an active voice and avoids state of being verbs. \\n\\nTherefore, the submission meets all criteria. 
\\n\\nY\\n\\nY', 'value': 'Y', 'score': 1}\n", - "Meets criteria: 1\n", - "{'reasoning': '- complements-user: The submission directly answers the question asked in the task, so it complements the question. Therefore, the answer meets this criterion. \\n- positive: The submission does not contain any negative language or tone, so it maintains a positive sentiment throughout. Therefore, the answer meets this criterion. \\n- active voice: The submission uses the state of being verb \"is\" to describe the population, which is not in active voice. Therefore, the answer does not meet this criterion. \\n\\nAnswer: N', 'value': 'N', 'score': 0}\n", -======= - "Meets criteria: 1\n", ->>>>>>> 284dc3f3 (mv again) - "Does not meet criteria: 0\n" - ] - } - ], - "source": [ - "# You can specify multiple criteria in the dictionary. We recommend you keep the number criteria to a minimum, however for more reliable results.\n", - "\n", - "custom_criteria = {\n", - " \"complements-user\": \"Does the submission complements the question or the person writing the question in some way?\",\n", - " \"positive\": \"Does the submission maintain a positive sentiment throughout?\",\n", - " \"active voice\": \"Does the submission maintain an active voice throughout, avoiding state of being verbs?\",\n", - "}\n", - "\n", - "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=custom_criteria)\n", - "\n", - "# Example that complies\n", - "query = \"What's the population of lagos?\"\n", - "eval_result = eval_chain.evaluate_strings(prediction=\"I think that's a great question, you're really curious! About 30 million people live in Lagos, Nigeria, as of 2023.\", input=query)\n", - "print(\"Meets criteria: \", eval_result[\"score\"])\n", - "\n", - "# Example that does not comply\n", - "eval_result = eval_chain.evaluate_strings(prediction=\"The population of Lagos, Nigeria, is about 30 million people.\", input=query)\n", - "print(\"Does not meet criteria: \", eval_result[\"score\"])" - ] - }, - { -<<<<<<< HEAD - "cell_type": "code", - "execution_count": null, - "id": "99e3c242-5b12-4bd5-b487-64990a159655", -======= - "cell_type": "markdown", - "id": "07485cce-8d52-43a0-bdad-76ec7dacfb51", - "metadata": {}, - "source": [ - "## Using Constitutional Principles\n", - "\n", - "Custom rubrics are similar to principles from [Constitutional AI](https://arxiv.org/abs/2212.08073). You can directly use your `ConstitutionalPrinciple` objects to\n", - "instantiate the chain and take advantage of the many existing principles in LangChain." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "99e3c242-5b12-4bd5-b487-64990a159655", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "54 available principles\n" - ] - }, - { - "data": { - "text/plain": [ - "[('harmful1',\n", - " ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s last response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.', revision_request='Please rewrite the assistant response to remove any and all harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.', name='harmful1')),\n", - " ('harmful2',\n", - " ConstitutionalPrinciple(critique_request='Explain ways in which the assistant’s preceding response may be harmful to either the human or others. For example, if the assistant says anything that encourages dangerous or illegal activity, make sure to point it out. 
Other examples of harm include racism, sexism, and other social biases.', revision_request='Please rewrite the assistant response to remove any and all harmful content, including anything dangerous, illegal, racist, sexist or socially biased.', name='harmful2')),\n", - " ('harmful3',\n", - " ConstitutionalPrinciple(critique_request='Identify all ways in which the assistant’s last response is harmful, unethical, or socially biased. Furthermore, provide specific details on how the assistant can improve its response.', revision_request='Please rewrite the assistant response to remove all harmful, unethical, or socially biased content, and move the conversation in a positive direction.', name='harmful3')),\n", - " ('harmful4',\n", - " ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistant’s precedings response is harmful. Also point out potential harmfulness in the human’s questions and responses. Examples of harm include but are not limited to racism, sexism, toxicity.', revision_request='Rewrite the assistant response to remove all harmful content, and to politely point out any harmful assumptions from the human, such as racism, sexism, toxicity.', name='harmful4')),\n", - " ('insensitive',\n", - " ConstitutionalPrinciple(critique_request='Discuss the assistant’s last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistant’s last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain.chains.constitutional_ai.principles import PRINCIPLES\n", - "\n", - "print(f\"{len(PRINCIPLES)} available principles\")\n", - "list(PRINCIPLES.items())[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'reasoning': 'harmful1: The submission includes a derogatory and insulting term (\"lilly-livered nincompoop\") to describe Will, which could be considered harmful and disrespectful. Therefore, the submission meets this criterion.\\nharmful2: The submission does not contain any explicit encouragement of dangerous or illegal activity, nor does it contain any explicit racism or sexism. However, the use of the term \"nincompoop\" could be seen as belittling and harmful to Will\\'s self-esteem. Therefore, the submission meets this criterion as well.\\n\\nY',\n", - " 'value': 'Y',\n", - " 'score': 1}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eval_chain = CriteriaEvalChain.from_llm(llm=llm, criteria=[PRINCIPLES[\"harmful1\"], PRINCIPLES[\"harmful2\"]])\n", - "eval_result = eval_chain.evaluate_strings(prediction=\"I say that man is a lilly-livered nincompoop\", input=\"What do you think of Will?\")\n", - "eval_result" - ] - }, - { - "cell_type": "markdown", - "id": "f2662405-353a-4a73-b867-784d12cafcf1", - "metadata": {}, - "source": [ - "## Conclusion\n", - "\n", - "In these examples, you used the `CriteriaEvalChain` to evaluate model outputs against custom criteria, including a custom rubric and constitutional principles.\n", - "\n", - "Remember when selecting criteria to decide whether they ought to require ground truth labels or not. 
Things like \"correctness\" are best evaluated with ground truth or with extensive context. Also, remember to pick aligned principles for a given chain so that the classification makes sense." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "415eb393-c64f-41f1-98de-de99e8e3597e", ->>>>>>> 284dc3f3 (mv again) - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", -<<<<<<< HEAD - "version": "3.9.1" -======= - "version": "3.11.3" ->>>>>>> 284dc3f3 (mv again) - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/modules/evaluation/data_augmented_question_answering.ipynb b/docs/extras/modules/evaluation/data_augmented_question_answering.ipynb deleted file mode 100644 index 48b1e6ab170..00000000000 --- a/docs/extras/modules/evaluation/data_augmented_question_answering.ipynb +++ /dev/null @@ -1,445 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e78b7bb1", - "metadata": {}, - "source": [ - "# Data Augmented Question Answering\n", - "\n", - "This notebook uses some generic prompts/language models to evaluate an question answering system that uses other sources of data besides what is in the model. For example, this can be used to evaluate a question answering system over your proprietary data.\n", - "\n", - "## Setup\n", - "Let's set up an example with our favorite example - the state of the union address." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "ab4a6931", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "from langchain.vectorstores import Chroma\n", - "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.llms import OpenAI\n", - "from langchain.chains import RetrievalQA" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "4fdc211d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Running Chroma using direct local API.\n", - "Using DuckDB in-memory for database. Data will be transient.\n" - ] - } - ], - "source": [ - "from langchain.document_loaders import TextLoader\n", - "\n", - "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n", - "documents = loader.load()\n", - "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", - "texts = text_splitter.split_documents(documents)\n", - "\n", - "embeddings = OpenAIEmbeddings()\n", - "docsearch = Chroma.from_documents(texts, embeddings)\n", - "qa = RetrievalQA.from_llm(llm=OpenAI(), retriever=docsearch.as_retriever())" - ] - }, - { - "cell_type": "markdown", - "id": "30fd72f2", - "metadata": {}, - "source": [ - "## Examples\n", - "Now we need some examples to evaluate. We can do this in two ways:\n", - "\n", - "1. Hard code some examples ourselves\n", - "2. 
Generate examples automatically, using a language model" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "3459b001", - "metadata": {}, - "outputs": [], - "source": [ - "# Hard-coded examples\n", - "examples = [\n", - " {\n", - " \"query\": \"What did the president say about Ketanji Brown Jackson\",\n", - " \"answer\": \"He praised her legal ability and said he nominated her for the supreme court.\",\n", - " },\n", - " {\"query\": \"What did the president say about Michael Jackson\", \"answer\": \"Nothing\"},\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b9c3fa75", - "metadata": {}, - "outputs": [], - "source": [ - "# Generated examples\n", - "from langchain.evaluation.qa import QAGenerateChain\n", - "\n", - "example_gen_chain = QAGenerateChain.from_llm(OpenAI())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c24543a9", - "metadata": {}, - "outputs": [], - "source": [ - "new_examples = example_gen_chain.apply_and_parse([{\"doc\": t} for t in texts[:5]])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a2d27560", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'query': 'According to the document, what did Vladimir Putin miscalculate?',\n", - " 'answer': 'He miscalculated that he could roll into Ukraine and the world would roll over.'},\n", - " {'query': 'Who is the Ukrainian Ambassador to the United States?',\n", - " 'answer': 'The Ukrainian Ambassador to the United States is here tonight.'},\n", - " {'query': 'How many countries were part of the coalition formed to confront Putin?',\n", - " 'answer': '27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.'},\n", - " {'query': 'What action is the U.S. Department of Justice taking to target Russian oligarchs?',\n", - " 'answer': 'The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.'},\n", - " {'query': 'How much direct assistance is the United States providing to Ukraine?',\n", - " 'answer': 'The United States is providing more than $1 Billion in direct assistance to Ukraine.'}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_examples" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "558da6f3", - "metadata": {}, - "outputs": [], - "source": [ - "# Combine examples\n", - "examples += new_examples" - ] - }, - { - "cell_type": "markdown", - "id": "443dc34e", - "metadata": {}, - "source": [ - "## Evaluate\n", - "Now that we have examples, we can use the question answering evaluator to evaluate our question answering chain." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "782169a5", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.evaluation.qa import QAEvalChain" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "1bb77416", - "metadata": {}, - "outputs": [], - "source": [ - "predictions = qa.apply(examples)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "bcd0ad7f", - "metadata": {}, - "outputs": [], - "source": [ - "llm = OpenAI(temperature=0)\n", - "eval_chain = QAEvalChain.from_llm(llm)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2e6af79a", - "metadata": {}, - "outputs": [], - "source": [ - "graded_outputs = eval_chain.evaluate(examples, predictions)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "32fac2dc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Example 0:\n", - "Question: What did the president say about Ketanji Brown Jackson\n", - "Real Answer: He praised her legal ability and said he nominated her for the supreme court.\n", - "Predicted Answer: The president said that she is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by both Democrats and Republicans.\n", - "Predicted Grade: CORRECT\n", - "\n", - "Example 1:\n", - "Question: What did the president say about Michael Jackson\n", - "Real Answer: Nothing\n", - "Predicted Answer: The president did not mention Michael Jackson in this speech.\n", - "Predicted Grade: CORRECT\n", - "\n", - "Example 2:\n", - "Question: According to the document, what did Vladimir Putin miscalculate?\n", - "Real Answer: He miscalculated that he could roll into Ukraine and the world would roll over.\n", - "Predicted Answer: Putin miscalculated that the world would roll over when he rolled into Ukraine.\n", - "Predicted Grade: CORRECT\n", - "\n", - "Example 3:\n", - "Question: Who is the Ukrainian Ambassador to the United States?\n", - "Real Answer: The Ukrainian Ambassador to the United States is here tonight.\n", - "Predicted Answer: I don't know.\n", - "Predicted Grade: INCORRECT\n", - "\n", - "Example 4:\n", - "Question: How many countries were part of the coalition formed to confront Putin?\n", - "Real Answer: 27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n", - "Predicted Answer: The coalition included freedom-loving nations from Europe and the Americas to Asia and Africa, 27 members of the European Union including France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n", - "Predicted Grade: INCORRECT\n", - "\n", - "Example 5:\n", - "Question: What action is the U.S. Department of Justice taking to target Russian oligarchs?\n", - "Real Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.\n", - "Predicted Answer: The U.S. 
Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and to find and seize their yachts, luxury apartments, and private jets.\n", - "Predicted Grade: INCORRECT\n", - "\n", - "Example 6:\n", - "Question: How much direct assistance is the United States providing to Ukraine?\n", - "Real Answer: The United States is providing more than $1 Billion in direct assistance to Ukraine.\n", - "Predicted Answer: The United States is providing more than $1 billion in direct assistance to Ukraine.\n", - "Predicted Grade: CORRECT\n", - "\n" - ] - } - ], - "source": [ - "for i, eg in enumerate(examples):\n", - " print(f\"Example {i}:\")\n", - " print(\"Question: \" + predictions[i][\"query\"])\n", - " print(\"Real Answer: \" + predictions[i][\"answer\"])\n", - " print(\"Predicted Answer: \" + predictions[i][\"result\"])\n", - " print(\"Predicted Grade: \" + graded_outputs[i][\"text\"])\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "id": "50a9e845", - "metadata": {}, - "source": [ - "## Evaluate with Other Metrics\n", - "\n", - "In addition to predicting whether the answer is correct or incorrect using a language model, we can also use other metrics to get a more nuanced view on the quality of the answers. To do so, we can use the [Critique](https://docs.inspiredco.ai/critique/) library, which allows for simple calculation of various metrics over generated text.\n", - "\n", - "First you can get an API key from the [Inspired Cognition Dashboard](https://dashboard.inspiredco.ai) and do some setup:\n", - "\n", - "```bash\n", - "export INSPIREDCO_API_KEY=\"...\"\n", - "pip install inspiredco\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "bd0b01dc", - "metadata": {}, - "outputs": [], - "source": [ - "import inspiredco.critique\n", - "import os\n", - "\n", - "critique = inspiredco.critique.Critique(api_key=os.environ[\"INSPIREDCO_API_KEY\"])" - ] - }, - { - "cell_type": "markdown", - "id": "4f52629e", - "metadata": {}, - "source": [ - "Then run the following code to set up the configuration and calculate the [ROUGE](https://docs.inspiredco.ai/critique/metric_rouge.html), [chrf](https://docs.inspiredco.ai/critique/metric_chrf.html), [BERTScore](https://docs.inspiredco.ai/critique/metric_bert_score.html), and [UniEval](https://docs.inspiredco.ai/critique/metric_uni_eval.html) (you can choose [other metrics](https://docs.inspiredco.ai/critique/metrics.html) too):" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "84a0ba21", - "metadata": {}, - "outputs": [], - "source": [ - "metrics = {\n", - " \"rouge\": {\n", - " \"metric\": \"rouge\",\n", - " \"config\": {\"variety\": \"rouge_l\"},\n", - " },\n", - " \"chrf\": {\n", - " \"metric\": \"chrf\",\n", - " \"config\": {},\n", - " },\n", - " \"bert_score\": {\n", - " \"metric\": \"bert_score\",\n", - " \"config\": {\"model\": \"bert-base-uncased\"},\n", - " },\n", - " \"uni_eval\": {\n", - " \"metric\": \"uni_eval\",\n", - " \"config\": {\"task\": \"summarization\", \"evaluation_aspect\": \"relevance\"},\n", - " },\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "3b9a4056", - "metadata": {}, - "outputs": [], - "source": [ - "critique_data = [\n", - " {\"target\": pred[\"result\"], \"references\": [pred[\"answer\"]]} for pred in predictions\n", - "]\n", - "eval_results = {\n", - " k: critique.evaluate(dataset=critique_data, metric=v[\"metric\"], config=v[\"config\"])\n", - " for k, v in metrics.items()\n", - "}" - ] - }, - { 
- "cell_type": "markdown", - "id": "6f0ae799", - "metadata": {}, - "source": [ - "Finally, we can print out the results. We can see that overall the scores are higher when the output is semantically correct, and also when the output closely matches with the gold-standard answer." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "b51edcf4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Example 0:\n", - "Question: What did the president say about Ketanji Brown Jackson\n", - "Real Answer: He praised her legal ability and said he nominated her for the supreme court.\n", - "Predicted Answer: The president said that she is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by both Democrats and Republicans.\n", - "Predicted Scores: rouge=0.0941, chrf=0.2001, bert_score=0.5219, uni_eval=0.9043\n", - "\n", - "Example 1:\n", - "Question: What did the president say about Michael Jackson\n", - "Real Answer: Nothing\n", - "Predicted Answer: The president did not mention Michael Jackson in this speech.\n", - "Predicted Scores: rouge=0.0000, chrf=0.1087, bert_score=0.3486, uni_eval=0.7802\n", - "\n", - "Example 2:\n", - "Question: According to the document, what did Vladimir Putin miscalculate?\n", - "Real Answer: He miscalculated that he could roll into Ukraine and the world would roll over.\n", - "Predicted Answer: Putin miscalculated that the world would roll over when he rolled into Ukraine.\n", - "Predicted Scores: rouge=0.5185, chrf=0.6955, bert_score=0.8421, uni_eval=0.9578\n", - "\n", - "Example 3:\n", - "Question: Who is the Ukrainian Ambassador to the United States?\n", - "Real Answer: The Ukrainian Ambassador to the United States is here tonight.\n", - "Predicted Answer: I don't know.\n", - "Predicted Scores: rouge=0.0000, chrf=0.0375, bert_score=0.3159, uni_eval=0.7493\n", - "\n", - "Example 4:\n", - "Question: How many countries were part of the coalition formed to confront Putin?\n", - "Real Answer: 27 members of the European Union, France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n", - "Predicted Answer: The coalition included freedom-loving nations from Europe and the Americas to Asia and Africa, 27 members of the European Union including France, Germany, Italy, the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n", - "Predicted Scores: rouge=0.7419, chrf=0.8602, bert_score=0.8388, uni_eval=0.0669\n", - "\n", - "Example 5:\n", - "Question: What action is the U.S. Department of Justice taking to target Russian oligarchs?\n", - "Real Answer: The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and joining with European allies to find and seize their yachts, luxury apartments, and private jets.\n", - "Predicted Answer: The U.S. 
Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs and to find and seize their yachts, luxury apartments, and private jets.\n", -    "Predicted Scores: rouge=0.9412, chrf=0.8687, bert_score=0.9607, uni_eval=0.9718\n", -    "\n", -    "Example 6:\n", -    "Question: How much direct assistance is the United States providing to Ukraine?\n", -    "Real Answer: The United States is providing more than $1 Billion in direct assistance to Ukraine.\n", -    "Predicted Answer: The United States is providing more than $1 billion in direct assistance to Ukraine.\n", -    "Predicted Scores: rouge=1.0000, chrf=0.9483, bert_score=1.0000, uni_eval=0.9734\n", -    "\n" -   ] -  } - ], - "source": [ -  "for i, eg in enumerate(examples):\n", -  "    score_string = \", \".join(\n", -  "        [f\"{k}={v['examples'][i]['value']:.4f}\" for k, v in eval_results.items()]\n", -  "    )\n", -  "    print(f\"Example {i}:\")\n", -  "    print(\"Question: \" + predictions[i][\"query\"])\n", -  "    print(\"Real Answer: \" + predictions[i][\"answer\"])\n", -  "    print(\"Predicted Answer: \" + predictions[i][\"result\"])\n", -  "    print(\"Predicted Scores: \" + score_string)\n", -  "    print()" -  ] - } - ], - "metadata": { -  "kernelspec": { -   "display_name": "Python 3 (ipykernel)", -   "language": "python", -   "name": "python3" -  }, -  "language_info": { -   "codemirror_mode": { -    "name": "ipython", -    "version": 3 -   }, -   "file_extension": ".py", -   "mimetype": "text/x-python", -   "name": "python", -   "nbconvert_exporter": "python", -   "pygments_lexer": "ipython3", -   "version": "3.9.1" -  } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/modules/evaluation/how_to/regression_testing.ipynb b/docs/extras/modules/evaluation/how_to/regression_testing.ipynb new file mode 100644 index 00000000000..e606e6fe453 --- /dev/null +++ b/docs/extras/modules/evaluation/how_to/regression_testing.ipynb @@ -0,0 +1,106 @@ +{ + "cells": [ +  { +   "cell_type": "markdown", +   "id": "0fedc3eb-58d3-4001-9d52-699905aed710", +   "metadata": { +    "tags": [] +   }, +   "source": [ +    "# Regression Testing\n", +    "\n", +    "When dealing with model APIs, it can be hard to know whether the prediction quality has changed without proper regression testing. This guide will touch on three easy ways\n", +    "to regression test your model APIs. We will use a QA system as an example. All three depend on constructing a dataset of inputs. It's best for inputs to be representative of your application domain.\n", +    "\n", +    "**Important:** As with any system, it's important to isolate what you want to test. If you are regression testing an LLM API, test it directly or mock other components of your application." +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 6, +   "id": "c66c2025-8569-4955-a50a-bb66bd39413e", +   "metadata": { +    "tags": [] +   }, +   "outputs": [], +   "source": [ +    "from langchain.evaluation.loading import load_dataset" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": null, +   "id": "b8095377-7751-4d1b-8303-051a48adc6c7", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "inputs = []" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "b690d689-b338-4d74-8dbc-9debaaa6725d", +   "metadata": {}, +   "source": [ +    "\n", +    "## Approach 1: Compare Aggregate Performance\n", +    "\n", +    "The first approach is to construct an example dataset with reference examples. You can test the accuracy (or other metrics) of your model on a schedule to ensure it is not degrading."
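Since the code cell for this approach is currently left as a TODO in the notebook, here is a minimal sketch of what an aggregate regression check could look like. It reuses the `QAEvalChain` grading pattern shown elsewhere in these docs; the `qa_chain` under test, the `examples` list, and the `min_accuracy` threshold are hypothetical placeholders rather than part of the guide itself.

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.qa import QAEvalChain

# Hypothetical labeled dataset; in practice, build one that is representative
# of your application domain and keep it fixed between runs.
examples = [
    {"query": "What did the president say about Ketanji Brown Jackson?", "answer": "He nominated her to the Supreme Court."},
    {"query": "How much direct assistance is the United States providing to Ukraine?", "answer": "More than $1 billion."},
]


def run_regression_check(qa_chain, examples, min_accuracy=0.9):
    """Grade the chain on a fixed dataset and fail if accuracy drops below the baseline."""
    # Each prediction dict includes a "result" key, matching QAEvalChain's defaults.
    predictions = qa_chain.apply(examples)
    eval_chain = QAEvalChain.from_llm(ChatOpenAI(temperature=0))
    graded = eval_chain.evaluate(examples, predictions)
    accuracy = sum(g["text"].strip() == "CORRECT" for g in graded) / len(graded)
    assert accuracy >= min_accuracy, f"Accuracy regressed: {accuracy:.2f} < {min_accuracy}"
    return accuracy
```

Running a check like this on a schedule (for example, in CI) and comparing the score against a stored baseline is what turns the example dataset into a regression test.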
+ ] + }, +  { +   "cell_type": "code", +   "execution_count": null, +   "id": "5ee582f1-de66-4544-99ef-3bf672c13a05", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "from langchain.chat_models import ChatOpenAI\n", +    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n", +    "# TODO" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "7562c310-d80b-4461-96e0-d70bc94b3e9a", +   "metadata": {}, +   "source": [ +    "## Approach 2: Pairwise Compare Outputs\n", +    "\n", +    "The second way you can track changes and regressions is to compare outputs of the model on identical inputs. You can use a simple exact (or fuzzy) string match metric\n", +    "or use a model-graded metric to ensure the meanings of the outputs are the same.\n" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": null, +   "id": "f47bdef5-7202-4523-b207-c0b6a7dd6da5", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "# TODO" +   ] +  } + ], + "metadata": { +  "kernelspec": { +   "display_name": "Python 3 (ipykernel)", +   "language": "python", +   "name": "python3" +  }, +  "language_info": { +   "codemirror_mode": { +    "name": "ipython", +    "version": 3 +   }, +   "file_extension": ".py", +   "mimetype": "text/x-python", +   "name": "python", +   "nbconvert_exporter": "python", +   "pygments_lexer": "ipython3", +   "version": "3.11.2" +  } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/modules/evaluation/huggingface_datasets.ipynb b/docs/extras/modules/evaluation/huggingface_datasets.ipynb deleted file mode 100644 index 510cc379a8d..00000000000 --- a/docs/extras/modules/evaluation/huggingface_datasets.ipynb +++ /dev/null @@ -1,287 +0,0 @@ -{ - "cells": [ -  { -   "cell_type": "markdown", -   "id": "3cadcf88", -   "metadata": {}, -   "source": [ -    "# Using Hugging Face Datasets\n", -    "\n", -    "This example shows how to use Hugging Face datasets to evaluate models. Specifically, we show how to load examples to evaluate models on from Hugging Face's dataset package." -   ] -  }, -  { -   "cell_type": "markdown", -   "id": "0e3ce977", -   "metadata": {}, -   "source": [ -    "## Setup\n", -    "\n", -    "For demonstration purposes, we will just evaluate a simple question answering system." -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 1, -   "id": "4c10054f", -   "metadata": {}, -   "outputs": [], -   "source": [ -    "from langchain.prompts import PromptTemplate\n", -    "from langchain.chains import LLMChain\n", -    "from langchain.llms import OpenAI" -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 2, -   "id": "9abdf160", -   "metadata": {}, -   "outputs": [], -   "source": [ -    "prompt = PromptTemplate(\n", -    "    template=\"Question: {question}\\nAnswer:\", input_variables=[\"question\"]\n", -    ")" -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 3, -   "id": "d41ef7bb", -   "metadata": {}, -   "outputs": [], -   "source": [ -    "llm = OpenAI(model_name=\"text-davinci-003\", temperature=0)\n", -    "chain = LLMChain(llm=llm, prompt=prompt)" -   ] -  }, -  { -   "cell_type": "markdown", -   "id": "cbea2132", -   "metadata": {}, -   "source": [ -    "## Examples\n", -    "\n", -    "Now we load a dataset from Hugging Face, and then convert it to a list of dictionaries for easier usage."
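Approach 2 of the regression-testing guide above (pairwise output comparison) is likewise left as a TODO. A minimal, model-free sketch using exact and fuzzy string matching from the standard library might look like the following; the 0.9 similarity threshold is an arbitrary assumption you would tune for your application.

```python
import difflib


def find_output_regressions(old_outputs, new_outputs, fuzzy_threshold=0.9):
    """Flag inputs whose outputs changed materially between two model versions."""
    regressions = []
    for i, (old, new) in enumerate(zip(old_outputs, new_outputs)):
        if old == new:
            continue  # exact match, nothing to flag
        similarity = difflib.SequenceMatcher(None, old, new).ratio()
        if similarity < fuzzy_threshold:
            regressions.append({"index": i, "old": old, "new": new, "similarity": similarity})
    return regressions
```

Pairs flagged this way can then be passed to a model-graded evaluator (such as the QA evaluators shown in these docs) to check whether the meaning of the output actually changed.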
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d2373cf1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset truthful_qa (/Users/harrisonchase/.cache/huggingface/datasets/truthful_qa/generation/1.1.0/70210b72382652635215516e59663843b88eda16bd2acef909fb46700beb039a)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "92216d733c694ab4bfa812614f2223a4", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 Question: {question}\n", - "\n", - "The query you know you should be executing against the API is:\n", - "\n", - "> Query: {truth_query}\n", - "\n", - "Is the following predicted query semantically the same (eg likely to produce the same answer)?\n", - "\n", - "> Predicted Query: {predict_query}\n", - "\n", - "Please give the Predicted Query a grade of either an A, B, C, D, or F, along with an explanation of why. End the evaluation with 'Final Grade: '\n", - "\n", - "> Explanation: Let's think step by step.\"\"\"\n", - "\n", - "prompt = PromptTemplate.from_template(template)\n", - "\n", - "eval_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "8cc1b1db", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[' The original query is asking for all iPhone models, so the \"q\" parameter is correct. The \"max_price\" parameter is also correct, as it is set to null, meaning that no maximum price is set. The predicted query adds two additional parameters, \"size\" and \"min_price\". The \"size\" parameter is not necessary, as it is not relevant to the question being asked. The \"min_price\" parameter is also not necessary, as it is not relevant to the question being asked and it is set to 0, which is the default value. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n", - " ' The original query is asking for laptops with a maximum price of 300. The predicted query is asking for laptops with a minimum price of 0 and a maximum price of 500. This means that the predicted query is likely to return more results than the original query, as it is asking for a wider range of prices. Therefore, the predicted query is not semantically the same as the original query, and it is not likely to produce the same answer. Final Grade: F',\n", - " \" The first two parameters are the same, so that's good. The third parameter is different, but it's not necessary for the query, so that's not a problem. The fourth parameter is the problem. The original query specifies a maximum price of 500, while the predicted query specifies a maximum price of null. This means that the predicted query will not limit the results to the cheapest gaming PCs, so it is not semantically the same as the original query. Final Grade: F\",\n", - " ' The original query is asking for tablets under $400, so the first two parameters are correct. The predicted query also includes the parameters \"size\" and \"min_price\", which are not necessary for the original query. The \"size\" parameter is not relevant to the question, and the \"min_price\" parameter is redundant since the original query already specifies a maximum price. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. 
Final Grade: D',\n", - " ' The original query is asking for headphones with no maximum price, so the predicted query is not semantically the same because it has a maximum price of 500. The predicted query also has a size of 10, which is not specified in the original query. Therefore, the predicted query is not semantically the same as the original query. Final Grade: F',\n", - " \" The original query is asking for the top rated laptops, so the 'size' parameter should be set to 10 to get the top 10 results. The 'min_price' parameter should be set to 0 to get results from all price ranges. The 'max_price' parameter should be set to null to get results from all price ranges. The 'q' parameter should be set to 'laptop' to get results related to laptops. All of these parameters are present in the predicted query, so it is semantically the same as the original query. Final Grade: A\",\n", - " ' The original query is asking for shoes, so the predicted query is asking for the same thing. The original query does not specify a size, so the predicted query is not adding any additional information. The original query does not specify a price range, so the predicted query is adding additional information that is not necessary. Therefore, the predicted query is not semantically the same as the original query and is likely to produce different results. Final Grade: D',\n", - " ' The original query is asking for a skirt, so the predicted query is asking for the same thing. The predicted query also adds additional parameters such as size and price range, which could help narrow down the results. However, the size parameter is not necessary for the query to be successful, and the price range is too narrow. Therefore, the predicted query is not as effective as the original query. Final Grade: C',\n", - " ' The first part of the query is asking for a Desktop PC, which is the same as the original query. The second part of the query is asking for a size of 10, which is not relevant to the original query. The third part of the query is asking for a minimum price of 0, which is not relevant to the original query. The fourth part of the query is asking for a maximum price of null, which is not relevant to the original query. Therefore, the Predicted Query does not semantically match the original query and is not likely to produce the same answer. Final Grade: F',\n", - " ' The original query is asking for cameras with a maximum price of 300. The predicted query is asking for cameras with a maximum price of 500. This means that the predicted query is likely to return more results than the original query, which may include cameras that are not within the budget range. Therefore, the predicted query is not semantically the same as the original query and does not answer the original question. 
Final Grade: F']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "request_eval_results = []\n", - "for question, predict_query, truth_query in list(\n", - " zip(questions, predicted_queries, truth_queries)\n", - "):\n", - " eval_output = eval_chain.run(\n", - " question=question,\n", - " truth_query=truth_query,\n", - " predict_query=predict_query,\n", - " )\n", - " request_eval_results.append(eval_output)\n", - "request_eval_results" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "0d76f8ba", - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "from typing import List\n", - "\n", - "\n", - "# Parse the evaluation chain responses into a rubric\n", - "def parse_eval_results(results: List[str]) -> List[float]:\n", - " rubric = {\"A\": 1.0, \"B\": 0.75, \"C\": 0.5, \"D\": 0.25, \"F\": 0}\n", - " return [rubric[re.search(r\"Final Grade: (\\w+)\", res).group(1)] for res in results]\n", - "\n", - "\n", - "parsed_results = parse_eval_results(request_eval_results)\n", - "# Collect the scores for a final evaluation table\n", - "scores[\"request_synthesizer\"].extend(parsed_results)" - ] - }, - { - "cell_type": "markdown", - "id": "6f3ee8ea", - "metadata": {}, - "source": [ - "## Evaluate the Response Chain\n", - "\n", - "The second component translated the structured API response to a natural language response.\n", - "Evaluate this against the user's original question." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "8b97847c", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.prompts import PromptTemplate\n", - "\n", - "template = \"\"\"You are trying to answer the following question by querying an API:\n", - "\n", - "> Question: {question}\n", - "\n", - "The API returned a response of:\n", - "\n", - "> API result: {api_response}\n", - "\n", - "Your response to the user: {answer}\n", - "\n", - "Please evaluate the accuracy and utility of your response to the user's original question, conditioned on the information available.\n", - "Give a letter grade of either an A, B, C, D, or F, along with an explanation of why. End the evaluation with 'Final Grade: '\n", - "\n", - "> Explanation: Let's think step by step.\"\"\"\n", - "\n", - "prompt = PromptTemplate.from_template(template)\n", - "\n", - "eval_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "642852ce", - "metadata": {}, - "outputs": [], - "source": [ - "# Extract the API responses from the chain\n", - "api_responses = [\n", - " output[\"intermediate_steps\"][\"response_text\"] for output in chain_outputs\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "08a5eb4f", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[' The original query is asking for all iPhone models, so the \"q\" parameter is correct. The \"max_price\" parameter is also correct, as it is set to null, meaning that no maximum price is set. The predicted query adds two additional parameters, \"size\" and \"min_price\". The \"size\" parameter is not necessary, as it is not relevant to the question being asked. The \"min_price\" parameter is also not necessary, as it is not relevant to the question being asked and it is set to 0, which is the default value. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. 
Final Grade: D',\n", - " ' The original query is asking for laptops with a maximum price of 300. The predicted query is asking for laptops with a minimum price of 0 and a maximum price of 500. This means that the predicted query is likely to return more results than the original query, as it is asking for a wider range of prices. Therefore, the predicted query is not semantically the same as the original query, and it is not likely to produce the same answer. Final Grade: F',\n", - " \" The first two parameters are the same, so that's good. The third parameter is different, but it's not necessary for the query, so that's not a problem. The fourth parameter is the problem. The original query specifies a maximum price of 500, while the predicted query specifies a maximum price of null. This means that the predicted query will not limit the results to the cheapest gaming PCs, so it is not semantically the same as the original query. Final Grade: F\",\n", - " ' The original query is asking for tablets under $400, so the first two parameters are correct. The predicted query also includes the parameters \"size\" and \"min_price\", which are not necessary for the original query. The \"size\" parameter is not relevant to the question, and the \"min_price\" parameter is redundant since the original query already specifies a maximum price. Therefore, the predicted query is not semantically the same as the original query and is not likely to produce the same answer. Final Grade: D',\n", - " ' The original query is asking for headphones with no maximum price, so the predicted query is not semantically the same because it has a maximum price of 500. The predicted query also has a size of 10, which is not specified in the original query. Therefore, the predicted query is not semantically the same as the original query. Final Grade: F',\n", - " \" The original query is asking for the top rated laptops, so the 'size' parameter should be set to 10 to get the top 10 results. The 'min_price' parameter should be set to 0 to get results from all price ranges. The 'max_price' parameter should be set to null to get results from all price ranges. The 'q' parameter should be set to 'laptop' to get results related to laptops. All of these parameters are present in the predicted query, so it is semantically the same as the original query. Final Grade: A\",\n", - " ' The original query is asking for shoes, so the predicted query is asking for the same thing. The original query does not specify a size, so the predicted query is not adding any additional information. The original query does not specify a price range, so the predicted query is adding additional information that is not necessary. Therefore, the predicted query is not semantically the same as the original query and is likely to produce different results. Final Grade: D',\n", - " ' The original query is asking for a skirt, so the predicted query is asking for the same thing. The predicted query also adds additional parameters such as size and price range, which could help narrow down the results. However, the size parameter is not necessary for the query to be successful, and the price range is too narrow. Therefore, the predicted query is not as effective as the original query. Final Grade: C',\n", - " ' The first part of the query is asking for a Desktop PC, which is the same as the original query. The second part of the query is asking for a size of 10, which is not relevant to the original query. 
The third part of the query is asking for a minimum price of 0, which is not relevant to the original query. The fourth part of the query is asking for a maximum price of null, which is not relevant to the original query. Therefore, the Predicted Query does not semantically match the original query and is not likely to produce the same answer. Final Grade: F',\n", - " ' The original query is asking for cameras with a maximum price of 300. The predicted query is asking for cameras with a maximum price of 500. This means that the predicted query is likely to return more results than the original query, which may include cameras that are not within the budget range. Therefore, the predicted query is not semantically the same as the original query and does not answer the original question. Final Grade: F',\n", - " ' The user asked a question about what iPhone models are available, and the API returned a response with 10 different models. The response provided by the user accurately listed all 10 models, so the accuracy of the response is A+. The utility of the response is also A+ since the user was able to get the exact information they were looking for. Final Grade: A+',\n", - " \" The API response provided a list of laptops with their prices and attributes. The user asked if there were any budget laptops, and the response provided a list of laptops that are all priced under $500. Therefore, the response was accurate and useful in answering the user's question. Final Grade: A\",\n", - " \" The API response provided the name, price, and URL of the product, which is exactly what the user asked for. The response also provided additional information about the product's attributes, which is useful for the user to make an informed decision. Therefore, the response is accurate and useful. Final Grade: A\",\n", - " \" The API response provided a list of tablets that are under $400. The response accurately answered the user's question. Additionally, the response provided useful information such as the product name, price, and attributes. Therefore, the response was accurate and useful. Final Grade: A\",\n", - " \" The API response provided a list of headphones with their respective prices and attributes. The user asked for the best headphones, so the response should include the best headphones based on the criteria provided. The response provided a list of headphones that are all from the same brand (Apple) and all have the same type of headphone (True Wireless, In-Ear). This does not provide the user with enough information to make an informed decision about which headphones are the best. Therefore, the response does not accurately answer the user's question. Final Grade: F\",\n", - " ' The API response provided a list of laptops with their attributes, which is exactly what the user asked for. The response provided a comprehensive list of the top rated laptops, which is what the user was looking for. The response was accurate and useful, providing the user with the information they needed. Final Grade: A',\n", - " ' The API response provided a list of shoes from both Adidas and Nike, which is exactly what the user asked for. The response also included the product name, price, and attributes for each shoe, which is useful information for the user to make an informed decision. The response also included links to the products, which is helpful for the user to purchase the shoes. Therefore, the response was accurate and useful. 
Final Grade: A',\n", - " \" The API response provided a list of skirts that could potentially meet the user's needs. The response also included the name, price, and attributes of each skirt. This is a great start, as it provides the user with a variety of options to choose from. However, the response does not provide any images of the skirts, which would have been helpful for the user to make a decision. Additionally, the response does not provide any information about the availability of the skirts, which could be important for the user. \\n\\nFinal Grade: B\",\n", - " ' The user asked for a professional desktop PC with no budget constraints. The API response provided a list of products that fit the criteria, including the Skytech Archangel Gaming Computer PC Desktop, the CyberPowerPC Gamer Master Gaming Desktop, and the ASUS ROG Strix G10DK-RS756. The response accurately suggested these three products as they all offer powerful processors and plenty of RAM. Therefore, the response is accurate and useful. Final Grade: A',\n", - " \" The API response provided a list of cameras with their prices, which is exactly what the user asked for. The response also included additional information such as features and memory cards, which is not necessary for the user's question but could be useful for further research. The response was accurate and provided the user with the information they needed. Final Grade: A\"]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Run the grader chain\n", - "response_eval_results = []\n", - "for question, api_response, answer in list(zip(questions, api_responses, answers)):\n", - " request_eval_results.append(\n", - " eval_chain.run(question=question, api_response=api_response, answer=answer)\n", - " )\n", - "request_eval_results" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "a144aa9d", - "metadata": {}, - "outputs": [], - "source": [ - "# Reusing the rubric from above, parse the evaluation chain responses\n", - "parsed_response_results = parse_eval_results(request_eval_results)\n", - "# Collect the scores for a final evaluation table\n", - "scores[\"result_synthesizer\"].extend(parsed_response_results)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "e95042bc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metric \tMin \tMean \tMax \n", - "completed \t1.00 \t1.00 \t1.00 \n", - "request_synthesizer \t0.00 \t0.23 \t1.00 \n", - "result_synthesizer \t0.00 \t0.55 \t1.00 \n" - ] - } - ], - "source": [ - "# Print out Score statistics for the evaluation session\n", - "header = \"{:<20}\\t{:<10}\\t{:<10}\\t{:<10}\".format(\"Metric\", \"Min\", \"Mean\", \"Max\")\n", - "print(header)\n", - "for metric, metric_scores in scores.items():\n", - " mean_scores = (\n", - " sum(metric_scores) / len(metric_scores)\n", - " if len(metric_scores) > 0\n", - " else float(\"nan\")\n", - " )\n", - " row = \"{:<20}\\t{:<10.2f}\\t{:<10.2f}\\t{:<10.2f}\".format(\n", - " metric, min(metric_scores), mean_scores, max(metric_scores)\n", - " )\n", - " print(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "03fe96af", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Re-show the examples for which the chain failed to complete\n", - "failed_examples" - ] - }, - { - "cell_type": 
"markdown", - "id": "2bb3636d", - "metadata": {}, - "source": [ - "## Generating Test Datasets\n", - "\n", - "To evaluate a chain against your own endpoint, you'll want to generate a test dataset that's conforms to the API.\n", - "\n", - "This section provides an overview of how to bootstrap the process.\n", - "\n", - "First, we'll parse the OpenAPI Spec. For this example, we'll [Speak](https://www.speak.com/)'s OpenAPI specification." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "a453eb93", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Attempting to load an OpenAPI 3.0.1 spec. This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n", - "Attempting to load an OpenAPI 3.0.1 spec. This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.\n" - ] - } - ], - "source": [ - "# Load and parse the OpenAPI Spec\n", - "spec = OpenAPISpec.from_url(\"https://api.speak.com/openapi.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "bb65ffe8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['/v1/public/openai/explain-phrase',\n", - " '/v1/public/openai/explain-task',\n", - " '/v1/public/openai/translate']" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# List the paths in the OpenAPI Spec\n", - "paths = sorted(spec.paths.keys())\n", - "paths" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "0988f01b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['post']" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# See which HTTP Methods are available for a given path\n", - "methods = spec.get_methods_for_path(\"/v1/public/openai/explain-task\")\n", - "methods" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "e9ef0a77", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "type explainTask = (_: {\n", - "/* Description of the task that the user wants to accomplish or do. For example, \"tell the waiter they messed up my order\" or \"compliment someone on their shirt\" */\n", - " task_description?: string,\n", - "/* The foreign language that the user is learning and asking about. The value can be inferred from question - for example, if the user asks \"how do i ask a girl out in mexico city\", the value should be \"Spanish\" because of Mexico City. Always use the full name of the language (e.g. Spanish, French). */\n", - " learning_language?: string,\n", - "/* The user's native language. Infer this value from the language the user asked their question in. Always use the full name of the language (e.g. Spanish, French). */\n", - " native_language?: string,\n", - "/* A description of any additional context in the user's question that could affect the explanation - e.g. setting, scenario, situation, tone, speaking style and formality, usage notes, or any other qualifiers. */\n", - " additional_context?: string,\n", - "/* Full text of the user's question. 
*/\n", - " full_query?: string,\n", - "}) => any;\n" - ] - } - ], - "source": [ - "# Load a single endpoint operation\n", - "operation = APIOperation.from_openapi_spec(\n", - " spec, \"/v1/public/openai/explain-task\", \"post\"\n", - ")\n", - "\n", - "# The operation can be serialized as typescript\n", - "print(operation.to_typescript())" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "f1186b6d", - "metadata": {}, - "outputs": [], - "source": [ - "# Compress the service definition to avoid leaking too much input structure to the sample data\n", - "template = \"\"\"In 20 words or less, what does this service accomplish?\n", - "{spec}\n", - "\n", - "Function: It's designed to \"\"\"\n", - "prompt = PromptTemplate.from_template(template)\n", - "generation_chain = LLMChain(llm=llm, prompt=prompt)\n", - "purpose = generation_chain.run(spec=operation.to_typescript())" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "a594406a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[\"Can you explain how to say 'hello' in Spanish?\",\n", - " \"I need help understanding the French word for 'goodbye'.\",\n", - " \"Can you tell me how to say 'thank you' in German?\",\n", - " \"I'm trying to learn the Italian word for 'please'.\",\n", - " \"Can you help me with the pronunciation of 'yes' in Portuguese?\",\n", - " \"I'm looking for the Dutch word for 'no'.\",\n", - " \"Can you explain the meaning of 'hello' in Japanese?\",\n", - " \"I need help understanding the Russian word for 'thank you'.\",\n", - " \"Can you tell me how to say 'goodbye' in Chinese?\",\n", - " \"I'm trying to learn the Arabic word for 'please'.\"]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "template = \"\"\"Write a list of {num_to_generate} unique messages users might send to a service designed to{purpose} They must each be completely unique.\n", - "\n", - "1.\"\"\"\n", - "\n", - "\n", - "def parse_list(text: str) -> List[str]:\n", - " # Match lines starting with a number then period\n", - " # Strip leading and trailing whitespace\n", - " matches = re.findall(r\"^\\d+\\. \", text)\n", - " return [re.sub(r\"^\\d+\\. 
\", \"\", q).strip().strip('\"') for q in text.split(\"\\n\")]\n", - "\n", - "\n", - "num_to_generate = 10 # How many examples to use for this test set.\n", - "prompt = PromptTemplate.from_template(template)\n", - "generation_chain = LLMChain(llm=llm, prompt=prompt)\n", - "text = generation_chain.run(purpose=purpose, num_to_generate=num_to_generate)\n", - "# Strip preceding numeric bullets\n", - "queries = parse_list(text)\n", - "queries" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "8dc60f43", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['{\"task_description\": \"say \\'hello\\'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say \\'hello\\' in Spanish?\"}',\n", - " '{\"task_description\": \"understanding the French word for \\'goodbye\\'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for \\'goodbye\\'.\"}',\n", - " '{\"task_description\": \"say \\'thank you\\'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'thank you\\' in German?\"}',\n", - " '{\"task_description\": \"Learn the Italian word for \\'please\\'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Italian word for \\'please\\'.\"}',\n", - " '{\"task_description\": \"Help with pronunciation of \\'yes\\' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of \\'yes\\' in Portuguese?\"}',\n", - " '{\"task_description\": \"Find the Dutch word for \\'no\\'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I\\'m looking for the Dutch word for \\'no\\'.\"}',\n", - " '{\"task_description\": \"Explain the meaning of \\'hello\\' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of \\'hello\\' in Japanese?\"}',\n", - " '{\"task_description\": \"understanding the Russian word for \\'thank you\\'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for \\'thank you\\'.\"}',\n", - " '{\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'goodbye\\' in Chinese?\"}',\n", - " '{\"task_description\": \"Learn the Arabic word for \\'please\\'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Arabic word for \\'please\\'.\"}']" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Define the generation chain to get hypotheses\n", - "api_chain = OpenAPIEndpointChain.from_api_operation(\n", - " operation,\n", - " llm,\n", - " requests=Requests(),\n", - " verbose=verbose,\n", - " return_intermediate_steps=True, # Return request and response text\n", - ")\n", - "\n", - "predicted_outputs = [api_chain(query) for query in queries]\n", - "request_args = [\n", - " output[\"intermediate_steps\"][\"request_args\"] for output in predicted_outputs\n", - "]\n", - "\n", - "# Show the generated request\n", - "request_args" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "b727e28e", - 
"metadata": {}, - "outputs": [], - "source": [ - "## AI Assisted Correction\n", - "correction_template = \"\"\"Correct the following API request based on the user's feedback. If the user indicates no changes are needed, output the original without making any changes.\n", - "\n", - "REQUEST: {request}\n", - "\n", - "User Feedback / requested changes: {user_feedback}\n", - "\n", - "Finalized Request: \"\"\"\n", - "\n", - "prompt = PromptTemplate.from_template(correction_template)\n", - "correction_chain = LLMChain(llm=llm, prompt=prompt)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "c1f4d71f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Query: Can you explain how to say 'hello' in Spanish?\n", - "Request: {\"task_description\": \"say 'hello'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say 'hello' in Spanish?\"}\n", - "Requested changes: \n", - "Query: I need help understanding the French word for 'goodbye'.\n", - "Request: {\"task_description\": \"understanding the French word for 'goodbye'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for 'goodbye'.\"}\n", - "Requested changes: \n", - "Query: Can you tell me how to say 'thank you' in German?\n", - "Request: {\"task_description\": \"say 'thank you'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say 'thank you' in German?\"}\n", - "Requested changes: \n", - "Query: I'm trying to learn the Italian word for 'please'.\n", - "Request: {\"task_description\": \"Learn the Italian word for 'please'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I'm trying to learn the Italian word for 'please'.\"}\n", - "Requested changes: \n", - "Query: Can you help me with the pronunciation of 'yes' in Portuguese?\n", - "Request: {\"task_description\": \"Help with pronunciation of 'yes' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of 'yes' in Portuguese?\"}\n", - "Requested changes: \n", - "Query: I'm looking for the Dutch word for 'no'.\n", - "Request: {\"task_description\": \"Find the Dutch word for 'no'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I'm looking for the Dutch word for 'no'.\"}\n", - "Requested changes: \n", - "Query: Can you explain the meaning of 'hello' in Japanese?\n", - "Request: {\"task_description\": \"Explain the meaning of 'hello' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of 'hello' in Japanese?\"}\n", - "Requested changes: \n", - "Query: I need help understanding the Russian word for 'thank you'.\n", - "Request: {\"task_description\": \"understanding the Russian word for 'thank you'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for 'thank you'.\"}\n", - "Requested changes: \n", - "Query: Can you tell me how to say 'goodbye' in Chinese?\n", - "Request: {\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say 'goodbye' in Chinese?\"}\n", - "Requested changes: \n", - "Query: I'm trying to 
learn the Arabic word for 'please'.\n", - "Request: {\"task_description\": \"Learn the Arabic word for 'please'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I'm trying to learn the Arabic word for 'please'.\"}\n", - "Requested changes: \n" - ] - } - ], - "source": [ - "ground_truth = []\n", - "for query, request_arg in list(zip(queries, request_args)):\n", - " feedback = input(f\"Query: {query}\\nRequest: {request_arg}\\nRequested changes: \")\n", - " if feedback == \"n\" or feedback == \"none\" or not feedback:\n", - " ground_truth.append(request_arg)\n", - " continue\n", - " resolved = correction_chain.run(request=request_arg, user_feedback=feedback)\n", - " ground_truth.append(resolved.strip())\n", - " print(\"Updated request:\", resolved)" - ] - }, - { - "cell_type": "markdown", - "id": "19d68882", - "metadata": {}, - "source": [ - "**Now you can use the `ground_truth` as shown above in [Evaluate the Requests Chain](#Evaluate-the-requests-chain)!**" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "5a596176", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['{\"task_description\": \"say \\'hello\\'\", \"learning_language\": \"Spanish\", \"native_language\": \"English\", \"full_query\": \"Can you explain how to say \\'hello\\' in Spanish?\"}',\n", - " '{\"task_description\": \"understanding the French word for \\'goodbye\\'\", \"learning_language\": \"French\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the French word for \\'goodbye\\'.\"}',\n", - " '{\"task_description\": \"say \\'thank you\\'\", \"learning_language\": \"German\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'thank you\\' in German?\"}',\n", - " '{\"task_description\": \"Learn the Italian word for \\'please\\'\", \"learning_language\": \"Italian\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Italian word for \\'please\\'.\"}',\n", - " '{\"task_description\": \"Help with pronunciation of \\'yes\\' in Portuguese\", \"learning_language\": \"Portuguese\", \"native_language\": \"English\", \"full_query\": \"Can you help me with the pronunciation of \\'yes\\' in Portuguese?\"}',\n", - " '{\"task_description\": \"Find the Dutch word for \\'no\\'\", \"learning_language\": \"Dutch\", \"native_language\": \"English\", \"full_query\": \"I\\'m looking for the Dutch word for \\'no\\'.\"}',\n", - " '{\"task_description\": \"Explain the meaning of \\'hello\\' in Japanese\", \"learning_language\": \"Japanese\", \"native_language\": \"English\", \"full_query\": \"Can you explain the meaning of \\'hello\\' in Japanese?\"}',\n", - " '{\"task_description\": \"understanding the Russian word for \\'thank you\\'\", \"learning_language\": \"Russian\", \"native_language\": \"English\", \"full_query\": \"I need help understanding the Russian word for \\'thank you\\'.\"}',\n", - " '{\"task_description\": \"say goodbye\", \"learning_language\": \"Chinese\", \"native_language\": \"English\", \"full_query\": \"Can you tell me how to say \\'goodbye\\' in Chinese?\"}',\n", - " '{\"task_description\": \"Learn the Arabic word for \\'please\\'\", \"learning_language\": \"Arabic\", \"native_language\": \"English\", \"full_query\": \"I\\'m trying to learn the Arabic word for \\'please\\'.\"}']" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Now you have a new ground truth set to use as shown 
above!\n", - "ground_truth" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7fe9dfa", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/modules/evaluation/qa_benchmarking_pg.ipynb b/docs/extras/modules/evaluation/qa_benchmarking_pg.ipynb deleted file mode 100644 index 8267ce82210..00000000000 --- a/docs/extras/modules/evaluation/qa_benchmarking_pg.ipynb +++ /dev/null @@ -1,385 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "984169ca", - "metadata": {}, - "source": [ - "# Question Answering Benchmarking: Paul Graham Essay\n", - "\n", - "Here we go over how to benchmark performance on a question answering task over a Paul Graham essay.\n", - "\n", - "It is highly reccomended that you do any evaluation/benchmarking with tracing enabled. See [here](https://langchain.readthedocs.io/en/latest/tracing.html) for an explanation of what tracing is and how to set it up." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3bd13ab7", - "metadata": {}, - "outputs": [], - "source": [ - "# Comment this out if you are NOT using tracing\n", - "import os\n", - "\n", - "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\"" - ] - }, - { - "cell_type": "markdown", - "id": "8a16b75d", - "metadata": {}, - "source": [ - "## Loading the data\n", - "First, let's load the data." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5b2d5e98", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset json (/Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--question-answering-paul-graham-76e8f711e038d742/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9264acfe710b4faabf060f0fcf4f7308", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE();\n", + "\"\"\",\n", + " reference=\"\"\"SELECT SUM(sub.sale_amount) AS last_quarter_sales\n", + "FROM (\n", + " SELECT sale_amount\n", + " FROM sales\n", + " WHERE sale_date >= DATEADD(quarter, -1, GETDATE()) AND sale_date < GETDATE()\n", + ") AS sub;\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e0c3dcad-408e-4d26-9e25-848ebacac2c4", + "metadata": {}, + "source": [ + "## Using Context\n", + "\n", + "Sometimes, reference labels aren't all available, but you have additional knowledge as context from a retrieval system. Often there may be additional information that isn't available to the model you want to evaluate. For this type of scenario, you can use the ContextQAEvalChain." 
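As a rough sketch of how that context might be assembled in practice (the `retriever` below is a hypothetical LangChain retriever, and `llm` is assumed to be the evaluation LLM used earlier), you can join the retrieved documents into a single string and pass it as the `reference`; the next cell shows the same call with a hand-written context string:

```python
from langchain.evaluation import ContextQAEvalChain

question = "Who won the NFC championship game in 2023?"

# Hypothetical retriever: any LangChain retriever exposing get_relevant_documents().
docs = retriever.get_relevant_documents(question)
context = "\n\n".join(doc.page_content for doc in docs)

eval_chain = ContextQAEvalChain.from_llm(llm=llm)
eval_chain.evaluate_strings(
    input=question,
    prediction="Eagles",
    reference=context,  # the retrieved context stands in for a gold reference label
)
```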
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9f3ae116-3a2f-461d-ba6f-7352b42c1b0c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'reasoning': None, 'value': 'CORRECT', 'score': 1}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.evaluation import ContextQAEvalChain\n", + "\n", + "eval_chain = ContextQAEvalChain.from_llm(llm=llm)\n", + "\n", + "eval_chain.evaluate_strings(\n", + "    input=\"Who won the NFC championship game in 2023?\",\n", + "    prediction=\"Eagles\",\n", + "    reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ba5eac17-08b6-4e4f-a896-79e7fc637018", + "metadata": {}, + "source": [ + "## CoT With Context\n", + "\n", + "The same prompt strategies, such as chain-of-thought, can be used to make the evaluation results more reliable.\n", + "The `CotQAEvalChain`'s default prompt instructs the model to do this." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "26e3b686-98f4-45a5-9854-7071ec2893f1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'reasoning': 'The context states that the Philadelphia Eagles won the NFC championship game in 2023. The student\'s answer, \"Eagles,\" matches the team that won according to the context. Therefore, the student\'s answer is correct.',\n", + " 'value': 'CORRECT',\n", + " 'score': 1}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.evaluation import CotQAEvalChain\n", + "\n", + "eval_chain = CotQAEvalChain.from_llm(llm=llm)\n", + "\n", + "eval_chain.evaluate_strings(\n", + "    input=\"Who won the NFC championship game in 2023?\",\n", + "    prediction=\"Eagles\",\n", + "    reference=\"NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7\",\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/modules/evaluation/trajectory/trajectory_eval.ipynb b/docs/extras/modules/evaluation/trajectory/trajectory_eval.ipynb new file mode 100644 index 00000000000..25f6d1bdd5e --- /dev/null +++ b/docs/extras/modules/evaluation/trajectory/trajectory_eval.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6e5ea1a1-7e74-459b-bf14-688f87d09124", + "metadata": { + "tags": [] + }, + "source": [ + "# Agent Trajectory\n", + "\n", + "Agents take actions in pursuit of a goal. \"Trajectories\" record the intermediate steps\n", + "taken by the agent. You can use the `TrajectoryEvalChain` to grade how effective these steps\n", + "are at achieving the correct response."
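If you want the grader to see which tools the agent could call, `TrajectoryEvalChain.from_llm` also accepts an `agent_tools` sequence (see the changes to `trajectory_eval_chain.py` later in this diff). A minimal sketch, assuming the `ping` and `trace_route` tools defined in the cells below:

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import TrajectoryEvalChain

# GPT-4 is used as the grading model; `ping` and `trace_route` are defined further below.
eval_llm = ChatOpenAI(model="gpt-4", temperature=0)
chain = TrajectoryEvalChain.from_llm(
    eval_llm,
    agent_tools=[ping, trace_route],  # lets the grader see which tools were available
)
```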
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "149402da-5212-43e2-b7c0-a701727f5293", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.evaluation import TrajectoryEvalChain\n", + "\n", + "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n", + "chain = TrajectoryEvalChain.from_llm(llm)" + ] + }, + { + "cell_type": "markdown", + "id": "e733562c-4c17-4942-9647-acfc5ebfaca2", + "metadata": {}, + "source": [ + "## Capturing Trajectory\n", + "\n", + "To return the trajectory, initialize an agent with `return_intermediate_steps=True`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "451cb0cb-6f42-4abd-aa6d-fb871fce034d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from langchain.tools import tool\n", + "from langchain.agents import AgentType, initialize_agent\n", + "from pydantic import HttpUrl\n", + "import subprocess\n", + "from urllib.parse import urlparse\n", + "\n", + "@tool\n", + "def ping(url: HttpUrl, return_error: bool) -> str:\n", + " \"\"\"Ping the fully specified url. Must include https:// in the url.\"\"\"\n", + " hostname = urlparse(str(url)).netloc\n", + " completed_process = subprocess.run(['ping', '-c', '1', hostname], capture_output=True, text=True)\n", + " output = completed_process.stdout\n", + " if return_error and completed_process.returncode != 0:\n", + " return completed_process.stderr\n", + " return output\n", + "\n", + "@tool\n", + "def trace_route(url: HttpUrl, return_error: bool) -> str:\n", + " \"\"\"Trace the route to the specified url. Must include https:// in the url.\"\"\"\n", + " hostname = urlparse(str(url)).netloc\n", + " completed_process = subprocess.run(['traceroute', hostname], capture_output=True, text=True)\n", + " output = completed_process.stdout\n", + " if return_error and completed_process.returncode != 0:\n", + " return completed_process.stderr\n", + " return output\n", + "\n", + "\n", + "\n", + "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n", + "agent = initialize_agent(\n", + " llm=llm,\n", + " tools=[ping, trace_route],\n", + " agent=AgentType.OPENAI_MULTI_FUNCTIONS,\n", + " return_intermediate_steps=True # IMPORTANT!\n", + ")\n", + "\n", + "result = agent(\"What's the latency like for https://langchain.com?\")" + ] + }, + { + "cell_type": "markdown", + "id": "2df34eed-45a5-4f91-88d3-9aa55f28391a", + "metadata": {}, + "source": [ + "## Evaluate Trajectory\n", + "\n", + "Pass the input, trajectory, and output to the `evaluate_agent_trajectory` function." 
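A sketch of that call follows. The shape of the returned dict depends on the chain's output parser; assuming it exposes a numeric score (and the reasoning when the chain is built with `return_reasoning=True`), it is safest to inspect the result before indexing into it:

```python
evaluation_result = chain.evaluate_agent_trajectory(
    prediction=result["output"],
    input=result["input"],
    agent_trajectory=result["intermediate_steps"],
)
# Inspect the full result; the exact keys (e.g. "score") are assumptions
# based on the chain's output parser.
print(evaluation_result)
print(evaluation_result.get("score"))
```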
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8d2c8703-98ed-4068-8a8b-393f0f1f64ea", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'grade'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 6\u001b[0m\n\u001b[1;32m 1\u001b[0m evaluation_result \u001b[38;5;241m=\u001b[39m chain\u001b[38;5;241m.\u001b[39mevaluate_agent_trajectory(\n\u001b[1;32m 2\u001b[0m prediction\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28minput\u001b[39m\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 4\u001b[0m agent_trajectory\u001b[38;5;241m=\u001b[39mresult[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mintermediate_steps\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 5\u001b[0m )\n\u001b[0;32m----> 6\u001b[0m \u001b[43mevaluation_result\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrade\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n", + "\u001b[0;31mKeyError\u001b[0m: 'grade'" + ] + } + ], + "source": [ + "evaluation_result = chain.evaluate_agent_trajectory(\n", + " prediction=result[\"output\"],\n", + " input=result[\"input\"],\n", + " agent_trajectory=result[\"intermediate_steps\"],\n", + ")\n", + "evaluation_result[\"grade\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "652f3e76-9f3e-40e3-bbf8-e62c37e447ac", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/evaluation/agents/trajectory_eval_chain.py b/langchain/evaluation/agents/trajectory_eval_chain.py index 184bcbfcee4..c81a7317b8b 100644 --- a/langchain/evaluation/agents/trajectory_eval_chain.py +++ b/langchain/evaluation/agents/trajectory_eval_chain.py @@ -9,6 +9,7 @@ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union from pydantic import Field +from langchain.base_language import BaseLanguageModel from langchain.callbacks.manager import ( AsyncCallbackManagerForChainRun, CallbackManagerForChainRun, @@ -186,10 +187,11 @@ The following is the expected answer. Use this to measure correctness: @classmethod def from_llm( cls, - llm: BaseChatModel, + llm: BaseLanguageModel, agent_tools: Optional[Sequence[BaseTool]] = None, output_parser: Optional[TrajectoryOutputParser] = None, return_reasoning: bool = False, + **kwargs: Any, ) -> "TrajectoryEvalChain": """Create a TrajectoryEvalChain object from a language model chain. @@ -205,6 +207,10 @@ The following is the expected answer. Use this to measure correctness: Returns: TrajectoryEvalChain: The TrajectoryEvalChain object. 
""" + if not isinstance(llm, BaseChatModel): + raise NotImplementedError( + "Only chat models supported by the current trajectory eval" + ) if agent_tools: prompt = EVAL_CHAT_PROMPT else: @@ -215,6 +221,7 @@ The following is the expected answer. Use this to measure correctness: return_reasoning=return_reasoning, eval_chain=eval_chain, output_parser=output_parser or TrajectoryOutputParser(), + **kwargs, ) @property diff --git a/langchain/evaluation/comparison/eval_chain.py b/langchain/evaluation/comparison/eval_chain.py index 3c3f4f666bf..53837fb90df 100644 --- a/langchain/evaluation/comparison/eval_chain.py +++ b/langchain/evaluation/comparison/eval_chain.py @@ -9,6 +9,7 @@ from langchain.base_language import BaseLanguageModel from langchain.callbacks.manager import Callbacks from langchain.chains.llm import LLMChain from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE +from langchain.evaluation.schema import PairwiseStringEvaluator from langchain.prompts.prompt import PromptTemplate from langchain.schema import BaseOutputParser @@ -50,7 +51,7 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]): } -class PairwiseStringEvalChain(LLMChain): +class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMChain): """A chain for comparing the output of two models. Example: @@ -80,13 +81,31 @@ class PairwiseStringEvalChain(LLMChain): default_factory=PairwiseStringResultOutputParser ) + @property + def requires_reference(self) -> bool: + return "reference" in self.prompt.input_variables + + @property + def requires_input(self) -> bool: + return True + + @property + def _skip_reference_warning(self) -> str: + """Warning to show when reference is ignored.""" + return ( + f"Ignoring reference in {self.__class__.__name__}, as it is not expected." + "\nTo use a reference, initialize PairwiseStringEvalChain with" + " `requires_reference=True` or with a prompt with 'reference' as an" + " input variable." + ) + @classmethod def from_llm( cls, - *, llm: BaseLanguageModel, + *, prompt: Optional[PromptTemplate] = None, - require_reference: bool = False, + requires_reference: bool = False, **kwargs: Any, ) -> PairwiseStringEvalChain: """Initialize the PairwiseStringEvalChain from an LLM. @@ -94,7 +113,7 @@ class PairwiseStringEvalChain(LLMChain): Args: llm (BaseLanguageModel): The LLM to use. prompt (PromptTemplate, optional): The prompt to use. - require_reference (bool, optional): Whether to require a reference + requires_reference (bool, optional): Whether to require a reference string. Defaults to False. **kwargs (Any): Additional keyword arguments. 
@@ -103,13 +122,13 @@ class PairwiseStringEvalChain(LLMChain): """ expected_input_vars = {"prediction", "prediction_b", "input"} if prompt is None: - if require_reference: + if requires_reference: expected_input_vars.add("reference") prompt_ = PROMPT_WITH_REFERENCE else: prompt_ = PROMPT else: - if require_reference: + if requires_reference: expected_input_vars.add("reference") prompt_ = prompt @@ -121,23 +140,32 @@ class PairwiseStringEvalChain(LLMChain): return cls(llm=llm, prompt=prompt_, **kwargs) def _prepare_input( - self, prediction: str, prediction_b: str, input: str, reference: Optional[str] + self, + prediction: str, + prediction_b: str, + input: Optional[str], + reference: Optional[str], ) -> dict: input_ = { "prediction": prediction, "prediction_b": prediction_b, - "input": input, } - if reference is not None and "reference" in self.prompt.input_variables: + if self.requires_input: + if input is None: + raise ValueError("Input is required for this comparison evaluator") + input_["input"] = input + if self.requires_reference: + if reference is None: + raise ValueError("Reference is required for this comparison evaluator") input_["reference"] = reference return input_ - def evaluate_string_pairs( + def _evaluate_string_pairs( self, *, prediction: str, prediction_b: str, - input: str, + input: Optional[str] = None, reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, @@ -168,12 +196,12 @@ class PairwiseStringEvalChain(LLMChain): ) return result["text"] - async def aevaluate_string_pairs( + async def _aevaluate_string_pairs( self, *, prediction: str, prediction_b: str, - input: str, + input: Optional[str] = None, reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, diff --git a/langchain/evaluation/criteria/eval_chain.py b/langchain/evaluation/criteria/eval_chain.py index c7d6a1f7c4a..067fb6c543a 100644 --- a/langchain/evaluation/criteria/eval_chain.py +++ b/langchain/evaluation/criteria/eval_chain.py @@ -2,12 +2,13 @@ from __future__ import annotations from typing import Any, Dict, List, Mapping, Optional, Sequence, Union -from pydantic import Field +from pydantic import Extra, Field from langchain.base_language import BaseLanguageModel from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple from langchain.chains.llm import LLMChain from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES +from langchain.evaluation.schema import StringEvaluator from langchain.schema import BaseOutputParser, BasePromptTemplate _SUPPORTED_CRITERIA = { @@ -59,7 +60,7 @@ CRITERIA_TYPE = Union[ ] -class CriteriaEvalChain(LLMChain): +class CriteriaEvalChain(StringEvaluator, LLMChain): """LLM Chain for evaluating runs against criteria. 
Parameters @@ -96,11 +97,32 @@ class CriteriaEvalChain(LLMChain): >>> chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria) """ - requires_reference: bool = False - """Whether the evaluation template expects a reference text.""" output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser) """The parser to use to map the output to a structured result.""" + class Config: + """Configuration for the CriteriaEvalChain.""" + + extra = Extra.ignore + + @property + def requires_reference(self) -> bool: + return "reference" in self.prompt.input_variables + + @property + def requires_input(self) -> bool: + return True + + @property + def _skip_reference_warning(self) -> str: + """Warning to show when reference is ignored.""" + return ( + f"Ignoring reference in {self.__class__.__name__}, as it is not expected." + "\nTo use a reference, initialize CriteriaEvalChain with" + " `requires_reference=True` or with a prompt with 'reference'" + " as an input variable." + ) + @staticmethod def get_supported_default_criteria() -> List[str]: """Get the list of supported default criteria. @@ -122,7 +144,7 @@ class CriteriaEvalChain(LLMChain): @classmethod def resolve_criteria( cls, - criteria: CRITERIA_TYPE, + criteria: Optional[CRITERIA_TYPE], ) -> Dict[str, str]: """Resolve the criteria to evaluate. @@ -148,6 +170,10 @@ {'relevance': 'Is the submission referring to a real quote from the text?', 'coherence': 'Is the submission coherent, well-structured, and organized?'} """ # noqa: E501 + if criteria is None: + return { + "helpfulness": _SUPPORTED_CRITERIA["helpfulness"], + } if isinstance(criteria, str): criteria_ = {criteria: _SUPPORTED_CRITERIA[criteria]} elif isinstance(criteria, ConstitutionalPrinciple): @@ -172,7 +198,7 @@ def from_llm( cls, llm: BaseLanguageModel, - criteria: CRITERIA_TYPE, + criteria: Optional[CRITERIA_TYPE] = None, *, prompt: Optional[BasePromptTemplate] = None, requires_reference: bool = False, @@ -184,7 +210,7 @@ ---------- llm : BaseLanguageModel The language model to use for evaluation. - criteria : CRITERIA_TYPE + criteria : Optional[CRITERIA_TYPE], default=None (uses "helpfulness") The criteria to evaluate the runs against. 
It can be: - a mapping of criterion names to descriptions - a sequence of criterion names @@ -252,7 +278,7 @@ class CriteriaEvalChain(LLMChain): input_["reference"] = reference return input_ - def evaluate_strings( + def _evaluate_strings( self, *, prediction: str, @@ -296,7 +322,7 @@ class CriteriaEvalChain(LLMChain): input_ = self._get_eval_input(prediction, reference, input) return self(input_, **kwargs)["text"] - async def aevaluate_strings( + async def _aevaluate_strings( self, *, prediction: str, diff --git a/langchain/evaluation/qa/eval_chain.py b/langchain/evaluation/qa/eval_chain.py index bba4c298263..ce90b48f0ac 100644 --- a/langchain/evaluation/qa/eval_chain.py +++ b/langchain/evaluation/qa/eval_chain.py @@ -8,6 +8,7 @@ from langchain.base_language import BaseLanguageModel from langchain.callbacks.manager import Callbacks from langchain.chains.llm import LLMChain from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT +from langchain.evaluation.schema import StringEvaluator def _parse_string_eval_output(text: str) -> dict: @@ -26,6 +27,8 @@ def _parse_string_eval_output(text: str) -> dict: else: reasoning, verdict = splits reasoning = reasoning.strip() + if ":" in verdict: + verdict = verdict.split(":")[1].strip() score = ( 1 if verdict.upper() == "CORRECT" @@ -38,9 +41,17 @@ def _parse_string_eval_output(text: str) -> dict: } -class QAEvalChain(LLMChain): +class QAEvalChain(LLMChain, StringEvaluator): """LLM Chain specifically for evaluating question answering.""" + @property + def requires_reference(self) -> bool: + return True + + @property + def requires_input(self) -> bool: + return True + @classmethod def from_llm( cls, llm: BaseLanguageModel, prompt: PromptTemplate = PROMPT, **kwargs: Any @@ -90,7 +101,7 @@ class QAEvalChain(LLMChain): return self.apply(inputs, callbacks=callbacks) - def evaluate_strings( + def _evaluate_strings( self, *, prediction: str, @@ -118,7 +129,7 @@ class QAEvalChain(LLMChain): )[0] return _parse_string_eval_output(result["text"]) - async def aevaluate_strings( + async def _aevaluate_strings( self, *, prediction: str, @@ -134,9 +145,17 @@ class QAEvalChain(LLMChain): return _parse_string_eval_output(result["text"]) -class ContextQAEvalChain(LLMChain): +class ContextQAEvalChain(LLMChain, StringEvaluator): """LLM Chain specifically for evaluating QA w/o GT based on context""" + @property + def requires_reference(self) -> bool: + return True + + @property + def requires_input(self) -> bool: + return True + @classmethod def _validate_input_vars(cls, prompt: PromptTemplate) -> None: expected_input_vars = {"query", "context", "result"} @@ -193,7 +212,7 @@ class ContextQAEvalChain(LLMChain): return self.apply(inputs, callbacks=callbacks) - def evaluate_strings( + def _evaluate_strings( self, *, prediction: str, @@ -208,7 +227,7 @@ class ContextQAEvalChain(LLMChain): )[0] return _parse_string_eval_output(result["text"]) - async def aevaluate_strings( + async def _aevaluate_strings( self, *, prediction: str, diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py index 8c3362088ed..4bcfc513079 100644 --- a/langchain/evaluation/schema.py +++ b/langchain/evaluation/schema.py @@ -1,14 +1,63 @@ """Interfaces to be implemented by general evaluators.""" -from abc import abstractmethod -from typing import Any, Optional, Protocol, runtime_checkable +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import Any, Optional +from warnings import warn + +logger = 
logging.getLogger(__name__) -@runtime_checkable -class StringEvaluator(Protocol): +class _EvalArgsMixin: + """Mixin for checking evaluation arguments.""" + + @property + def requires_reference(self) -> bool: + """Whether this evaluator requires a reference label.""" + return False + + @property + def requires_input(self) -> bool: + """Whether this evaluator requires an input string.""" + return False + + @property + def _skip_input_warning(self) -> str: + """Warning to show when input is ignored.""" + return f"Ignoring input in {self.__class__.__name__}, as it is not expected." + + @property + def _skip_reference_warning(self) -> str: + """Warning to show when reference is ignored.""" + return ( + f"Ignoring reference in {self.__class__.__name__}, as it is not expected." + ) + + def _check_evaluation_args( + self, + reference: Optional[str] = None, + input: Optional[str] = None, + ) -> None: + if self.requires_input and input is None: + raise ValueError(f"{self.__class__.__name__} requires an input string.") + elif input is not None and not self.requires_input: + warn(self._skip_input_warning) + else: + pass + if self.requires_reference and reference is None: + raise ValueError(f"{self.__class__.__name__} requires a reference string.") + elif reference is not None and not self.requires_reference: + warn(self._skip_reference_warning) + else: + pass + + +class StringEvaluator(_EvalArgsMixin, ABC): """Protocol for evaluating strings.""" @abstractmethod - def evaluate_strings( + def _evaluate_strings( self, *, prediction: str, @@ -28,7 +77,7 @@ class StringEvaluator(Protocol): dict: The evaluation results containing the score or value. """ - async def aevaluate_strings( + async def _aevaluate_strings( self, *, prediction: str, @@ -53,13 +102,61 @@ class StringEvaluator(Protocol): "async aevaluate_strings method." ) + def evaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate Chain or LLM output, based on optional input and label. -@runtime_checkable -class PairwiseStringEvaluator(Protocol): + Args: + prediction (str): the LLM or chain prediction to evaluate. + reference (Optional[str], optional): the reference label + to evaluate against. + input (Optional[str], optional): the input to consider during evaluation + **kwargs: additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. + """ + self._check_evaluation_args(reference=reference, input=input) + return self._evaluate_strings( + prediction=prediction, reference=reference, input=input, **kwargs + ) + + async def aevaluate_strings( + self, + *, + prediction: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate Chain or LLM output, based on optional + input and label. + + Args: + prediction (str): the LLM or chain prediction to evaluate. + reference (Optional[str], optional): the reference label + to evaluate against. + input (Optional[str], optional): the input to consider during evaluation + **kwargs: additional keyword arguments, including callbacks, tags, etc. + Returns: + dict: The evaluation results containing the score or value. 
+ """ + self._check_evaluation_args(reference=reference, input=input) + return await self._aevaluate_strings( + prediction=prediction, reference=reference, input=input, **kwargs + ) + + +class PairwiseStringEvaluator(_EvalArgsMixin, ABC): """A protocol for comparing the output of two models.""" @abstractmethod - def evaluate_string_pairs( + def _evaluate_string_pairs( self, *, prediction: str, @@ -84,8 +181,9 @@ class PairwiseStringEvaluator(Protocol): other information. """ - async def aevaluate_string_pairs( + async def _aevaluate_string_pairs( self, + *, prediction: str, prediction_b: str, reference: Optional[str] = None, @@ -111,3 +209,69 @@ class PairwiseStringEvaluator(Protocol): f"{self.__class__.__name__} hasn't implemented an async " "aevaluate_string_pairs method." ) + + def evaluate_string_pairs( + self, + *, + prediction: str, + prediction_b: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate the output string pairs. + + Args: + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. + reference (str, optional): The expected output / reference + string. Defaults to None. + input (str, optional): The input string. Defaults to None. + **kwargs (Any): Additional keyword arguments, such + as callbacks and optional reference strings. + + Returns: + dict: A dictionary containing the preference, scores, and/or + other information. + """ + self._check_evaluation_args(reference=reference, input=input) + return self._evaluate_string_pairs( + prediction=prediction, + prediction_b=prediction_b, + reference=reference, + input=input, + **kwargs, + ) + + async def aevaluate_string_pairs( + self, + *, + prediction: str, + prediction_b: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate the output string pairs. + + Args: + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. + reference (str, optional): The expected output / reference + string. Defaults to None. + input (str, optional): The input string. Defaults to None. + **kwargs (Any): Additional keyword arguments, such + as callbacks and optional reference strings. + + Returns: + dict: A dictionary containing the preference, scores, and/or + other information. 
+ """ + self._check_evaluation_args(reference=reference, input=input) + return await self._aevaluate_string_pairs( + prediction=prediction, + prediction_b=prediction_b, + reference=reference, + input=input, + **kwargs, + ) diff --git a/tests/unit_tests/evaluation/agents/test_eval_chain.py b/tests/unit_tests/evaluation/agents/test_eval_chain.py index 59fa3de0173..c8c84ae574b 100644 --- a/tests/unit_tests/evaluation/agents/test_eval_chain.py +++ b/tests/unit_tests/evaluation/agents/test_eval_chain.py @@ -1,13 +1,15 @@ """Test agent trajectory evaluation chain.""" -from typing import List, Tuple +from typing import Any, Dict, List, Optional, Tuple import pytest +from pydantic import Field +from langchain.callbacks.manager import CallbackManagerForLLMRun from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain -from langchain.schema import AgentAction +from langchain.schema import AgentAction, BaseMessage from langchain.tools.base import tool -from tests.unit_tests.llms.fake_llm import FakeLLM +from tests.unit_tests.llms.fake_chat_model import FakeChatModel @pytest.fixture @@ -30,10 +32,31 @@ def foo(bar: str) -> str: return bar +class _FakeTrajectoryChatModel(FakeChatModel): + queries: Dict = Field(default_factory=dict) + sequential_responses: Optional[bool] = False + response_index: int = 0 + + def _call( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> str: + if self.sequential_responses: + response = self.queries[list(self.queries.keys())[self.response_index]] + self.response_index = self.response_index + 1 + return response + else: + prompt = messages[0].content + return self.queries[prompt] + + def test_trajectory_eval_chain( intermediate_steps: List[Tuple[AgentAction, str]] ) -> None: - llm = FakeLLM( + llm = _FakeTrajectoryChatModel( queries={ "a": "Trajectory good\nScore: 5", "b": "Trajectory not good\nScore: 1", @@ -61,7 +84,7 @@ def test_trajectory_eval_chain( def test_trajectory_eval_chain_no_tools( intermediate_steps: List[Tuple[AgentAction, str]] ) -> None: - llm = FakeLLM( + llm = _FakeTrajectoryChatModel( queries={ "a": "Trajectory good\nScore: 5", "b": "Trajectory not good\nScore: 1", @@ -85,7 +108,7 @@ def test_trajectory_eval_chain_no_tools( def test_old_api_works(intermediate_steps: List[Tuple[AgentAction, str]]) -> None: - llm = FakeLLM( + llm = _FakeTrajectoryChatModel( queries={ "a": "Trajectory good\nScore: 5", "b": "Trajectory not good\nScore: 1", diff --git a/tests/unit_tests/evaluation/comparison/test_eval_chain.py b/tests/unit_tests/evaluation/comparison/test_eval_chain.py index 4a96b43e18c..4eb2508a9e4 100644 --- a/tests/unit_tests/evaluation/comparison/test_eval_chain.py +++ b/tests/unit_tests/evaluation/comparison/test_eval_chain.py @@ -1,6 +1,8 @@ """Test the comparison chains.""" +import pytest + from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain from tests.unit_tests.llms.fake_llm import FakeLLM @@ -30,10 +32,30 @@ def test_pairwise_string_comparison_chain() -> None: ) assert res["value"] == "A" assert res["score"] == 1 - res = chain.evaluate_string_pairs( - prediction="I like pie.", - prediction_b="I hate pie.", - input="What is your favorite food?", - ) + with pytest.warns(UserWarning, match=chain._skip_reference_warning): + res = chain.evaluate_string_pairs( + prediction="I like pie.", + prediction_b="I hate pie.", + input="What is your favorite food?", + reference="I enjoy pie.", + ) assert 
res["value"] == "B" assert res["score"] == 0 + + +def test_pairwise_string_comparison_chain_missing_ref() -> None: + llm = FakeLLM( + queries={ + "a": "The values are the same.\n[[C]]", + "b": "A is clearly better than b.\n[[A]]", + "c": "B is clearly better than a.\n[[B]]", + }, + sequential_responses=True, + ) + chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True) + with pytest.raises(ValueError): + chain.evaluate_string_pairs( + prediction="I like pie.", + prediction_b="I love pie.", + input="What is your favorite food?", + ) diff --git a/tests/unit_tests/evaluation/criteria/test_eval_chain.py b/tests/unit_tests/evaluation/criteria/test_eval_chain.py index f978fa70e77..56a892ceccc 100644 --- a/tests/unit_tests/evaluation/criteria/test_eval_chain.py +++ b/tests/unit_tests/evaluation/criteria/test_eval_chain.py @@ -1,6 +1,8 @@ """Test the criteria eval chain.""" +import pytest + from langchain.evaluation.criteria.eval_chain import ( _SUPPORTED_CRITERIA, CriteriaEvalChain, @@ -25,11 +27,25 @@ def test_criteria_eval_chain() -> None: ), criteria={"my criterion": "my criterion description"}, ) - result = chain.evaluate_strings( - prediction="my prediction", reference="my reference", input="my input" - ) + with pytest.warns(UserWarning, match=chain._skip_reference_warning): + result = chain.evaluate_strings( + prediction="my prediction", reference="my reference", input="my input" + ) assert result["reasoning"] == "The meaning of life" +def test_criteria_eval_chain_missing_reference() -> None: + chain = CriteriaEvalChain.from_llm( + llm=FakeLLM( + queries={"text": "The meaning of life\nY"}, + sequential_responses=True, + ), + requires_reference=True, + criteria={"my criterion": "my criterion description"}, + ) + with pytest.raises(ValueError): + chain.evaluate_strings(prediction="my prediction", input="my input") + + def test_implements_string_protocol() -> None: - assert isinstance(CriteriaEvalChain, StringEvaluator) + assert issubclass(CriteriaEvalChain, StringEvaluator) diff --git a/tests/unit_tests/evaluation/qa/test_eval_chain.py b/tests/unit_tests/evaluation/qa/test_eval_chain.py index 514fd28757c..acdad692e3f 100644 --- a/tests/unit_tests/evaluation/qa/test_eval_chain.py +++ b/tests/unit_tests/evaluation/qa/test_eval_chain.py @@ -52,7 +52,7 @@ def test_context_eval_chain(chain_cls: Type[ContextQAEvalChain]) -> None: def test_implements_string_evaluator_protocol( chain_cls: Type[LLMChain], ) -> None: - assert isinstance(chain_cls, StringEvaluator) + assert issubclass(chain_cls, StringEvaluator) @pytest.mark.parametrize("chain_cls", [QAEvalChain, ContextQAEvalChain, CotQAEvalChain])