diff --git a/docs/extras/guides/evaluation/comparisons.ipynb b/docs/extras/guides/evaluation/comparisons.ipynb new file mode 100644 index 00000000000..28da9942a83 --- /dev/null +++ b/docs/extras/guides/evaluation/comparisons.ipynb @@ -0,0 +1,447 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comparing Chain Outputs\n", + "\n", + "Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n", + "\n", + "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`[[1]](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n", + "\n", + "For this evaluation, we will need 3 things:\n", + "1. An evaluator\n", + "2. A dataset of inputs\n", + "3. Two (or more) LLMs, Chains, or Agents to compare\n", + "\n", + "Then we will aggregate the results to determine the preferred model.\n", + "\n", + "### Step 1. Create the Evaluator\n", + "\n", + "In this example, you will use gpt-4 to select which output is preferred." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional if you are tracing the notebook\n", + "%env LANGCHAIN_PROJECT=\"Comparing Chain Outputs\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.evaluation.comparison import PairwiseStringEvalChain\n", + "\n", + "llm = ChatOpenAI(model=\"gpt-4\")\n", + "\n", + "eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. Select Dataset\n", + "\n", + "If you already have real usage data for your LLM, you can use a representative sample. More examples\n", + "provide more reliable results. Here, we will use some example queries someone might have about how to use LangChain." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d852a1884480457292c90d8bd9d4f1e6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.evaluation.loading import load_dataset\n", + "\n", + "dataset = load_dataset(\"langchain-howto-queries\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. Define Models to Compare\n", + "\n", + "We will compare two agents that share the same search tool but use different agent types." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain import SerpAPIWrapper\n", + "from langchain.agents import initialize_agent, Tool\n", + "from langchain.agents import AgentType\n", + "\n", + "# Initialize the language model\n", + "# (requires an OpenAI API key, e.g. os.environ[\"OPENAI_API_KEY\"] = \"<your_openai_api_key>\")\n", + "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n", + "\n", + "# Initialize the SerpAPIWrapper for search functionality\n", + "# Set your SerpAPI key via the SERPAPI_API_KEY environment variable or pass serpapi_api_key=\"<your_serpapi_api_key>\" here.\n", + "search = SerpAPIWrapper()\n", + "\n", + "# Define a list of tools offered by the agent\n", + "tools = [\n", + " Tool(\n", + " name=\"Search\",\n", + " func=search.run,\n", + " coroutine=search.arun,\n", + " description=\"Useful when you need to answer questions about current events.
You should ask targeted questions.\"\n", + " ),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "functions_agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False)\n", + "conversations_agent = initialize_agent(tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. Generate Responses\n", + "\n", + "We will generate outputs for each of the models before evaluating them." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b076d6bf6680422aa9082d4bad4d98a3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/20 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n", + "Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n" + ] + } + ], + "source": [ + "from tqdm.notebook import tqdm\n", + "import asyncio\n", + "\n", + "results = []\n", + "agents = [functions_agent, conversations_agent]\n", + "concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n", + "\n", + "# We will only run the first 20 examples of this dataset to speed things up\n", + "# This will lead to larger confidence intervals downstream.\n", + "batch = []\n", + "for example in tqdm(dataset[:20]):\n", + " batch.extend([agent.acall(example['inputs']) for agent in agents])\n", + " if len(batch) >= concurrency_level:\n", + " batch_results = await asyncio.gather(*batch, return_exceptions=True)\n", + " results.extend(list(zip(*[iter(batch_results)]*2)))\n", + " batch = []\n", + "if batch:\n", + " batch_results = await asyncio.gather(*batch, return_exceptions=True)\n", + " results.extend(list(zip(*[iter(batch_results)]*2)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5. Evaluate Pairs\n", + "\n", + "Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n", + "\n", + "Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def predict_preferences(dataset, results) -> list:\n", + " preferences = []\n", + "\n", + " for example, (res_a, res_b) in zip(dataset, results):\n", + " input_ = example['inputs']\n", + " # Flip a coin to reduce persistent position bias\n", + " if random.random() < 0.5:\n", + " pred_a, pred_b = res_a, res_b\n", + " a, b = \"a\", \"b\"\n", + " else:\n", + " pred_a, pred_b = res_b, res_a\n", + " a, b = \"b\", \"a\"\n", + " eval_res = eval_chain.evaluate_string_pairs(\n", + " output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n", + " output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n", + " input=input_\n", + " )\n", + " if eval_res[\"value\"] == \"A\":\n", + " preferences.append(a)\n", + " elif eval_res[\"value\"] == \"B\":\n", + " preferences.append(b)\n", + " else:\n", + " preferences.append(None) # No preference\n", + " return preferences" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "preferences = predict_preferences(dataset, results)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "**Print out the ratio of preferences.**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenAI Functions Agent: 90.00%\n", + "Structured Chat Agent: 10.00%\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "name_map = {\n", + " \"a\": \"OpenAI Functions Agent\",\n", + " \"b\": \"Structured Chat Agent\",\n", + "}\n", + "counts = Counter(preferences)\n", + "pref_ratios = {\n", + " k: v/len(preferences) for k, v in\n", + " counts.items()\n", + "}\n", + "for k, v in pref_ratios.items():\n", + " print(f\"{name_map.get(k)}: {v:.2%}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Estimate Confidence Intervals\n", + "\n", + "The results seem pretty clear, but to get a better sense of how confident we can be that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n", + "\n", + "Below, use the Wilson score to estimate the confidence interval."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from math import sqrt\n", + "\n", + "def wilson_score_interval(preferences: list, which: str = \"a\", z: float = 1.96) -> tuple:\n", + " \"\"\"Estimate the confidence interval using the Wilson score.\n", + " \n", + " See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n", + " for more details, including when to use it and when it should not be used.\n", + " \"\"\"\n", + " total_preferences = preferences.count('a') + preferences.count('b')\n", + " n_s = preferences.count(which)\n", + "\n", + " if total_preferences == 0:\n", + " return (0, 0)\n", + "\n", + " p_hat = n_s / total_preferences\n", + "\n", + " denominator = 1 + (z**2) / total_preferences\n", + " adjustment = (z / denominator) * sqrt(p_hat*(1-p_hat)/total_preferences + (z**2)/(4*total_preferences*total_preferences))\n", + " center = (p_hat + (z**2) / (2*total_preferences)) / denominator\n", + " lower_bound = min(max(center - adjustment, 0.0), 1.0)\n", + " upper_bound = min(max(center + adjustment, 0.0), 1.0)\n", + "\n", + " return (lower_bound, upper_bound)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The \"OpenAI Functions Agent\" would be preferred between 69.90% and 97.21% of the time (with 95% confidence).\n", + "The \"Structured Chat Agent\" would be preferred between 2.79% and 30.10% of the time (with 95% confidence).\n" + ] + } + ], + "source": [ + "for which_, name in name_map.items():\n", + " low, high = wilson_score_interval(preferences, which=which_)\n", + " print(f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} of the time (with 95% confidence).')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Print out the p-value.**" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The p-value is 0.00040. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n", + "then there is a 0.04025% chance of observing the OpenAI Functions Agent being preferred at least 18\n", + "times out of 20 trials.\n" + ] + } + ], + "source": [ + "from scipy import stats\n", + "preferred_model = max(pref_ratios, key=pref_ratios.get)\n", + "successes = preferences.count(preferred_model)\n", + "n = len(preferences) - preferences.count(None)\n", + "p_value = stats.binom_test(successes, n, p=0.5, alternative='two-sided')\n", + "print(f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n", + "then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} being preferred at least {successes}\n", + "times out of {n} trials.\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a name=\"cite_note-1\"></a>_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. \n", + "LLM preferences exhibit biases, including banal ones like the order of outputs.\n", + "In choosing preferences, \"ground truth\" may not be taken into account, which can lead to scores that aren't grounded in utility._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
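# --- Illustrative aside (not part of the diff): a quick standalone check of the statistics used in
# the notebook above. It recomputes the 95% Wilson interval and the two-sided binomial p-value for
# the observed 18-of-20 preference count, assuming the same z = 1.96 the notebook uses.
from math import sqrt

from scipy import stats

successes, total = 18, 20
z = 1.96
p_hat = successes / total

# Wilson score interval, mirroring the notebook's wilson_score_interval function
denominator = 1 + z**2 / total
center = (p_hat + z**2 / (2 * total)) / denominator
adjustment = (z / denominator) * sqrt(p_hat * (1 - p_hat) / total + z**2 / (4 * total**2))
print(f"95% Wilson interval: ({center - adjustment:.2%}, {center + adjustment:.2%})")
# -> roughly (69.90%, 97.21%), matching the notebook output

# Two-sided exact binomial test against p = 0.5
p_value = stats.binom_test(successes, total, p=0.5, alternative="two-sided")
print(f"p-value: {p_value:.5f}")  # -> about 0.00040
# On SciPy versions that have removed binom_test, the equivalent is:
# p_value = stats.binomtest(successes, total, p=0.5, alternative="two-sided").pvalue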
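# --- Illustrative aside (not part of the diff): a sketch of the reference-grounded variant that
# eval_chain.py below wires up. Passing require_reference=True selects PROMPT_WITH_REFERENCE, so
# the judge also weighs accuracy against a ground-truth answer. The model choice and the
# question/answers here are illustrative only.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.comparison import PairwiseStringEvalChain

llm = ChatOpenAI(temperature=0)
chain = PairwiseStringEvalChain.from_llm(llm=llm, require_reference=True)

result = chain.evaluate_string_pairs(
    input="When was the LangChain library first released?",
    output_a="Sometime in 2023.",
    output_b="LangChain was first released in October 2022.",
    reference="LangChain was first released in October 2022.",
)
# The chain returns the parsed verdict dict described in the module docstring above:
# {"reasoning": "...", "value": "A" | "B" | None, "score": 1 | 0 | 0.5}
print(result["value"], result["score"])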
diff --git a/langchain/evaluation/comparison/eval_chain.py b/langchain/evaluation/comparison/eval_chain.py new file mode 100644 index 00000000000..f8f13605877 --- /dev/null +++ b/langchain/evaluation/comparison/eval_chain.py @@ -0,0 +1,205 @@ +"""Base classes for comparing the output of two models.""" +from __future__ import annotations + +from typing import Any, Optional + +from pydantic import Field + +from langchain.base_language import BaseLanguageModel +from langchain.callbacks.manager import Callbacks +from langchain.chains.llm import LLMChain +from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE +from langchain.prompts.prompt import PromptTemplate +from langchain.schema import BaseOutputParser + + +class PairwiseStringResultOutputParser(BaseOutputParser[dict]): + """A parser for the output of the PairwiseStringEvalChain.""" + + @property + def _type(self) -> str: + return "pairwise_string_result" + + def parse(self, text: str) -> Any: + """Parse the output text. + + Args: + text (str): The output text to parse. + + Returns: + Any: The parsed output. + """ + reasoning, verdict = text.strip().rsplit("\n", maxsplit=1) + verdict = verdict.strip("[").strip("]") + if verdict not in {"A", "B", "C"}: + raise ValueError( + f"Invalid verdict: {verdict}. " + "Verdict must be one of 'A', 'B', or 'C'." + ) + # C means the models are tied. Return 'None' meaning no preference + verdict_ = None if verdict == "C" else verdict + score = { + "A": 1, + "B": 0, + None: 0.5, + }.get(verdict_) + return { + "reasoning": reasoning, + "value": verdict_, + "score": score, + } + + +class PairwiseStringEvalChain(LLMChain): + """A chain for comparing the output of two models. + + Example: + >>> from langchain.chat_models import ChatOpenAI + >>> from langchain.evaluation.comparison import PairwiseStringEvalChain + >>> llm = ChatOpenAI(temperature=0) + >>> chain = PairwiseStringEvalChain.from_llm(llm=llm) + >>> result = chain.evaluate_string_pairs( + ... input = "What is the chemical formula for water?", + ... output_a = "H2O", + ... output_b = ( + ... "The chemical formula for water is H2O, which means" + ... " there are two hydrogen atoms and one oxygen atom." + ... ), + ... reference = "The chemical formula for water is H2O.", + ... ) + >>> print(result) + # { + # "value": "B", + # "score": 0, + # "reasoning": "Both responses accurately state" + # " that the chemical formula for water is H2O." + # " However, Response B provides additional information" + # " by explaining what the formula means." + # } + """ + + output_parser: BaseOutputParser = Field( + default_factory=PairwiseStringResultOutputParser + ) + + @classmethod + def from_llm( + cls, + *, + llm: BaseLanguageModel, + prompt: Optional[PromptTemplate] = None, + require_reference: bool = False, + **kwargs: Any, + ) -> PairwiseStringEvalChain: + """Initialize the PairwiseStringEvalChain from an LLM. + + Args: + llm (BaseLanguageModel): The LLM to use. + prompt (PromptTemplate, optional): The prompt to use. + require_reference (bool, optional): Whether to require a reference + string. Defaults to False.
+ **kwargs (Any): Additional keyword arguments. + + Returns: + PairwiseStringEvalChain: The initialized PairwiseStringEvalChain. + """ + expected_input_vars = {"output_a", "output_b", "input"} + if prompt is None: + if require_reference: + expected_input_vars.add("reference") + prompt_ = PROMPT_WITH_REFERENCE + else: + prompt_ = PROMPT + else: + if require_reference: + expected_input_vars.add("reference") + prompt_ = prompt + + if expected_input_vars != set(prompt_.input_variables): + raise ValueError( + f"Input variables should be {expected_input_vars}, " + f"but got {prompt_.input_variables}" + ) + return cls(llm=llm, prompt=prompt_, **kwargs) + + def _prepare_input( + self, output_a: str, output_b: str, input: str, reference: Optional[str] + ) -> dict: + input_ = { + "output_a": output_a, + "output_b": output_b, + "input": input, + } + if reference is not None and "reference" in self.prompt.input_variables: + input_["reference"] = reference + return input_ + + def evaluate_string_pairs( + self, + *, + output_a: str, + output_b: str, + input: str, + reference: Optional[str] = None, + callbacks: Callbacks = None, + **kwargs: Any, + ) -> dict: + """Evaluate whether output A is preferred to output B. + + Args: + output_a (str): The output string from the first model. + output_b (str): The output string from the second model. + input (str): The input or task string. + callbacks (Callbacks, optional): The callbacks to use. + reference (str, optional): The reference string, if any. + **kwargs (Any): Additional keyword arguments. + + Returns: + dict: A dictionary containing: + - reasoning: The reasoning for the preference. + - value: The preference value, which is either 'A', 'B', or None + for no preference. + - score: The preference score, which is 1 for 'A', 0 for 'B', + and 0.5 for None. + """ + input_ = self._prepare_input(output_a, output_b, input, reference) + result = self( + inputs=input_, + callbacks=callbacks, + **kwargs, + ) + return result["text"] + + async def aevaluate_string_pairs( + self, + *, + output_a: str, + output_b: str, + input: str, + reference: Optional[str] = None, + callbacks: Callbacks = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate whether output A is preferred to output B. + + Args: + output_a (str): The output string from the first model. + output_b (str): The output string from the second model. + input (str): The input or task string. + callbacks (Callbacks, optional): The callbacks to use. + reference (str, optional): The reference string, if any. + **kwargs (Any): Additional keyword arguments. + + Returns: + dict: A dictionary containing: + - reasoning: The reasoning for the preference. + - value: The preference value, which is either 'A', 'B', or None + for no preference. + - score: The preference score, which is 1 for 'A', 0 for 'B', + and 0.5 for None. + """ + input_ = self._prepare_input(output_a, output_b, input, reference) + result = await self.acall( + inputs=input_, + callbacks=callbacks, + **kwargs, + ) + return result["text"] diff --git a/langchain/evaluation/comparison/prompt.py b/langchain/evaluation/comparison/prompt.py new file mode 100644 index 00000000000..15f9b60569a --- /dev/null +++ b/langchain/evaluation/comparison/prompt.py @@ -0,0 +1,64 @@ +"""Prompts for comparing the outputs of two models for a given question. + +This prompt is used to compare two responses and evaluate which one best follows the instructions +and answers the question. The prompt is based on the paper from +Zheng, et. al. 
https://arxiv.org/abs/2306.05685 +""" +# flake8: noqa +from langchain.prompts import PromptTemplate + +template = """Act as a fair judge and rate the two responses to the question below.\ + Choose the response that best followed the instructions and answered the question.\ + Your assessment should weigh helpfulness, relevance, accuracy, depth, creativity, and detail.\ + Start by comparing both responses and give a brief rationale.\ + Avoid bias from the order of presentation or response length. +After giving your rationale, make your final decision using this format:\ + "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\ + and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line. + +[QUESTION] +{input} +[/QUESTION] + +[RESPONSE A] +{output_a} +[/RESPONSE A] + +[RESPONSE B] +{output_b} +[/RESPONSE B]""" +PROMPT = PromptTemplate( + input_variables=["input", "output_a", "output_b"], template=template +) + +template = """Act as a fair judge and rate the two responses to the question below.\ + Choose the response that best followed the instructions and answered the question.\ + Your assessment should weigh helpfulness, relevance, accuracy, depth, creativity, and detail.\ + Start by comparing both responses and give a brief rationale.\ + Avoid bias from the order of presentation or response length.\ + Weigh accuracy based on the following ground truth reference\ + answer to the question: + +[REFERENCE] +{reference} +[/REFERENCE] + +After giving your rationale, make your final decision using this format:\ + "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\ + and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line. + +[QUESTION] +{input} +[/QUESTION] + +[RESPONSE A] +{output_a} +[/RESPONSE A] + +[RESPONSE B] +{output_b} +[/RESPONSE B]""" + +PROMPT_WITH_REFERENCE = PromptTemplate( + input_variables=["input", "output_a", "output_b", "reference"], template=template +) diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py index b9a1231c1d2..25489eebb0a 100644 --- a/langchain/evaluation/schema.py +++ b/langchain/evaluation/schema.py @@ -14,7 +14,7 @@ class StringEvaluator(Protocol): prediction: str, reference: Optional[str] = None, input: Optional[str] = None, - **kwargs: Any + **kwargs: Any, ) -> dict: """Evaluate Chain or LLM output, based on optional input and label. @@ -34,7 +34,7 @@ class StringEvaluator(Protocol): prediction: str, reference: Optional[str] = None, input: Optional[str] = None, - **kwargs: Any + **kwargs: Any, ) -> dict: """Asynchronously evaluate Chain or LLM output, based on optional input and label. @@ -48,6 +48,66 @@ class StringEvaluator(Protocol): Returns: dict: The evaluation results containing the score or value. """ - return self.evaluate_strings( - prediction=prediction, reference=reference, input=input, **kwargs + raise NotImplementedError( + f"{self.__class__.__name__} hasn't implemented an " + "async aevaluate_strings method." + ) + + +@runtime_checkable +class PairwiseStringEvaluator(Protocol): + """A protocol for comparing the output of two models.""" + + @abstractmethod + def evaluate_string_pairs( + self, + *, + output_a: str, + output_b: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate the output string pairs. + + Args: + output_a (str): The output string from the first model. + output_b (str): The output string from the second model. 
+ reference (str, optional): The expected output / reference + string. Defaults to None. + input (str, optional): The input string. Defaults to None. + **kwargs (Any): Additional keyword arguments, such + as callbacks and optional reference strings. + + Returns: + dict: A dictionary containing the preference, scores, and/or + other information. + """ + + async def aevaluate_string_pairs( + self, + output_a: str, + output_b: str, + reference: Optional[str] = None, + input: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate the output string pairs. + + Args: + output_a (str): The output string from the first model. + output_b (str): The output string from the second model. + reference (str, optional): The expected output / reference + string. Defaults to None. + input (str, optional): The input string. Defaults to None. + **kwargs (Any): Additional keyword arguments, such + as callbacks and optional reference strings. + + Returns: + dict: A dictionary containing the preference, scores, and/or + other information. + """ + raise NotImplementedError( + f"{self.__class__.__name__} hasn't implemented an async " + "aevaluate_string_pairs method." ) diff --git a/tests/unit_tests/evaluation/comparison/__init__.py b/tests/unit_tests/evaluation/comparison/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit_tests/evaluation/comparison/test_eval_chain.py b/tests/unit_tests/evaluation/comparison/test_eval_chain.py new file mode 100644 index 00000000000..9cf4ca8c670 --- /dev/null +++ b/tests/unit_tests/evaluation/comparison/test_eval_chain.py @@ -0,0 +1,39 @@ +"""Test the comparison chains.""" + + +from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain +from tests.unit_tests.llms.fake_llm import FakeLLM + + +def test_pairwise_string_comparison_chain() -> None: + llm = FakeLLM( + queries={ + "a": "The values are the same.\n[[C]]", + "b": "A is clearly better than b.\n[[A]]", + "c": "B is clearly better than a.\n[[B]]", + }, + sequential_responses=True, + ) + chain = PairwiseStringEvalChain.from_llm(llm=llm) + res = chain.evaluate_string_pairs( + output_a="I like pie.", + output_b="I love pie.", + input="What is your favorite food?", + ) + assert res["value"] is None + assert res["score"] == 0.5 + assert res["reasoning"] == "The values are the same." + res = chain.evaluate_string_pairs( + output_a="I like pie.", + output_b="I like pie.", + input="What is your favorite food?", + ) + assert res["value"] == "A" + assert res["score"] == 1 + res = chain.evaluate_string_pairs( + output_a="I like pie.", + output_b="I hate pie.", + input="What is your favorite food?", + ) + assert res["value"] == "B" + assert res["score"] == 0
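# --- Illustrative aside (not part of the diff): a minimal sketch of a custom comparator that
# satisfies the PairwiseStringEvaluator protocol added in langchain/evaluation/schema.py. The
# length-based rule is a toy stand-in for a real judging strategy; any object providing these
# two methods passes the @runtime_checkable isinstance check.
from typing import Any, Optional

from langchain.evaluation.schema import PairwiseStringEvaluator


class LengthBasedPairwiseEvaluator:
    """Prefer the shorter of the two outputs (a deliberately trivial heuristic)."""

    def evaluate_string_pairs(
        self,
        *,
        output_a: str,
        output_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        if len(output_a) == len(output_b):
            return {"value": None, "score": 0.5}
        value = "A" if len(output_a) < len(output_b) else "B"
        return {"value": value, "score": 1 if value == "A" else 0}

    async def aevaluate_string_pairs(
        self,
        *,
        output_a: str,
        output_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        # Delegate to the sync implementation; a real evaluator might await an LLM call here.
        return self.evaluate_string_pairs(
            output_a=output_a, output_b=output_b, reference=reference, input=input, **kwargs
        )


evaluator = LengthBasedPairwiseEvaluator()
assert isinstance(evaluator, PairwiseStringEvaluator)  # structural check via @runtime_checkable
print(evaluator.evaluate_string_pairs(output_a="short", output_b="a much longer answer"))
# -> {'value': 'A', 'score': 1}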