Simplify eval arg names (#6944)

It'll be easier to switch between these evaluators if the prediction argument names are consistent.
William FH
2023-06-30 07:47:53 -07:00
committed by GitHub
parent 8f5eca236f
commit 8c73037dff
8 changed files with 52 additions and 51 deletions

View File

@@ -243,8 +243,8 @@
 "        pred_a, pred_b = res_b, res_a\n",
 "        a, b = \"b\", \"a\"\n",
 "    eval_res = eval_chain.evaluate_string_pairs(\n",
-"        output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
-"        output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
+"        prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
+"        prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
 "        input=input_\n",
 "    )\n",
 "    if eval_res[\"value\"] == \"A\":\n",

View File

@@ -105,7 +105,7 @@ class TrajectoryEvalChain(Chain):
         result = eval_chain.evaluate_agent_trajectory(
             input=question,
             agent_trajectory=response["intermediate_steps"],
-            output=response["output"],
+            prediction=response["output"],
             reference="Paris",
         )
         print(result["score"])
@@ -325,9 +325,9 @@ The following is the expected answer. Use this to measure correctness:
     def evaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -338,7 +338,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -347,7 +347,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return self(inputs=inputs, callbacks=callbacks, **kwargs)
@@ -355,9 +355,9 @@ The following is the expected answer. Use this to measure correctness:
     async def aevaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -368,7 +368,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -377,7 +377,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return await self.acall(
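
Putting the renamed trajectory API together, a minimal end-to-end sketch: the agent itself is assumed (any AgentExecutor built with return_intermediate_steps=True), and the from_llm factory is assumed to match the docstring usage above.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.agents import TrajectoryEvalChain

llm = ChatOpenAI(temperature=0)  # any chat model can serve as the judge
eval_chain = TrajectoryEvalChain.from_llm(llm=llm)

# `agent` is assumed: an AgentExecutor with return_intermediate_steps=True.
response = agent("What is the capital of France?")
result = eval_chain.evaluate_agent_trajectory(
    input="What is the capital of France?",
    agent_trajectory=response["intermediate_steps"],
    prediction=response["output"],  # previously: output=...
    reference="Paris",
)
print(result["score"])  # trajectory grade, 1-5 as exercised in the tests below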

View File

@@ -12,8 +12,8 @@ Example:
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm) >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs( >>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?", ... input = "What is the chemical formula for water?",
... output_a = "H2O", ... prediction = "H2O",
... output_b = ( ... prediction_b = (
... "The chemical formula for water is H2O, which means" ... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom." ... " there are two hydrogen atoms and one oxygen atom."
... referenc = "The chemical formula for water is H2O.", ... referenc = "The chemical formula for water is H2O.",
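
Two pre-existing defects in this docstring example survive the rename unchanged: `referenc` is a typo for `reference`, and the parenthesized `prediction_b` string is never closed. Untangled, the intended call would read:

result = chain.evaluate_string_pairs(
    input="What is the chemical formula for water?",
    prediction="H2O",
    prediction_b=(
        "The chemical formula for water is H2O, which means"
        " there are two hydrogen atoms and one oxygen atom."
    ),
    reference="The chemical formula for water is H2O.",
)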

View File

@@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain):
     >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
     >>> result = chain.evaluate_string_pairs(
     ...     input = "What is the chemical formula for water?",
-    ...     output_a = "H2O",
-    ...     output_b = (
+    ...     prediction = "H2O",
+    ...     prediction_b = (
     ...         "The chemical formula for water is H2O, which means"
     ...         " there are two hydrogen atoms and one oxygen atom."
     ...     referenc = "The chemical formula for water is H2O.",
@@ -101,7 +101,7 @@ class PairwiseStringEvalChain(LLMChain):
         Returns:
             PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
         """
-        expected_input_vars = {"output_a", "output_b", "input"}
+        expected_input_vars = {"prediction", "prediction_b", "input"}
         if prompt is None:
             if require_reference:
                 expected_input_vars.add("reference")
@@ -121,11 +121,11 @@ class PairwiseStringEvalChain(LLMChain):
         return cls(llm=llm, prompt=prompt_, **kwargs)

     def _prepare_input(
-        self, output_a: str, output_b: str, input: str, reference: Optional[str]
+        self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
     ) -> dict:
         input_ = {
-            "output_a": output_a,
-            "output_b": output_b,
+            "prediction": prediction,
+            "prediction_b": prediction_b,
             "input": input,
         }
         if reference is not None and "reference" in self.prompt.input_variables:
@@ -135,8 +135,8 @@ class PairwiseStringEvalChain(LLMChain):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -145,8 +145,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Evaluate whether output A is preferred to output B.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -160,7 +160,7 @@ class PairwiseStringEvalChain(LLMChain):
                 - score: The preference score, which is 1 for 'A', 0 for 'B',
                     and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = self(
             inputs=input_,
             callbacks=callbacks,
@@ -171,8 +171,8 @@ class PairwiseStringEvalChain(LLMChain):
     async def aevaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -181,8 +181,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Asynchronously evaluate whether output A is preferred to output B.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -196,7 +196,7 @@ class PairwiseStringEvalChain(LLMChain):
                 - score: The preference score, which is 1 for 'A', 0 for 'B',
                     and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = await self.acall(
             inputs=input_,
             callbacks=callbacks,
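
As a quick reference for the result shape documented above, a hedged usage sketch (llm is assumed to be any chat model):

chain = PairwiseStringEvalChain.from_llm(llm=llm)
res = chain.evaluate_string_pairs(
    prediction="I like pie.",
    prediction_b="I love pie.",
    input="What is your favorite food?",
)
# Per the docstrings above:
#   res["value"]     -> "A", "B", or None for a tie
#   res["score"]     -> 1 for "A", 0 for "B", 0.5 for a tie
#   res["reasoning"] -> the judge model's rationale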

View File

@@ -21,14 +21,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT = PromptTemplate(
-    input_variables=["input", "output_a", "output_b"], template=template
+    input_variables=["input", "prediction", "prediction_b"], template=template
 )

 template = """Act as a fair judge and rate the two responses to the question below.\
@@ -52,13 +52,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "output_a", "output_b", "reference"], template=template
+    input_variables=["input", "prediction", "prediction_b", "reference"],
+    template=template,
 )
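
Since the template variables changed too, a quick sanity check (not part of the diff) is to format the prompt by hand with the new names:

text = PROMPT.format(
    input="What is your favorite food?",
    prediction="I like pie.",
    prediction_b="I love pie.",
)
# The [RESPONSE A] and [RESPONSE B] blocks are filled from
# prediction and prediction_b respectively.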

View File

@@ -62,8 +62,8 @@ class PairwiseStringEvaluator(Protocol):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -71,8 +71,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.
@@ -86,8 +86,8 @@ class PairwiseStringEvaluator(Protocol):
     async def aevaluate_string_pairs(
         self,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -95,8 +95,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.
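
Custom comparators that target this protocol must adopt the renamed keywords as well. A toy implementation, for illustration only (exact-match against the reference; the class name is hypothetical and not from the diff):

from typing import Any, Optional

class ExactMatchPairwiseEvaluator:
    """Illustrative PairwiseStringEvaluator: prefers whichever
    prediction matches the reference exactly; otherwise a tie."""

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        a_hit = reference is not None and prediction == reference
        b_hit = reference is not None and prediction_b == reference
        if a_hit and not b_hit:
            return {"value": "A", "score": 1.0}
        if b_hit and not a_hit:
            return {"value": "B", "score": 0.0}
        return {"value": None, "score": 0.5}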

View File

@@ -45,14 +45,14 @@ def test_trajectory_eval_chain(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     # Test when ref is provided
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1
@@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1

View File

@@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None:
) )
chain = PairwiseStringEvalChain.from_llm(llm=llm) chain = PairwiseStringEvalChain.from_llm(llm=llm)
res = chain.evaluate_string_pairs( res = chain.evaluate_string_pairs(
output_a="I like pie.", prediction="I like pie.",
output_b="I love pie.", prediction_b="I love pie.",
input="What is your favorite food?", input="What is your favorite food?",
) )
assert res["value"] is None assert res["value"] is None
assert res["score"] == 0.5 assert res["score"] == 0.5
assert res["reasoning"] == "The values are the same." assert res["reasoning"] == "The values are the same."
res = chain.evaluate_string_pairs( res = chain.evaluate_string_pairs(
output_a="I like pie.", prediction="I like pie.",
output_b="I like pie.", prediction_b="I like pie.",
input="What is your favorite food?", input="What is your favorite food?",
) )
assert res["value"] == "A" assert res["value"] == "A"
assert res["score"] == 1 assert res["score"] == 1
res = chain.evaluate_string_pairs( res = chain.evaluate_string_pairs(
output_a="I like pie.", prediction="I like pie.",
output_b="I hate pie.", prediction_b="I hate pie.",
input="What is your favorite food?", input="What is your favorite food?",
) )
assert res["value"] == "B" assert res["value"] == "B"