From 8c73037dff117ef059195443d04257b986372a5a Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Fri, 30 Jun 2023 07:47:53 -0700 Subject: [PATCH] Simplify eval arg names (#6944) It'll be easier to switch between these if the names of predictions are consistent --- .../guides/evaluation/comparisons.ipynb | 4 +-- .../agents/trajectory_eval_chain.py | 14 ++++---- langchain/evaluation/comparison/__init__.py | 4 +-- langchain/evaluation/comparison/eval_chain.py | 32 +++++++++---------- langchain/evaluation/comparison/prompt.py | 13 ++++---- langchain/evaluation/schema.py | 16 +++++----- .../evaluation/agents/test_eval_chain.py | 8 ++--- .../evaluation/comparison/test_eval_chain.py | 12 +++---- 8 files changed, 52 insertions(+), 51 deletions(-) diff --git a/docs/extras/guides/evaluation/comparisons.ipynb b/docs/extras/guides/evaluation/comparisons.ipynb index 5ee05161867..c59c5cb0bc6 100644 --- a/docs/extras/guides/evaluation/comparisons.ipynb +++ b/docs/extras/guides/evaluation/comparisons.ipynb @@ -243,8 +243,8 @@ " pred_a, pred_b = res_b, res_a\n", " a, b = \"b\", \"a\"\n", " eval_res = eval_chain.evaluate_string_pairs(\n", - " output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n", - " output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n", + " prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n", + " prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n", " input=input_\n", " )\n", " if eval_res[\"value\"] == \"A\":\n", diff --git a/langchain/evaluation/agents/trajectory_eval_chain.py b/langchain/evaluation/agents/trajectory_eval_chain.py index 8d4f837def4..184bcbfcee4 100644 --- a/langchain/evaluation/agents/trajectory_eval_chain.py +++ b/langchain/evaluation/agents/trajectory_eval_chain.py @@ -105,7 +105,7 @@ class TrajectoryEvalChain(Chain): result = eval_chain.evaluate_agent_trajectory( input=question, agent_trajectory=response["intermediate_steps"], - output=response["output"], + prediction=response["output"], reference="Paris", ) print(result["score"]) @@ -325,9 +325,9 @@ The following is the expected answer. Use this to measure correctness: def evaluate_agent_trajectory( self, *, + prediction: str, input: str, agent_trajectory: Union[str, List[Tuple[AgentAction, str]]], - output: str, reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, @@ -338,7 +338,7 @@ The following is the expected answer. Use this to measure correctness: input (str): The input question. agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]): The intermediate steps forming the agent trajectory. - output (str): The expected output. + prediction (str): The expected prediction. reference (Optional[str]): The reference answer. Returns: @@ -347,7 +347,7 @@ The following is the expected answer. Use this to measure correctness: inputs = { "question": input, "agent_trajectory": self.get_agent_trajectory(agent_trajectory), - "answer": output, + "answer": prediction, "reference": self._format_reference(reference), } return self(inputs=inputs, callbacks=callbacks, **kwargs) @@ -355,9 +355,9 @@ The following is the expected answer. Use this to measure correctness: async def aevaluate_agent_trajectory( self, *, + prediction: str, input: str, agent_trajectory: Union[str, List[Tuple[AgentAction, str]]], - output: str, reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, @@ -368,7 +368,7 @@ The following is the expected answer. 
Use this to measure correctness: input (str): The input question. agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]): The intermediate steps forming the agent trajectory. - output (str): The expected output. + prediction (str): The expected prediction. reference (Optional[str]): The reference answer. Returns: @@ -377,7 +377,7 @@ The following is the expected answer. Use this to measure correctness: inputs = { "question": input, "agent_trajectory": self.get_agent_trajectory(agent_trajectory), - "answer": output, + "answer": prediction, "reference": self._format_reference(reference), } return await self.acall( diff --git a/langchain/evaluation/comparison/__init__.py b/langchain/evaluation/comparison/__init__.py index 3d84c8a267f..50cd5156896 100644 --- a/langchain/evaluation/comparison/__init__.py +++ b/langchain/evaluation/comparison/__init__.py @@ -12,8 +12,8 @@ Example: >>> chain = PairwiseStringEvalChain.from_llm(llm=llm) >>> result = chain.evaluate_string_pairs( ... input = "What is the chemical formula for water?", - ... output_a = "H2O", - ... output_b = ( + ... prediction = "H2O", + ... prediction_b = ( ... "The chemical formula for water is H2O, which means" ... " there are two hydrogen atoms and one oxygen atom." ... referenc = "The chemical formula for water is H2O.", diff --git a/langchain/evaluation/comparison/eval_chain.py b/langchain/evaluation/comparison/eval_chain.py index f8f13605877..3c3f4f666bf 100644 --- a/langchain/evaluation/comparison/eval_chain.py +++ b/langchain/evaluation/comparison/eval_chain.py @@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain): >>> chain = PairwiseStringEvalChain.from_llm(llm=llm) >>> result = chain.evaluate_string_pairs( ... input = "What is the chemical formula for water?", - ... output_a = "H2O", - ... output_b = ( + ... prediction = "H2O", + ... prediction_b = ( ... "The chemical formula for water is H2O, which means" ... " there are two hydrogen atoms and one oxygen atom." ... referenc = "The chemical formula for water is H2O.", @@ -101,7 +101,7 @@ class PairwiseStringEvalChain(LLMChain): Returns: PairwiseStringEvalChain: The initialized PairwiseStringEvalChain. """ - expected_input_vars = {"output_a", "output_b", "input"} + expected_input_vars = {"prediction", "prediction_b", "input"} if prompt is None: if require_reference: expected_input_vars.add("reference") @@ -121,11 +121,11 @@ class PairwiseStringEvalChain(LLMChain): return cls(llm=llm, prompt=prompt_, **kwargs) def _prepare_input( - self, output_a: str, output_b: str, input: str, reference: Optional[str] + self, prediction: str, prediction_b: str, input: str, reference: Optional[str] ) -> dict: input_ = { - "output_a": output_a, - "output_b": output_b, + "prediction": prediction, + "prediction_b": prediction_b, "input": input, } if reference is not None and "reference" in self.prompt.input_variables: @@ -135,8 +135,8 @@ class PairwiseStringEvalChain(LLMChain): def evaluate_string_pairs( self, *, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, input: str, reference: Optional[str] = None, callbacks: Callbacks = None, @@ -145,8 +145,8 @@ class PairwiseStringEvalChain(LLMChain): """Evaluate whether output A is preferred to output B. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. input (str): The input or task string. 
callbacks (Callbacks, optional): The callbacks to use. reference (str, optional): The reference string, if any. @@ -160,7 +160,7 @@ class PairwiseStringEvalChain(LLMChain): - score: The preference score, which is 1 for 'A', 0 for 'B', and 0.5 for None. """ - input_ = self._prepare_input(output_a, output_b, input, reference) + input_ = self._prepare_input(prediction, prediction_b, input, reference) result = self( inputs=input_, callbacks=callbacks, @@ -171,8 +171,8 @@ class PairwiseStringEvalChain(LLMChain): async def aevaluate_string_pairs( self, *, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, input: str, reference: Optional[str] = None, callbacks: Callbacks = None, @@ -181,8 +181,8 @@ class PairwiseStringEvalChain(LLMChain): """Asynchronously evaluate whether output A is preferred to output B. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. input (str): The input or task string. callbacks (Callbacks, optional): The callbacks to use. reference (str, optional): The reference string, if any. @@ -196,7 +196,7 @@ class PairwiseStringEvalChain(LLMChain): - score: The preference score, which is 1 for 'A', 0 for 'B', and 0.5 for None. """ - input_ = self._prepare_input(output_a, output_b, input, reference) + input_ = self._prepare_input(prediction, prediction_b, input, reference) result = await self.acall( inputs=input_, callbacks=callbacks, diff --git a/langchain/evaluation/comparison/prompt.py b/langchain/evaluation/comparison/prompt.py index 15f9b60569a..f5d9846495c 100644 --- a/langchain/evaluation/comparison/prompt.py +++ b/langchain/evaluation/comparison/prompt.py @@ -21,14 +21,14 @@ After giving your rationale, make your final decision using this format:\ [/QUESTION] [RESPONSE A] -{output_a} +{prediction} [/RESPONSE A] [RESPONSE B] -{output_b} +{prediction_b} [/RESPONSE B]""" PROMPT = PromptTemplate( - input_variables=["input", "output_a", "output_b"], template=template + input_variables=["input", "prediction", "prediction_b"], template=template ) template = """Act as a fair judge and rate the two responses to the question below.\ @@ -52,13 +52,14 @@ After giving your rationale, make your final decision using this format:\ [/QUESTION] [RESPONSE A] -{output_a} +{prediction} [/RESPONSE A] [RESPONSE B] -{output_b} +{prediction_b} [/RESPONSE B]""" PROMPT_WITH_REFERENCE = PromptTemplate( - input_variables=["input", "output_a", "output_b", "reference"], template=template + input_variables=["input", "prediction", "prediction_b", "reference"], + template=template, ) diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py index 25489eebb0a..8c3362088ed 100644 --- a/langchain/evaluation/schema.py +++ b/langchain/evaluation/schema.py @@ -62,8 +62,8 @@ class PairwiseStringEvaluator(Protocol): def evaluate_string_pairs( self, *, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, reference: Optional[str] = None, input: Optional[str] = None, **kwargs: Any, @@ -71,8 +71,8 @@ class PairwiseStringEvaluator(Protocol): """Evaluate the output string pairs. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. 
reference (str, optional): The expected output / reference string. Defaults to None. input (str, optional): The input string. Defaults to None. @@ -86,8 +86,8 @@ class PairwiseStringEvaluator(Protocol): async def aevaluate_string_pairs( self, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, reference: Optional[str] = None, input: Optional[str] = None, **kwargs: Any, @@ -95,8 +95,8 @@ class PairwiseStringEvaluator(Protocol): """Evaluate the output string pairs. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. reference (str, optional): The expected output / reference string. Defaults to None. input (str, optional): The input string. Defaults to None. diff --git a/tests/unit_tests/evaluation/agents/test_eval_chain.py b/tests/unit_tests/evaluation/agents/test_eval_chain.py index 3a82f073dc2..59fa3de0173 100644 --- a/tests/unit_tests/evaluation/agents/test_eval_chain.py +++ b/tests/unit_tests/evaluation/agents/test_eval_chain.py @@ -45,14 +45,14 @@ def test_trajectory_eval_chain( res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", ) assert res["score"] == 5 # Test when ref is provided res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", reference="Paris", ) assert res["score"] == 1 @@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools( res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", ) assert res["score"] == 5 res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", reference="Paris", ) assert res["score"] == 1 diff --git a/tests/unit_tests/evaluation/comparison/test_eval_chain.py b/tests/unit_tests/evaluation/comparison/test_eval_chain.py index 9cf4ca8c670..4a96b43e18c 100644 --- a/tests/unit_tests/evaluation/comparison/test_eval_chain.py +++ b/tests/unit_tests/evaluation/comparison/test_eval_chain.py @@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None: ) chain = PairwiseStringEvalChain.from_llm(llm=llm) res = chain.evaluate_string_pairs( - output_a="I like pie.", - output_b="I love pie.", + prediction="I like pie.", + prediction_b="I love pie.", input="What is your favorite food?", ) assert res["value"] is None assert res["score"] == 0.5 assert res["reasoning"] == "The values are the same." res = chain.evaluate_string_pairs( - output_a="I like pie.", - output_b="I like pie.", + prediction="I like pie.", + prediction_b="I like pie.", input="What is your favorite food?", ) assert res["value"] == "A" assert res["score"] == 1 res = chain.evaluate_string_pairs( - output_a="I like pie.", - output_b="I hate pie.", + prediction="I like pie.", + prediction_b="I hate pie.", input="What is your favorite food?", ) assert res["value"] == "B"
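
A minimal usage sketch of the renamed keyword arguments, mirroring the updated docstring example above (this sketch is not part of the patch itself and assumes an OpenAI API key is configured in the environment):

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.comparison import PairwiseStringEvalChain

    llm = ChatOpenAI(temperature=0)
    chain = PairwiseStringEvalChain.from_llm(llm=llm)
    # After this change, the first response goes in `prediction` and the
    # second in `prediction_b` (previously `output_a` / `output_b`).
    result = chain.evaluate_string_pairs(
        input="What is the chemical formula for water?",
        prediction="H2O",
        prediction_b=(
            "The chemical formula for water is H2O, which means"
            " there are two hydrogen atoms and one oxygen atom."
        ),
        reference="The chemical formula for water is H2O.",
    )
    print(result["value"], result["score"], result["reasoning"])

With the default prompt the optional reference is ignored; passing require_reference=True to from_llm selects the reference-aware prompt so the reference string is included in the comparison.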