From 8c73037dff117ef059195443d04257b986372a5a Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Fri, 30 Jun 2023 07:47:53 -0700 Subject: [PATCH] Simplify eval arg names (#6944) It'll be easier to switch between these if the names of predictions are consistent --- .../guides/evaluation/comparisons.ipynb | 4 +-- .../agents/trajectory_eval_chain.py | 14 ++++---- langchain/evaluation/comparison/__init__.py | 4 +-- langchain/evaluation/comparison/eval_chain.py | 32 +++++++++---------- langchain/evaluation/comparison/prompt.py | 13 ++++---- langchain/evaluation/schema.py | 16 +++++----- .../evaluation/agents/test_eval_chain.py | 8 ++--- .../evaluation/comparison/test_eval_chain.py | 12 +++---- 8 files changed, 52 insertions(+), 51 deletions(-) diff --git a/docs/extras/guides/evaluation/comparisons.ipynb b/docs/extras/guides/evaluation/comparisons.ipynb index 5ee05161867..c59c5cb0bc6 100644 --- a/docs/extras/guides/evaluation/comparisons.ipynb +++ b/docs/extras/guides/evaluation/comparisons.ipynb @@ -243,8 +243,8 @@ " pred_a, pred_b = res_b, res_a\n", " a, b = \"b\", \"a\"\n", " eval_res = eval_chain.evaluate_string_pairs(\n", - " output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n", - " output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n", + " prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n", + " prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n", " input=input_\n", " )\n", " if eval_res[\"value\"] == \"A\":\n", diff --git a/langchain/evaluation/agents/trajectory_eval_chain.py b/langchain/evaluation/agents/trajectory_eval_chain.py index 8d4f837def4..184bcbfcee4 100644 --- a/langchain/evaluation/agents/trajectory_eval_chain.py +++ b/langchain/evaluation/agents/trajectory_eval_chain.py @@ -105,7 +105,7 @@ class TrajectoryEvalChain(Chain): result = eval_chain.evaluate_agent_trajectory( input=question, agent_trajectory=response["intermediate_steps"], - output=response["output"], + prediction=response["output"], reference="Paris", ) print(result["score"]) @@ -325,9 +325,9 @@ The following is the expected answer. Use this to measure correctness: def evaluate_agent_trajectory( self, *, + prediction: str, input: str, agent_trajectory: Union[str, List[Tuple[AgentAction, str]]], - output: str, reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, @@ -338,7 +338,7 @@ The following is the expected answer. Use this to measure correctness: input (str): The input question. agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]): The intermediate steps forming the agent trajectory. - output (str): The expected output. + prediction (str): The expected prediction. reference (Optional[str]): The reference answer. Returns: @@ -347,7 +347,7 @@ The following is the expected answer. Use this to measure correctness: inputs = { "question": input, "agent_trajectory": self.get_agent_trajectory(agent_trajectory), - "answer": output, + "answer": prediction, "reference": self._format_reference(reference), } return self(inputs=inputs, callbacks=callbacks, **kwargs) @@ -355,9 +355,9 @@ The following is the expected answer. Use this to measure correctness: async def aevaluate_agent_trajectory( self, *, + prediction: str, input: str, agent_trajectory: Union[str, List[Tuple[AgentAction, str]]], - output: str, reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, @@ -368,7 +368,7 @@ The following is the expected answer. 
Use this to measure correctness: input (str): The input question. agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]): The intermediate steps forming the agent trajectory. - output (str): The expected output. + prediction (str): The expected prediction. reference (Optional[str]): The reference answer. Returns: @@ -377,7 +377,7 @@ The following is the expected answer. Use this to measure correctness: inputs = { "question": input, "agent_trajectory": self.get_agent_trajectory(agent_trajectory), - "answer": output, + "answer": prediction, "reference": self._format_reference(reference), } return await self.acall( diff --git a/langchain/evaluation/comparison/__init__.py b/langchain/evaluation/comparison/__init__.py index 3d84c8a267f..50cd5156896 100644 --- a/langchain/evaluation/comparison/__init__.py +++ b/langchain/evaluation/comparison/__init__.py @@ -12,8 +12,8 @@ Example: >>> chain = PairwiseStringEvalChain.from_llm(llm=llm) >>> result = chain.evaluate_string_pairs( ... input = "What is the chemical formula for water?", - ... output_a = "H2O", - ... output_b = ( + ... prediction = "H2O", + ... prediction_b = ( ... "The chemical formula for water is H2O, which means" ... " there are two hydrogen atoms and one oxygen atom." ... referenc = "The chemical formula for water is H2O.", diff --git a/langchain/evaluation/comparison/eval_chain.py b/langchain/evaluation/comparison/eval_chain.py index f8f13605877..3c3f4f666bf 100644 --- a/langchain/evaluation/comparison/eval_chain.py +++ b/langchain/evaluation/comparison/eval_chain.py @@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain): >>> chain = PairwiseStringEvalChain.from_llm(llm=llm) >>> result = chain.evaluate_string_pairs( ... input = "What is the chemical formula for water?", - ... output_a = "H2O", - ... output_b = ( + ... prediction = "H2O", + ... prediction_b = ( ... "The chemical formula for water is H2O, which means" ... " there are two hydrogen atoms and one oxygen atom." ... referenc = "The chemical formula for water is H2O.", @@ -101,7 +101,7 @@ class PairwiseStringEvalChain(LLMChain): Returns: PairwiseStringEvalChain: The initialized PairwiseStringEvalChain. """ - expected_input_vars = {"output_a", "output_b", "input"} + expected_input_vars = {"prediction", "prediction_b", "input"} if prompt is None: if require_reference: expected_input_vars.add("reference") @@ -121,11 +121,11 @@ class PairwiseStringEvalChain(LLMChain): return cls(llm=llm, prompt=prompt_, **kwargs) def _prepare_input( - self, output_a: str, output_b: str, input: str, reference: Optional[str] + self, prediction: str, prediction_b: str, input: str, reference: Optional[str] ) -> dict: input_ = { - "output_a": output_a, - "output_b": output_b, + "prediction": prediction, + "prediction_b": prediction_b, "input": input, } if reference is not None and "reference" in self.prompt.input_variables: @@ -135,8 +135,8 @@ class PairwiseStringEvalChain(LLMChain): def evaluate_string_pairs( self, *, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, input: str, reference: Optional[str] = None, callbacks: Callbacks = None, @@ -145,8 +145,8 @@ class PairwiseStringEvalChain(LLMChain): """Evaluate whether output A is preferred to output B. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. input (str): The input or task string. 
callbacks (Callbacks, optional): The callbacks to use. reference (str, optional): The reference string, if any. @@ -160,7 +160,7 @@ class PairwiseStringEvalChain(LLMChain): - score: The preference score, which is 1 for 'A', 0 for 'B', and 0.5 for None. """ - input_ = self._prepare_input(output_a, output_b, input, reference) + input_ = self._prepare_input(prediction, prediction_b, input, reference) result = self( inputs=input_, callbacks=callbacks, @@ -171,8 +171,8 @@ class PairwiseStringEvalChain(LLMChain): async def aevaluate_string_pairs( self, *, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, input: str, reference: Optional[str] = None, callbacks: Callbacks = None, @@ -181,8 +181,8 @@ class PairwiseStringEvalChain(LLMChain): """Asynchronously evaluate whether output A is preferred to output B. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. input (str): The input or task string. callbacks (Callbacks, optional): The callbacks to use. reference (str, optional): The reference string, if any. @@ -196,7 +196,7 @@ class PairwiseStringEvalChain(LLMChain): - score: The preference score, which is 1 for 'A', 0 for 'B', and 0.5 for None. """ - input_ = self._prepare_input(output_a, output_b, input, reference) + input_ = self._prepare_input(prediction, prediction_b, input, reference) result = await self.acall( inputs=input_, callbacks=callbacks, diff --git a/langchain/evaluation/comparison/prompt.py b/langchain/evaluation/comparison/prompt.py index 15f9b60569a..f5d9846495c 100644 --- a/langchain/evaluation/comparison/prompt.py +++ b/langchain/evaluation/comparison/prompt.py @@ -21,14 +21,14 @@ After giving your rationale, make your final decision using this format:\ [/QUESTION] [RESPONSE A] -{output_a} +{prediction} [/RESPONSE A] [RESPONSE B] -{output_b} +{prediction_b} [/RESPONSE B]""" PROMPT = PromptTemplate( - input_variables=["input", "output_a", "output_b"], template=template + input_variables=["input", "prediction", "prediction_b"], template=template ) template = """Act as a fair judge and rate the two responses to the question below.\ @@ -52,13 +52,14 @@ After giving your rationale, make your final decision using this format:\ [/QUESTION] [RESPONSE A] -{output_a} +{prediction} [/RESPONSE A] [RESPONSE B] -{output_b} +{prediction_b} [/RESPONSE B]""" PROMPT_WITH_REFERENCE = PromptTemplate( - input_variables=["input", "output_a", "output_b", "reference"], template=template + input_variables=["input", "prediction", "prediction_b", "reference"], + template=template, ) diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py index 25489eebb0a..8c3362088ed 100644 --- a/langchain/evaluation/schema.py +++ b/langchain/evaluation/schema.py @@ -62,8 +62,8 @@ class PairwiseStringEvaluator(Protocol): def evaluate_string_pairs( self, *, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, reference: Optional[str] = None, input: Optional[str] = None, **kwargs: Any, @@ -71,8 +71,8 @@ class PairwiseStringEvaluator(Protocol): """Evaluate the output string pairs. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. 
reference (str, optional): The expected output / reference string. Defaults to None. input (str, optional): The input string. Defaults to None. @@ -86,8 +86,8 @@ class PairwiseStringEvaluator(Protocol): async def aevaluate_string_pairs( self, - output_a: str, - output_b: str, + prediction: str, + prediction_b: str, reference: Optional[str] = None, input: Optional[str] = None, **kwargs: Any, @@ -95,8 +95,8 @@ class PairwiseStringEvaluator(Protocol): """Evaluate the output string pairs. Args: - output_a (str): The output string from the first model. - output_b (str): The output string from the second model. + prediction (str): The output string from the first model. + prediction_b (str): The output string from the second model. reference (str, optional): The expected output / reference string. Defaults to None. input (str, optional): The input string. Defaults to None. diff --git a/tests/unit_tests/evaluation/agents/test_eval_chain.py b/tests/unit_tests/evaluation/agents/test_eval_chain.py index 3a82f073dc2..59fa3de0173 100644 --- a/tests/unit_tests/evaluation/agents/test_eval_chain.py +++ b/tests/unit_tests/evaluation/agents/test_eval_chain.py @@ -45,14 +45,14 @@ def test_trajectory_eval_chain( res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", ) assert res["score"] == 5 # Test when ref is provided res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", reference="Paris", ) assert res["score"] == 1 @@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools( res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", ) assert res["score"] == 5 res = chain.evaluate_agent_trajectory( input="What is your favorite food?", agent_trajectory=intermediate_steps, - output="I like pie.", + prediction="I like pie.", reference="Paris", ) assert res["score"] == 1 diff --git a/tests/unit_tests/evaluation/comparison/test_eval_chain.py b/tests/unit_tests/evaluation/comparison/test_eval_chain.py index 9cf4ca8c670..4a96b43e18c 100644 --- a/tests/unit_tests/evaluation/comparison/test_eval_chain.py +++ b/tests/unit_tests/evaluation/comparison/test_eval_chain.py @@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None: ) chain = PairwiseStringEvalChain.from_llm(llm=llm) res = chain.evaluate_string_pairs( - output_a="I like pie.", - output_b="I love pie.", + prediction="I like pie.", + prediction_b="I love pie.", input="What is your favorite food?", ) assert res["value"] is None assert res["score"] == 0.5 assert res["reasoning"] == "The values are the same." res = chain.evaluate_string_pairs( - output_a="I like pie.", - output_b="I like pie.", + prediction="I like pie.", + prediction_b="I like pie.", input="What is your favorite food?", ) assert res["value"] == "A" assert res["score"] == 1 res = chain.evaluate_string_pairs( - output_a="I like pie.", - output_b="I hate pie.", + prediction="I like pie.", + prediction_b="I hate pie.", input="What is your favorite food?", ) assert res["value"] == "B"
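
A minimal usage sketch of the renamed keyword arguments, mirroring the updated docstring example above (this sketch is not part of the patch itself and assumes an OpenAI API key is configured in the environment):

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.comparison import PairwiseStringEvalChain

    llm = ChatOpenAI(temperature=0)
    chain = PairwiseStringEvalChain.from_llm(llm=llm)
    # After this change, the first response goes in `prediction` and the
    # second in `prediction_b` (previously `output_a` / `output_b`).
    result = chain.evaluate_string_pairs(
        input="What is the chemical formula for water?",
        prediction="H2O",
        prediction_b=(
            "The chemical formula for water is H2O, which means"
            " there are two hydrogen atoms and one oxygen atom."
        ),
        reference="The chemical formula for water is H2O.",
    )
    print(result["value"], result["score"], result["reasoning"])

With the default prompt the optional reference is ignored; passing require_reference=True to from_llm selects the reference-aware prompt so the reference string is included in the comparison.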