Simplify eval arg names (#6944)

It'll be easier to switch between these evaluators if the prediction argument names are consistent.
William FH
2023-06-30 07:47:53 -07:00
committed by GitHub
parent 8f5eca236f
commit 8c73037dff
8 changed files with 52 additions and 51 deletions

View File

@@ -243,8 +243,8 @@
 "        pred_a, pred_b = res_b, res_a\n",
 "        a, b = \"b\", \"a\"\n",
 "    eval_res = eval_chain.evaluate_string_pairs(\n",
-"        output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
-"        output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
+"        prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
+"        prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
 "        input=input_\n",
 "    )\n",
 "    if eval_res[\"value\"] == \"A\":\n",

View File

@@ -105,7 +105,7 @@ class TrajectoryEvalChain(Chain):
         result = eval_chain.evaluate_agent_trajectory(
             input=question,
             agent_trajectory=response["intermediate_steps"],
-            output=response["output"],
+            prediction=response["output"],
             reference="Paris",
         )
         print(result["score"])
@@ -325,9 +325,9 @@ The following is the expected answer. Use this to measure correctness:
     def evaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -338,7 +338,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -347,7 +347,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return self(inputs=inputs, callbacks=callbacks, **kwargs)
@@ -355,9 +355,9 @@ The following is the expected answer. Use this to measure correctness:
     async def aevaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -368,7 +368,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -377,7 +377,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return await self.acall(
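
Putting the renamed trajectory API together, a minimal end-to-end sketch: the agent itself is assumed (any AgentExecutor built with return_intermediate_steps=True), and the from_llm factory is assumed to match the docstring usage above.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.agents import TrajectoryEvalChain

llm = ChatOpenAI(temperature=0)  # any chat model can serve as the judge
eval_chain = TrajectoryEvalChain.from_llm(llm=llm)

# `agent` is assumed: an AgentExecutor with return_intermediate_steps=True.
response = agent("What is the capital of France?")
result = eval_chain.evaluate_agent_trajectory(
    input="What is the capital of France?",
    agent_trajectory=response["intermediate_steps"],
    prediction=response["output"],  # previously: output=...
    reference="Paris",
)
print(result["score"])  # trajectory grade, 1-5 as exercised in the tests below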

View File

@@ -12,8 +12,8 @@ Example:
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm) >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs( >>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?", ... input = "What is the chemical formula for water?",
... output_a = "H2O", ... prediction = "H2O",
... output_b = ( ... prediction_b = (
... "The chemical formula for water is H2O, which means" ... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom." ... " there are two hydrogen atoms and one oxygen atom."
... referenc = "The chemical formula for water is H2O.", ... referenc = "The chemical formula for water is H2O.",
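
Two pre-existing defects in this docstring example survive the rename unchanged: `referenc` is a typo for `reference`, and the parenthesized `prediction_b` string is never closed. Untangled, the intended call would read:

result = chain.evaluate_string_pairs(
    input="What is the chemical formula for water?",
    prediction="H2O",
    prediction_b=(
        "The chemical formula for water is H2O, which means"
        " there are two hydrogen atoms and one oxygen atom."
    ),
    reference="The chemical formula for water is H2O.",
)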

View File

@@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain):
     >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
     >>> result = chain.evaluate_string_pairs(
     ...     input = "What is the chemical formula for water?",
-    ...     output_a = "H2O",
-    ...     output_b = (
+    ...     prediction = "H2O",
+    ...     prediction_b = (
     ...         "The chemical formula for water is H2O, which means"
     ...         " there are two hydrogen atoms and one oxygen atom."
     ...     referenc = "The chemical formula for water is H2O.",
@@ -101,7 +101,7 @@ class PairwiseStringEvalChain(LLMChain):
         Returns:
             PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
         """
-        expected_input_vars = {"output_a", "output_b", "input"}
+        expected_input_vars = {"prediction", "prediction_b", "input"}
         if prompt is None:
             if require_reference:
                 expected_input_vars.add("reference")
@@ -121,11 +121,11 @@ class PairwiseStringEvalChain(LLMChain):
         return cls(llm=llm, prompt=prompt_, **kwargs)

     def _prepare_input(
-        self, output_a: str, output_b: str, input: str, reference: Optional[str]
+        self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
     ) -> dict:
         input_ = {
-            "output_a": output_a,
-            "output_b": output_b,
+            "prediction": prediction,
+            "prediction_b": prediction_b,
             "input": input,
         }
         if reference is not None and "reference" in self.prompt.input_variables:
@@ -135,8 +135,8 @@ class PairwiseStringEvalChain(LLMChain):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -145,8 +145,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Evaluate whether output A is preferred to output B.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -160,7 +160,7 @@ class PairwiseStringEvalChain(LLMChain):
                 - score: The preference score, which is 1 for 'A', 0 for 'B',
                     and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = self(
             inputs=input_,
             callbacks=callbacks,
@@ -171,8 +171,8 @@ class PairwiseStringEvalChain(LLMChain):
     async def aevaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -181,8 +181,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Asynchronously evaluate whether output A is preferred to output B.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -196,7 +196,7 @@ class PairwiseStringEvalChain(LLMChain):
                 - score: The preference score, which is 1 for 'A', 0 for 'B',
                     and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = await self.acall(
             inputs=input_,
             callbacks=callbacks,
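
As a quick reference for the result shape documented above, a hedged usage sketch (llm is assumed to be any chat model):

chain = PairwiseStringEvalChain.from_llm(llm=llm)
res = chain.evaluate_string_pairs(
    prediction="I like pie.",
    prediction_b="I love pie.",
    input="What is your favorite food?",
)
# Per the docstrings above:
#   res["value"]     -> "A", "B", or None for a tie
#   res["score"]     -> 1 for "A", 0 for "B", 0.5 for a tie
#   res["reasoning"] -> the judge model's rationale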

View File

@@ -21,14 +21,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT = PromptTemplate(
-    input_variables=["input", "output_a", "output_b"], template=template
+    input_variables=["input", "prediction", "prediction_b"], template=template
 )

 template = """Act as a fair judge and rate the two responses to the question below.\
@@ -52,13 +52,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "output_a", "output_b", "reference"], template=template
+    input_variables=["input", "prediction", "prediction_b", "reference"],
+    template=template,
 )
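
Since the template variables changed too, a quick sanity check (not part of the diff) is to format the prompt by hand with the new names:

text = PROMPT.format(
    input="What is your favorite food?",
    prediction="I like pie.",
    prediction_b="I love pie.",
)
# The [RESPONSE A] and [RESPONSE B] blocks are filled from
# prediction and prediction_b respectively.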

View File

@@ -62,8 +62,8 @@ class PairwiseStringEvaluator(Protocol):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -71,8 +71,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.
@@ -86,8 +86,8 @@ class PairwiseStringEvaluator(Protocol):
     async def aevaluate_string_pairs(
         self,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -95,8 +95,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.
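
Custom comparators that target this protocol must adopt the renamed keywords as well. A toy implementation, for illustration only (exact-match against the reference; the class name is hypothetical and not from the diff):

from typing import Any, Optional

class ExactMatchPairwiseEvaluator:
    """Illustrative PairwiseStringEvaluator: prefers whichever
    prediction matches the reference exactly; otherwise a tie."""

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        a_hit = reference is not None and prediction == reference
        b_hit = reference is not None and prediction_b == reference
        if a_hit and not b_hit:
            return {"value": "A", "score": 1.0}
        if b_hit and not a_hit:
            return {"value": "B", "score": 0.0}
        return {"value": None, "score": 0.5}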

View File

@@ -45,14 +45,14 @@ def test_trajectory_eval_chain(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     # Test when ref is provided
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1
@@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1

View File

@@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None:
) )
chain = PairwiseStringEvalChain.from_llm(llm=llm) chain = PairwiseStringEvalChain.from_llm(llm=llm)
res = chain.evaluate_string_pairs( res = chain.evaluate_string_pairs(
output_a="I like pie.", prediction="I like pie.",
output_b="I love pie.", prediction_b="I love pie.",
input="What is your favorite food?", input="What is your favorite food?",
) )
assert res["value"] is None assert res["value"] is None
assert res["score"] == 0.5 assert res["score"] == 0.5
assert res["reasoning"] == "The values are the same." assert res["reasoning"] == "The values are the same."
res = chain.evaluate_string_pairs( res = chain.evaluate_string_pairs(
output_a="I like pie.", prediction="I like pie.",
output_b="I like pie.", prediction_b="I like pie.",
input="What is your favorite food?", input="What is your favorite food?",
) )
assert res["value"] == "A" assert res["value"] == "A"
assert res["score"] == 1 assert res["score"] == 1
res = chain.evaluate_string_pairs( res = chain.evaluate_string_pairs(
output_a="I like pie.", prediction="I like pie.",
output_b="I hate pie.", prediction_b="I hate pie.",
input="What is your favorite food?", input="What is your favorite food?",
) )
assert res["value"] == "B" assert res["value"] == "B"