Mirror of https://github.com/hwchase17/langchain.git, synced 2025-09-08 06:23:20 +00:00
Simplify eval arg names (#6944)
It will be easier to switch between these evaluators if the prediction argument names are consistent.
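Concretely, the pairwise string evaluator now takes prediction / prediction_b where it previously took output_a / output_b, and the trajectory evaluator takes prediction where it previously took output. A minimal sketch of the pairwise call after the rename, assuming an OpenAI-backed judge model; the import path for PairwiseStringEvalChain is inferred from the file touched below and is not shown by this diff:

# Sketch of the renamed pairwise API; the llm choice and import paths are assumptions.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.comparison import PairwiseStringEvalChain

llm = ChatOpenAI(temperature=0)  # placeholder judge model
chain = PairwiseStringEvalChain.from_llm(llm=llm)
res = chain.evaluate_string_pairs(
    prediction="H2O",              # was: output_a
    prediction_b="Water is H2O.",  # was: output_b
    input="What is the chemical formula for water?",
)
print(res["value"], res["score"])  # "A"/"B"/None and 1/0/0.5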
@@ -243,8 +243,8 @@
 " pred_a, pred_b = res_b, res_a\n",
 " a, b = \"b\", \"a\"\n",
 " eval_res = eval_chain.evaluate_string_pairs(\n",
-" output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
-" output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
+" prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
+" prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
 " input=input_\n",
 " )\n",
 " if eval_res[\"value\"] == \"A\":\n",

@@ -105,7 +105,7 @@ class TrajectoryEvalChain(Chain):
         result = eval_chain.evaluate_agent_trajectory(
             input=question,
             agent_trajectory=response["intermediate_steps"],
-            output=response["output"],
+            prediction=response["output"],
             reference="Paris",
         )
         print(result["score"])
@@ -325,9 +325,9 @@ The following is the expected answer. Use this to measure correctness:
     def evaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -338,7 +338,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -347,7 +347,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return self(inputs=inputs, callbacks=callbacks, **kwargs)
@@ -355,9 +355,9 @@ The following is the expected answer. Use this to measure correctness:
     async def aevaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -368,7 +368,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -377,7 +377,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return await self.acall(

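For callers of TrajectoryEvalChain, the only migration step is renaming the output keyword to prediction, as in the docstring example above. A minimal sketch, assuming eval_chain is an already-constructed TrajectoryEvalChain and question / response come from a prior agent run with "output" and "intermediate_steps" keys (all placeholders):

# Only the keyword name changes; chain construction is untouched by this commit.
result = eval_chain.evaluate_agent_trajectory(
    input=question,
    agent_trajectory=response["intermediate_steps"],
    prediction=response["output"],  # was: output=response["output"]
    reference="Paris",              # optional reference answer
)
print(result["score"])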
@@ -12,8 +12,8 @@ Example:
     >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
     >>> result = chain.evaluate_string_pairs(
     ...     input = "What is the chemical formula for water?",
-    ...     output_a = "H2O",
-    ...     output_b = (
+    ...     prediction = "H2O",
+    ...     prediction_b = (
     ...         "The chemical formula for water is H2O, which means"
     ...         " there are two hydrogen atoms and one oxygen atom."
     ...     referenc = "The chemical formula for water is H2O.",
@@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain):
     >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
     >>> result = chain.evaluate_string_pairs(
     ...     input = "What is the chemical formula for water?",
-    ...     output_a = "H2O",
-    ...     output_b = (
+    ...     prediction = "H2O",
+    ...     prediction_b = (
     ...         "The chemical formula for water is H2O, which means"
     ...         " there are two hydrogen atoms and one oxygen atom."
     ...     referenc = "The chemical formula for water is H2O.",
@@ -101,7 +101,7 @@ class PairwiseStringEvalChain(LLMChain):
         Returns:
             PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
         """
-        expected_input_vars = {"output_a", "output_b", "input"}
+        expected_input_vars = {"prediction", "prediction_b", "input"}
         if prompt is None:
             if require_reference:
                 expected_input_vars.add("reference")
@@ -121,11 +121,11 @@ class PairwiseStringEvalChain(LLMChain):
         return cls(llm=llm, prompt=prompt_, **kwargs)

     def _prepare_input(
-        self, output_a: str, output_b: str, input: str, reference: Optional[str]
+        self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
     ) -> dict:
         input_ = {
-            "output_a": output_a,
-            "output_b": output_b,
+            "prediction": prediction,
+            "prediction_b": prediction_b,
             "input": input,
         }
         if reference is not None and "reference" in self.prompt.input_variables:
@@ -135,8 +135,8 @@ class PairwiseStringEvalChain(LLMChain):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -145,8 +145,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Evaluate whether output A is preferred to output B.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -160,7 +160,7 @@ class PairwiseStringEvalChain(LLMChain):
             - score: The preference score, which is 1 for 'A', 0 for 'B',
               and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = self(
             inputs=input_,
             callbacks=callbacks,
@@ -171,8 +171,8 @@ class PairwiseStringEvalChain(LLMChain):
     async def aevaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -181,8 +181,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Asynchronously evaluate whether output A is preferred to output B.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -196,7 +196,7 @@ class PairwiseStringEvalChain(LLMChain):
             - score: The preference score, which is 1 for 'A', 0 for 'B',
               and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = await self.acall(
             inputs=input_,
             callbacks=callbacks,

@@ -21,14 +21,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT = PromptTemplate(
-    input_variables=["input", "output_a", "output_b"], template=template
+    input_variables=["input", "prediction", "prediction_b"], template=template
 )

 template = """Act as a fair judge and rate the two responses to the question below.\
@@ -52,13 +52,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""

 PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "output_a", "output_b", "reference"], template=template
+    input_variables=["input", "prediction", "prediction_b", "reference"],
+    template=template,
 )

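Since from_llm checks the prompt's input_variables against expected_input_vars, a custom prompt supplied to PairwiseStringEvalChain.from_llm must now declare prediction and prediction_b instead of output_a and output_b. A minimal sketch; the template wording and the prompt keyword of from_llm are assumptions based on the hunks above, not guaranteed by this diff:

from langchain.prompts import PromptTemplate

custom_template = """Which response to the question is better?

[QUESTION]
{input}
[/QUESTION]

[RESPONSE A]
{prediction}
[/RESPONSE A]

[RESPONSE B]
{prediction_b}
[/RESPONSE B]"""

custom_prompt = PromptTemplate(
    input_variables=["input", "prediction", "prediction_b"],  # must use the new names
    template=custom_template,
)
# chain = PairwiseStringEvalChain.from_llm(llm=llm, prompt=custom_prompt)  # assumed kwarg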
@@ -62,8 +62,8 @@ class PairwiseStringEvaluator(Protocol):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -71,8 +71,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.
@@ -86,8 +86,8 @@ class PairwiseStringEvaluator(Protocol):

     async def aevaluate_string_pairs(
         self,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -95,8 +95,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.

         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.

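Third-party evaluators that implement the PairwiseStringEvaluator protocol need the same keyword names to remain drop-in compatible. A minimal sketch with a toy exact-match scorer standing in for real judging logic (illustrative only; the dict return shape mirrors the value/score convention documented above):

from typing import Any, Optional


class ExactMatchPairwiseEvaluator:
    """Toy evaluator using the renamed PairwiseStringEvaluator keywords."""

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        # Prefer whichever prediction exactly matches the reference; otherwise tie.
        if reference is not None and prediction == reference != prediction_b:
            return {"value": "A", "score": 1}
        if reference is not None and prediction_b == reference != prediction:
            return {"value": "B", "score": 0}
        return {"value": None, "score": 0.5}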
@@ -45,14 +45,14 @@ def test_trajectory_eval_chain(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     # Test when ref is provided
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1
@@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1

@@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None:
     )
     chain = PairwiseStringEvalChain.from_llm(llm=llm)
     res = chain.evaluate_string_pairs(
-        output_a="I like pie.",
-        output_b="I love pie.",
+        prediction="I like pie.",
+        prediction_b="I love pie.",
         input="What is your favorite food?",
     )
     assert res["value"] is None
     assert res["score"] == 0.5
     assert res["reasoning"] == "The values are the same."
     res = chain.evaluate_string_pairs(
-        output_a="I like pie.",
-        output_b="I like pie.",
+        prediction="I like pie.",
+        prediction_b="I like pie.",
         input="What is your favorite food?",
     )
     assert res["value"] == "A"
     assert res["score"] == 1
     res = chain.evaluate_string_pairs(
-        output_a="I like pie.",
-        output_b="I hate pie.",
+        prediction="I like pie.",
+        prediction_b="I hate pie.",
         input="What is your favorite food?",
     )
     assert res["value"] == "B"