Simplify eval arg names (#6944)
It will be easier to switch between evaluators if the prediction argument names are consistent, so this renames `output`, `output_a`, and `output_b` to `prediction` and `prediction_b` across the eval chains.
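For orientation, here is a minimal sketch of a call site after the rename. The import paths and the ChatOpenAI setup are assumptions based on the package layout at the time, not part of this diff:

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.comparison import PairwiseStringEvalChain

    # Compare two candidate answers to the same question (illustrative values).
    llm = ChatOpenAI(temperature=0)
    chain = PairwiseStringEvalChain.from_llm(llm=llm)
    result = chain.evaluate_string_pairs(
        input="What is the chemical formula for water?",
        prediction="H2O",                        # previously: output_a
        prediction_b="Water's formula is H2O.",  # previously: output_b
    )
    print(result["value"], result["score"])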
@@ -243,8 +243,8 @@
 " pred_a, pred_b = res_b, res_a\n",
 " a, b = \"b\", \"a\"\n",
 " eval_res = eval_chain.evaluate_string_pairs(\n",
-" output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
-" output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
+" prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
+" prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
 " input=input_\n",
 " )\n",
 " if eval_res[\"value\"] == \"A\":\n",
@@ -105,7 +105,7 @@ class TrajectoryEvalChain(Chain):
         result = eval_chain.evaluate_agent_trajectory(
             input=question,
             agent_trajectory=response["intermediate_steps"],
-            output=response["output"],
+            prediction=response["output"],
             reference="Paris",
         )
         print(result["score"])
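The docstring example above omits the agent setup; the sketch below fills it in under stated assumptions (the TrajectoryEvalChain import path, a from_llm that works without agent_tools as the no-tools test further down suggests, and a stand-in trajectory):

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.agents import TrajectoryEvalChain

    llm = ChatOpenAI(temperature=0)
    eval_chain = TrajectoryEvalChain.from_llm(llm=llm)
    result = eval_chain.evaluate_agent_trajectory(
        input="What is the capital of France?",
        agent_trajectory=[],  # normally response["intermediate_steps"]: (AgentAction, observation) pairs
        prediction="Paris",   # renamed from output= in this PR
        reference="Paris",
    )
    print(result["score"])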
@@ -325,9 +325,9 @@ The following is the expected answer. Use this to measure correctness:
     def evaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -338,7 +338,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -347,7 +347,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return self(inputs=inputs, callbacks=callbacks, **kwargs)
@@ -355,9 +355,9 @@ The following is the expected answer. Use this to measure correctness:
     async def aevaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -368,7 +368,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.

         Returns:
@@ -377,7 +377,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return await self.acall(
@@ -12,8 +12,8 @@ Example:
    >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
    >>> result = chain.evaluate_string_pairs(
    ...     input = "What is the chemical formula for water?",
-    ...     output_a = "H2O",
-    ...     output_b = (
+    ...     prediction = "H2O",
+    ...     prediction_b = (
    ...         "The chemical formula for water is H2O, which means"
    ...         " there are two hydrogen atoms and one oxygen atom."
    ...     referenc = "The chemical formula for water is H2O.",
@@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain):
        >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
        >>> result = chain.evaluate_string_pairs(
        ...     input = "What is the chemical formula for water?",
-        ...     output_a = "H2O",
-        ...     output_b = (
+        ...     prediction = "H2O",
+        ...     prediction_b = (
        ...         "The chemical formula for water is H2O, which means"
        ...         " there are two hydrogen atoms and one oxygen atom."
        ...     referenc = "The chemical formula for water is H2O.",
@@ -101,7 +101,7 @@ class PairwiseStringEvalChain(LLMChain):
        Returns:
            PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
        """
-        expected_input_vars = {"output_a", "output_b", "input"}
+        expected_input_vars = {"prediction", "prediction_b", "input"}
        if prompt is None:
            if require_reference:
                expected_input_vars.add("reference")
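Because expected_input_vars now names prediction and prediction_b, a custom prompt passed to from_llm has to use the new variable names. A hedged sketch, assuming from_llm validates a user-supplied prompt against these variables as the surrounding code suggests:

    from langchain.prompts import PromptTemplate

    # Hypothetical custom judge prompt; variable names must match expected_input_vars.
    custom_prompt = PromptTemplate(
        input_variables=["input", "prediction", "prediction_b"],
        template=(
            "Question: {input}\n"
            "Answer A: {prediction}\n"
            "Answer B: {prediction_b}\n"
            "Which answer is better? Reply with A or B."
        ),
    )
    # chain = PairwiseStringEvalChain.from_llm(llm=llm, prompt=custom_prompt)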
@@ -121,11 +121,11 @@ class PairwiseStringEvalChain(LLMChain):
        return cls(llm=llm, prompt=prompt_, **kwargs)

    def _prepare_input(
-        self, output_a: str, output_b: str, input: str, reference: Optional[str]
+        self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
    ) -> dict:
        input_ = {
-            "output_a": output_a,
-            "output_b": output_b,
+            "prediction": prediction,
+            "prediction_b": prediction_b,
            "input": input,
        }
        if reference is not None and "reference" in self.prompt.input_variables:
@@ -135,8 +135,8 @@ class PairwiseStringEvalChain(LLMChain):
    def evaluate_string_pairs(
        self,
        *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
        input: str,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
@@ -145,8 +145,8 @@ class PairwiseStringEvalChain(LLMChain):
        """Evaluate whether output A is preferred to output B.

        Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
            input (str): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
@@ -160,7 +160,7 @@ class PairwiseStringEvalChain(LLMChain):
            - score: The preference score, which is 1 for 'A', 0 for 'B',
                and 0.5 for None.
        """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
        result = self(
            inputs=input_,
            callbacks=callbacks,
@@ -171,8 +171,8 @@ class PairwiseStringEvalChain(LLMChain):
    async def aevaluate_string_pairs(
        self,
        *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
        input: str,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
@@ -181,8 +181,8 @@ class PairwiseStringEvalChain(LLMChain):
        """Asynchronously evaluate whether output A is preferred to output B.

        Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
            input (str): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
@@ -196,7 +196,7 @@ class PairwiseStringEvalChain(LLMChain):
            - score: The preference score, which is 1 for 'A', 0 for 'B',
                and 0.5 for None.
        """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
        result = await self.acall(
            inputs=input_,
            callbacks=callbacks,
@@ -21,14 +21,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT = PromptTemplate(
-    input_variables=["input", "output_a", "output_b"], template=template
+    input_variables=["input", "prediction", "prediction_b"], template=template
 )

 template = """Act as a fair judge and rate the two responses to the question below.\
@@ -52,13 +52,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]

 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]

 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""

 PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "output_a", "output_b", "reference"], template=template
+    input_variables=["input", "prediction", "prediction_b", "reference"],
+    template=template,
 )
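As a quick sanity check on the renamed template variables, a PromptTemplate declared this way formats cleanly only when every declared variable is supplied; the stand-in template below is illustrative, not the real judge prompt:

    from langchain.prompts import PromptTemplate

    stand_in = "Q: {input}\nA: {prediction}\nB: {prediction_b}\nRef: {reference}"
    prompt = PromptTemplate(
        input_variables=["input", "prediction", "prediction_b", "reference"],
        template=stand_in,
    )
    # A stale {output_a} left in a template would fail here (or earlier,
    # at construction, if template validation is enabled).
    print(prompt.format(input="2 + 2?", prediction="4", prediction_b="five", reference="4"))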
@@ -62,8 +62,8 @@ class PairwiseStringEvaluator(Protocol):
    def evaluate_string_pairs(
        self,
        *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
@@ -71,8 +71,8 @@ class PairwiseStringEvaluator(Protocol):
        """Evaluate the output string pairs.

        Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
            reference (str, optional): The expected output / reference
                string. Defaults to None.
            input (str, optional): The input string. Defaults to None.
@@ -86,8 +86,8 @@ class PairwiseStringEvaluator(Protocol):

    async def aevaluate_string_pairs(
        self,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
@@ -95,8 +95,8 @@ class PairwiseStringEvaluator(Protocol):
        """Evaluate the output string pairs.

        Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
            reference (str, optional): The expected output / reference
                string. Defaults to None.
            input (str, optional): The input string. Defaults to None.
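For illustration, a toy implementation of the renamed protocol using exact-match preference; this is not part of the diff, and its score convention follows the 1/0/0.5 scheme documented above:

    from typing import Any, Optional

    class ExactMatchPairwiseEvaluator:
        """Prefers whichever prediction exactly matches the reference."""

        def evaluate_string_pairs(
            self,
            *,
            prediction: str,
            prediction_b: str,
            reference: Optional[str] = None,
            input: Optional[str] = None,
            **kwargs: Any,
        ) -> dict:
            # Chained comparison: equals the reference AND differs from the other.
            if reference is not None and prediction == reference != prediction_b:
                return {"value": "A", "score": 1}
            if reference is not None and prediction_b == reference != prediction:
                return {"value": "B", "score": 0}
            return {"value": None, "score": 0.5}  # tie, or no reference given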
@@ -45,14 +45,14 @@ def test_trajectory_eval_chain(
    res = chain.evaluate_agent_trajectory(
        input="What is your favorite food?",
        agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
    )
    assert res["score"] == 5
    # Test when ref is provided
    res = chain.evaluate_agent_trajectory(
        input="What is your favorite food?",
        agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
        reference="Paris",
    )
    assert res["score"] == 1
@@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools(
    res = chain.evaluate_agent_trajectory(
        input="What is your favorite food?",
        agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
    )
    assert res["score"] == 5
    res = chain.evaluate_agent_trajectory(
        input="What is your favorite food?",
        agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
        reference="Paris",
    )
    assert res["score"] == 1
|
@@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None:
|
|||||||
)
|
)
|
||||||
chain = PairwiseStringEvalChain.from_llm(llm=llm)
|
chain = PairwiseStringEvalChain.from_llm(llm=llm)
|
||||||
res = chain.evaluate_string_pairs(
|
res = chain.evaluate_string_pairs(
|
||||||
output_a="I like pie.",
|
prediction="I like pie.",
|
||||||
output_b="I love pie.",
|
prediction_b="I love pie.",
|
||||||
input="What is your favorite food?",
|
input="What is your favorite food?",
|
||||||
)
|
)
|
||||||
assert res["value"] is None
|
assert res["value"] is None
|
||||||
assert res["score"] == 0.5
|
assert res["score"] == 0.5
|
||||||
assert res["reasoning"] == "The values are the same."
|
assert res["reasoning"] == "The values are the same."
|
||||||
res = chain.evaluate_string_pairs(
|
res = chain.evaluate_string_pairs(
|
||||||
output_a="I like pie.",
|
prediction="I like pie.",
|
||||||
output_b="I like pie.",
|
prediction_b="I like pie.",
|
||||||
input="What is your favorite food?",
|
input="What is your favorite food?",
|
||||||
)
|
)
|
||||||
assert res["value"] == "A"
|
assert res["value"] == "A"
|
||||||
assert res["score"] == 1
|
assert res["score"] == 1
|
||||||
res = chain.evaluate_string_pairs(
|
res = chain.evaluate_string_pairs(
|
||||||
output_a="I like pie.",
|
prediction="I like pie.",
|
||||||
output_b="I hate pie.",
|
prediction_b="I hate pie.",
|
||||||
input="What is your favorite food?",
|
input="What is your favorite food?",
|
||||||
)
|
)
|
||||||
assert res["value"] == "B"
|
assert res["value"] == "B"
|
||||||