Normalize Option in Scoring Chain (#11412)

@@ -77,6 +77,10 @@ from langchain.evaluation.schema import (
     PairwiseStringEvaluator,
     StringEvaluator,
 )
+from langchain.evaluation.scoring import (
+    LabeledScoreStringEvalChain,
+    ScoreStringEvalChain,
+)
 from langchain.evaluation.string_distance import (
     PairwiseStringDistanceEvalChain,
     StringDistance,
@@ -108,4 +112,6 @@ __all__ = [
     "load_evaluator",
     "load_dataset",
     "AgentTrajectoryEvaluator",
+    "ScoreStringEvalChain",
+    "LabeledScoreStringEvalChain",
 ]
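With the two chains re-exported above, they can be imported straight from the package root. A minimal sketch, assuming a LangChain build that includes this commit:

from langchain.evaluation import LabeledScoreStringEvalChain, ScoreStringEvalChain

# Both names now resolve via langchain.evaluation.__all__ rather than the
# deeper langchain.evaluation.scoring module path.
print(ScoreStringEvalChain, LabeledScoreStringEvalChain)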

@@ -173,6 +173,10 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
     output_parser: BaseOutputParser = Field(
         default_factory=ScoreStringResultOutputParser
     )
+    normalize_by: Optional[float] = None
+    """The value to normalize the score by, if specified."""
+    criterion_name: str
+    """The name of the criterion being evaluated."""
 
     class Config:
         """Configuration for the ScoreStringEvalChain."""
@@ -199,6 +203,17 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
         """
         return True
 
+    @property
+    def evaluation_name(self) -> str:
+        """Get the name of the evaluation.
+
+        Returns
+        -------
+        str
+            The name of the evaluation.
+        """
+        return f"score_string:{self.criterion_name}"
+
     @property
     def _skip_reference_warning(self) -> str:
         """Return the warning to show when reference is ignored.
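The new `evaluation_name` property derives the evaluator's name from `criterion_name`, which `from_llm` (changed below) sets from the resolved criteria keys. A rough usage sketch, assuming gpt-4 and a configured OpenAI API key (the scoring prompt is tuned for gpt-4):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.scoring import ScoreStringEvalChain

chain = ScoreStringEvalChain.from_llm(
    llm=ChatOpenAI(model="gpt-4"), criteria="helpfulness"
)
# With a single criterion key, criterion_name is "helpfulness".
print(chain.evaluation_name)  # -> "score_string:helpfulness"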

@@ -220,6 +235,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
         *,
         prompt: Optional[PromptTemplate] = None,
         criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
+        normalize_by: Optional[float] = None,
         **kwargs: Any,
     ) -> ScoreStringEvalChain:
         """Initialize the ScoreStringEvalChain from an LLM.
@@ -230,7 +246,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             **kwargs (Any): Additional keyword arguments.
 
         Returns:
-            PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
+            ScoreStringEvalChain: The initialized ScoreStringEvalChain.
 
         Raises:
             ValueError: If the input variables are not as expected.
@@ -253,11 +269,21 @@ Performance may be significantly worse with other models."
                 f"but got {prompt_.input_variables}"
             )
         criteria_ = resolve_criteria(criteria)
-        criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
+        criteria_str = "\n".join(
+            f"{k}: {v}" if v else k for k, v in criteria_.items()
+        ).strip()
         criteria_str = (
-            CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else DEFAULT_CRITERIA
+            CRITERIA_INSTRUCTIONS + f"{criteria_str}\n"
+            if criteria_str
+            else DEFAULT_CRITERIA
         )
-        return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
+        return cls(
+            llm=llm,
+            prompt=prompt_.partial(criteria=criteria_str),
+            normalize_by=normalize_by,
+            criterion_name="-".join(criteria_),
+            **kwargs,
+        )
 
     def _prepare_input(
         self,
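A usage sketch of the new `normalize_by` pass-through in `from_llm`; the model choice and example strings are illustrative, and an OpenAI API key is assumed:

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.scoring import ScoreStringEvalChain

llm = ChatOpenAI(model="gpt-4", temperature=0)
chain = ScoreStringEvalChain.from_llm(llm=llm, criteria="conciseness", normalize_by=10)
result = chain.evaluate_strings(
    prediction="Paris is the capital of France.",
    input="What is the capital of France?",
)
# "score" is the raw 1-10 rating divided by normalize_by, so it lands in (0, 1].
print(result["score"], result["reasoning"])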

@@ -290,6 +316,8 @@ Performance may be significantly worse with other models."
         parsed = result[self.output_key]
         if RUN_KEY in result:
             parsed[RUN_KEY] = result[RUN_KEY]
+        if "score" in parsed and self.normalize_by is not None:
+            parsed["score"] = parsed["score"] / self.normalize_by
         return parsed
 
     def _evaluate_strings(
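The division in `_prepare_output` above is where the option takes effect, and it is also reachable through `load_evaluator`, which forwards keyword arguments to `from_llm`. A hedged sketch (the default evaluation LLM is gpt-4, so an OpenAI API key is assumed):

from langchain.evaluation import EvaluatorType, load_evaluator

evaluator = load_evaluator(
    EvaluatorType.SCORE_STRING, criteria="helpfulness", normalize_by=10
)
result = evaluator.evaluate_strings(
    prediction="You can reset it from the account settings page.",
    input="How do I reset my password?",
)
# If the judge answers 'Rating: [[8]]', the parser extracts 8 and the
# normalization above returns 8 / 10 == 0.8.
print(result["score"])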

@@ -392,6 +420,7 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain):
         *,
         prompt: Optional[PromptTemplate] = None,
         criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
+        normalize_by: Optional[float] = None,
         **kwargs: Any,
     ) -> LabeledScoreStringEvalChain:
         """Initialize the LabeledScoreStringEvalChain from an LLM.
@@ -400,6 +429,7 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain):
             llm (BaseLanguageModel): The LLM to use.
             prompt (PromptTemplate, optional): The prompt to use.
             criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
+            normalize_by (float, optional): The value to normalize the score by.
             **kwargs (Any): Additional keyword arguments.
 
         Returns:
@@ -422,6 +452,16 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain):
                 f"but got {prompt_.input_variables}"
             )
         criteria_ = resolve_criteria(criteria)
-        criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
-        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
-        return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
+        criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items()).strip()
+        criteria_str = (
+            CRITERIA_INSTRUCTIONS + f"{criteria_str}\n"
+            if criteria_str
+            else DEFAULT_CRITERIA
+        )
+        return cls(
+            llm=llm,
+            prompt=prompt_.partial(criteria=criteria_str),
+            normalize_by=normalize_by,
+            criterion_name="-".join(criteria_),
+            **kwargs,
+        )
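The labeled variant gets the same treatment; it additionally grades against a reference answer. A parallel sketch under the same assumptions (gpt-4, OpenAI key, illustrative strings):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.scoring import LabeledScoreStringEvalChain

llm = ChatOpenAI(model="gpt-4", temperature=0)
chain = LabeledScoreStringEvalChain.from_llm(
    llm=llm, criteria="correctness", normalize_by=10
)
result = chain.evaluate_strings(
    prediction="The Treaty of Versailles was signed in 1919.",
    reference="1919",
    input="In what year was the Treaty of Versailles signed?",
)
print(result["score"])  # normalized to (0, 1]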

@@ -39,9 +39,10 @@ SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
         ("system", SYSTEM_MESSAGE),
         (
             "human",
-            '[Instruction]\nPlease act as an impartial judge \
+            "[Instruction]\nPlease act as an impartial judge \
 and evaluate the quality of the response provided by an AI \
-assistant to the user question displayed below. {criteria}{reference}Begin your evaluation \
+assistant to the user question displayed below. {criteria}"
+            '[Ground truth]\n{reference}\nBegin your evaluation \
 by providing a short explanation. Be as objective as possible. \
 After providing your explanation, you must rate the response on a scale of 1 to 10 \
 by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
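Splitting the string this way moves the reference out of the instruction sentence and under its own "[Ground truth]" heading. A rough sketch of the rendered human turn (the criteria text and example strings are illustrative; variable names are assumed to match the template):

from langchain.evaluation.scoring.prompt import SCORING_TEMPLATE_WITH_REFERENCE

messages = SCORING_TEMPLATE_WITH_REFERENCE.format_messages(
    criteria="helpfulness: Is the response helpful to the user?\n",
    reference="Paris",
    input="What is the capital of France?",
    prediction="The capital of France is Paris.",
)
# The human message now reads roughly:
# "... displayed below. <criteria>\n[Ground truth]\nParis\nBegin your evaluation ..."
print(messages[1].content)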

@@ -291,4 +291,40 @@ class RunEvalConfig(BaseModel):
         evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
         flags: int = 0
 
     # TODO: Trajectory
+    class ScoreString(EvalConfig):
+        """Configuration for a score string evaluator.
+        This is like the criteria evaluator but it is configured by
+        default to return a score on the scale from 1-10.
+
+        It is recommended to normalize these scores
+        by setting `normalize_by` to 10.
+
+        Parameters
+        ----------
+        criteria : Optional[CRITERIA_TYPE]
+            The criteria to evaluate.
+        llm : Optional[BaseLanguageModel]
+            The language model to use for the evaluation chain.
+        normalize_by: Optional[int] = None
+            If you want to normalize the score, the denominator to use.
+            If not provided, the score will be between 1 and 10 (by default).
+        prompt : Optional[BasePromptTemplate]
+
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.SCORE_STRING
+        criteria: Optional[CRITERIA_TYPE] = None
+        llm: Optional[BaseLanguageModel] = None
+        normalize_by: Optional[float] = None
+        prompt: Optional[BasePromptTemplate] = None
+
+        def __init__(
+            self,
+            criteria: Optional[CRITERIA_TYPE] = None,
+            normalize_by: Optional[float] = None,
+            **kwargs: Any
+        ) -> None:
+            super().__init__(criteria=criteria, normalize_by=normalize_by, **kwargs)
+
+    class LabeledScoreString(ScoreString):
+        evaluator_type: EvaluatorType = EvaluatorType.LABELED_SCORE_STRING
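Through these nested configs, the same normalization can be requested for LangSmith test runs. A sketch of how it might be wired up (dataset and chain factory are omitted; an eval LLM with an OpenAI key is assumed):

from langchain.chat_models import ChatOpenAI
from langchain.smith import RunEvalConfig

eval_config = RunEvalConfig(
    evaluators=[
        # Dividing by 10 maps the 1-10 rating onto (0, 1], as recommended above.
        RunEvalConfig.ScoreString(criteria="helpfulness", normalize_by=10),
        RunEvalConfig.LabeledScoreString(criteria="correctness", normalize_by=10),
    ],
    eval_llm=ChatOpenAI(model="gpt-4", temperature=0),
)
# eval_config would then be passed as evaluation=eval_config to run_on_dataset(...).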