Normalize Option in Scoring Chain (#11412)

William FH
2023-10-04 15:59:28 -07:00
committed by GitHub
parent b9fad28f5e
commit 940b9ae30a
6 changed files with 1088 additions and 818 deletions

View File

@@ -77,6 +77,10 @@ from langchain.evaluation.schema import (
PairwiseStringEvaluator,
StringEvaluator,
)
from langchain.evaluation.scoring import (
LabeledScoreStringEvalChain,
ScoreStringEvalChain,
)
from langchain.evaluation.string_distance import (
PairwiseStringDistanceEvalChain,
StringDistance,
@@ -108,4 +112,6 @@ __all__ = [
"load_evaluator",
"load_dataset",
"AgentTrajectoryEvaluator",
"ScoreStringEvalChain",
"LabeledScoreStringEvalChain",
]
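
With the new exports in place, both chains are importable straight from `langchain.evaluation`. A minimal sketch of the two entry points (the model and criteria here are illustrative, and an OpenAI API key is assumed):

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import ScoreStringEvalChain, load_evaluator

llm = ChatOpenAI(model="gpt-4", temperature=0)  # assumes OPENAI_API_KEY is set

# Construct the chain directly from the newly exported class...
chain = ScoreStringEvalChain.from_llm(llm=llm, criteria="helpfulness")

# ...or go through the loader, which forwards extra kwargs to from_llm.
evaluator = load_evaluator("score_string", llm=llm, criteria="helpfulness")
```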

View File

@@ -173,6 +173,10 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
output_parser: BaseOutputParser = Field(
default_factory=ScoreStringResultOutputParser
)
normalize_by: Optional[float] = None
"""The value to normalize the score by, if specified."""
criterion_name: str
"""The name of the criterion being evaluated."""
class Config:
"""Configuration for the ScoreStringEvalChain."""
@@ -199,6 +203,17 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
"""
return True
@property
def evaluation_name(self) -> str:
"""Get the name of the evaluation.
Returns
-------
str
The name of the evaluation.
"""
return f"score_string:{self.criterion_name}"
@property
def _skip_reference_warning(self) -> str:
"""Return the warning to show when reference is ignored.
@@ -220,6 +235,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
*,
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
normalize_by: Optional[float] = None,
**kwargs: Any,
) -> ScoreStringEvalChain:
"""Initialize the ScoreStringEvalChain from an LLM.
@@ -230,7 +246,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
**kwargs (Any): Additional keyword arguments.
Returns:
- PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
+ ScoreStringEvalChain: The initialized ScoreStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
@@ -253,11 +269,21 @@ Performance may be significantly worse with other models."
f"but got {prompt_.input_variables}"
)
criteria_ = resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
criteria_str = "\n".join(
f"{k}: {v}" if v else k for k, v in criteria_.items()
).strip()
criteria_str = (
CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else DEFAULT_CRITERIA
CRITERIA_INSTRUCTIONS + f"{criteria_str}\n"
if criteria_str
else DEFAULT_CRITERIA
)
return cls(
llm=llm,
prompt=prompt_.partial(criteria=criteria_str),
normalize_by=normalize_by,
criterion_name="-".join(criteria_),
**kwargs,
)
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
def _prepare_input(
self,
@@ -290,6 +316,8 @@ Performance may be significantly worse with other models."
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
if "score" in parsed and self.normalize_by is not None:
parsed["score"] = parsed["score"] / self.normalize_by
return parsed
def _evaluate_strings(
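
The normalization itself is a guarded division over the parsed result. A self-contained sketch of the arithmetic, using a made-up payload of the shape the output parser returns:

```python
from typing import Optional

# Made-up parsed result with the {"reasoning": ..., "score": ...} shape.
parsed = {"reasoning": "Accurate and concise.", "score": 8}
normalize_by: Optional[float] = 10.0

# Mirrors the division added above: a raw 1-10 rating of 8 becomes 0.8
# when normalize_by=10; with normalize_by=None the score is left untouched.
if "score" in parsed and normalize_by is not None:
    parsed["score"] = parsed["score"] / normalize_by

assert parsed["score"] == 0.8
```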
@@ -392,6 +420,7 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain):
*,
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
normalize_by: Optional[float] = None,
**kwargs: Any,
) -> LabeledScoreStringEvalChain:
"""Initialize the LabeledScoreStringEvalChain from an LLM.
@@ -400,6 +429,7 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain):
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
normalize_by (float, optional): The value to normalize the score by.
**kwargs (Any): Additional keyword arguments.
Returns:
@@ -422,6 +452,16 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain):
f"but got {prompt_.input_variables}"
)
criteria_ = resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items()).strip()
criteria_str = (
CRITERIA_INSTRUCTIONS + f"{criteria_str}\n"
if criteria_str
else DEFAULT_CRITERIA
)
return cls(
llm=llm,
prompt=prompt_.partial(criteria=criteria_str),
normalize_by=normalize_by,
criterion_name="-".join(criteria_),
**kwargs,
)
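
Putting the pieces together, a sketch of the labeled variant with normalization enabled; the prediction, reference, and input strings are invented, and an OpenAI API key is assumed:

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(
    "labeled_score_string",
    llm=ChatOpenAI(model="gpt-4"),  # assumes OPENAI_API_KEY is set
    criteria="correctness",
    normalize_by=10,
)
result = evaluator.evaluate_strings(
    prediction="You can change it under Profile > Privacy.",
    reference="Go to Profile > Privacy to change the setting.",
    input="How do I update my privacy settings?",
)
# "score" is reported on a 0-1 scale instead of the raw 1-10 rating.
print(result["score"], result["reasoning"])
```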

View File

@@ -39,9 +39,10 @@ SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
("system", SYSTEM_MESSAGE),
(
"human",
- '[Instruction]\nPlease act as an impartial judge \
+ "[Instruction]\nPlease act as an impartial judge \
and evaluate the quality of the response provided by an AI \
- assistant to the user question displayed below. {criteria}{reference}Begin your evaluation \
+ assistant to the user question displayed below. {criteria}"
+ '[Ground truth]\n{reference}\nBegin your evaluation \
by providing a short explanation. Be as objective as possible. \
After providing your explanation, you must rate the response on a scale of 1 to 10 \
by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
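
Note that the revised template relies on Python's implicit concatenation of adjacent string literals: the `{criteria}` piece and the new `[Ground truth]` piece are separate quoted strings in the source but compile to a single human message. A toy reduction of that pattern:

```python
# Adjacent string literals are concatenated at compile time, so the two
# quoted pieces below form one template string containing both placeholders.
template = (
    "Evaluate the response below. {criteria}"
    "[Ground truth]\n{reference}\nBegin your evaluation."
)
print(template.format(criteria="Accuracy: is the answer correct?\n", reference="Paris"))
```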

View File

@@ -291,4 +291,40 @@ class RunEvalConfig(BaseModel):
evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
flags: int = 0
# TODO: Trajectory
class ScoreString(EvalConfig):
"""Configuration for a score string evaluator.
This is like the criteria evaluator but it is configured by
default to return a score on the scale from 1-10.
It is recommended to normalize these scores
by setting `normalize_by` to 10.
Parameters
----------
criteria : Optional[CRITERIA_TYPE]
The criteria to evaluate.
llm : Optional[BaseLanguageModel]
The language model to use for the evaluation chain.
normalize_by: Optional[int] = None
If you want to normalize the score, the denominator to use.
If not provided, the score will be between 1 and 10 (by default).
prompt : Optional[BasePromptTemplate]
"""
evaluator_type: EvaluatorType = EvaluatorType.SCORE_STRING
criteria: Optional[CRITERIA_TYPE] = None
llm: Optional[BaseLanguageModel] = None
normalize_by: Optional[float] = None
prompt: Optional[BasePromptTemplate] = None
def __init__(
self,
criteria: Optional[CRITERIA_TYPE] = None,
normalize_by: Optional[float] = None,
**kwargs: Any
) -> None:
super().__init__(criteria=criteria, normalize_by=normalize_by, **kwargs)
class LabeledScoreString(ScoreString):
evaluator_type: EvaluatorType = EvaluatorType.LABELED_SCORE_STRING
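
Finally, a sketch of wiring the new config classes into a LangSmith test run; the dataset name is a placeholder, and OpenAI plus LangSmith credentials are assumed:

```python
from langchain.chat_models import ChatOpenAI
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith import Client

eval_config = RunEvalConfig(
    evaluators=[
        # Scores on the default 1-10 scale, rescaled to 0-1 via normalize_by.
        RunEvalConfig.ScoreString(criteria="helpfulness", normalize_by=10),
    ]
)
run_on_dataset(
    client=Client(),
    dataset_name="my-qa-dataset",  # placeholder dataset
    llm_or_chain_factory=ChatOpenAI(model="gpt-4", temperature=0),
    evaluation=eval_config,
)
```

`LabeledScoreString` is used the same way when the dataset examples carry reference outputs to grade against.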