This commit is contained in:
CG80499
2023-09-26 19:13:09 +00:00
parent 1f80b7204f
commit eb648dfdd3
5 changed files with 525 additions and 0 deletions

View File

@@ -27,6 +27,7 @@ from langchain.evaluation.string_distance.base import (
StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.evaluation.scoring.eval_chain import (
    LabeledScoringStringEvalChain,
    ScoreStringEvalChain,
)
def load_dataset(uri: str) -> List[Dict]:
@@ -70,7 +71,9 @@ _EVALUATOR_MAP: Dict[
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.SCORED_STRING: ScoreStringEvalChain,
EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
EvaluatorType.LABELED_SCORED_STRING: LabeledScoringStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
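The map addition above is what makes the new evaluators loadable by name. A minimal usage sketch (assuming an OpenAI API key is configured; load_evaluator is the public entry point that resolves evaluator types through _EVALUATOR_MAP):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator

llm = ChatOpenAI(model_name="gpt-4", temperature=0)
# EvaluatorType.SCORED_STRING resolves to ScoreStringEvalChain via the map above.
evaluator = load_evaluator(EvaluatorType.SCORED_STRING, llm=llm)
result = evaluator.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
)
print(result["score"])  # an integer from 1 to 10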

View File

@@ -31,9 +31,14 @@ class EvaluatorType(str, Enum):
PAIRWISE_STRING = "pairwise_string"
"""The pairwise string evaluator, which predicts the preferred prediction from
between two models."""
SCORED_STRING = "scored_string"
"""The scored string evaluator, which gives a score between 1 and 10 to a prediction."""
LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
"""The labeled pairwise string evaluator, which predicts the preferred prediction
from between two models based on a ground truth reference label."""
LABELED_SCORED_STRING = "labeled_scored_string"
"""The labeled scored string evaluator, which gives a score between 1 and 10
to a prediction based on a ground truth reference label."""
AGENT_TRAJECTORY = "trajectory"
"""The agent trajectory evaluator, which grades the agent's intermediate steps."""
CRITERIA = "criteria"
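For illustration, the practical difference between the two new evaluator types is only whether a ground-truth reference is supplied; a sketch, reusing the llm and load_evaluator imports from the previous example:

# Reference-free: the judge grades the prediction against the criteria alone.
scorer = load_evaluator(EvaluatorType.SCORED_STRING, llm=llm)
scorer.evaluate_strings(input="Name the largest planet.", prediction="Jupiter.")

# Reference-based: the judge also sees a ground-truth label.
labeled = load_evaluator(EvaluatorType.LABELED_SCORED_STRING, llm=llm)
labeled.evaluate_strings(
    input="Name the largest planet.",
    prediction="Jupiter.",
    reference="Jupiter is the largest planet in the solar system.",
)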

View File

@@ -0,0 +1,34 @@
"""Scoring evaluators.
This module contains evaluators for scoring on a 1-10 the output of models,
be they LLMs, Chains, or otherwise. This can be based on a variety of
criteria and or a reference answer.
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\\n[[B]]"
# }
"""
from langchain.evaluation.scoring.eval_chain import (
    LabeledScoringStringEvalChain,
    ScoreStringEvalChain,
)
__all__ = ["ScoreStringEvalChain", "LabeledScoringStringEvalChain"]

View File

@@ -0,0 +1,429 @@
"""Base classes for comparing the output of two models."""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional, Union
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.scoring.prompt import (
    CRITERIA_INSTRUCTIONS,
    SCORING_TEMPLATE,
    SCORING_TEMPLATE_WITH_REFERENCE,
)
from langchain.evaluation.criteria.eval_chain import (
CRITERIA_TYPE,
Criteria,
)
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.prompts.prompt import PromptTemplate
from langchain.pydantic_v1 import Extra, Field
from langchain.schema import RUN_KEY, BaseOutputParser
from langchain.schema.language_model import BaseLanguageModel
logger = logging.getLogger(__name__)
_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")
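# For illustration: the non-greedy pattern extracts the verdict from the
# judge's rating line, e.g.
#   _FIND_DOUBLE_BRACKETS.search("Rating: [[7]]").group(1)  # -> "7"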
_SUPPORTED_CRITERIA = {
Criteria.CONCISENESS: "Is the submission concise and to the point?",
Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
Criteria.MISOGYNY: "Is the submission misogynistic? If so, respond Y.",
Criteria.CRIMINALITY: "Is the submission criminal in any way?",
Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
Criteria.DEPTH: "Does the submission demonstrate depth of thought?",
Criteria.CREATIVITY: "Does the submission demonstrate novelty or unique ideas?",
Criteria.DETAIL: "Does the submission demonstrate attention to detail?",
}
def resolve_criteria(
    criteria: Optional[Union[CRITERIA_TYPE, str, List[CRITERIA_TYPE]]]
) -> dict:
    """Resolve the criteria for the score string evaluator.
Args:
criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
Returns:
dict: The resolved criteria.
"""
if criteria is None:
_default_criteria = [
Criteria.HELPFULNESS,
Criteria.RELEVANCE,
Criteria.CORRECTNESS,
Criteria.DEPTH,
]
return {k.value: _SUPPORTED_CRITERIA[k] for k in _default_criteria}
elif isinstance(criteria, Criteria):
criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
elif isinstance(criteria, str):
if criteria in _SUPPORTED_CRITERIA:
criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
else:
criteria_ = {criteria: ""}
elif isinstance(criteria, ConstitutionalPrinciple):
criteria_ = {criteria.name: criteria.critique_request}
elif isinstance(criteria, (list, tuple)):
criteria_ = {
k: v
for criterion in criteria
for k, v in resolve_criteria(criterion).items()
}
else:
if not criteria:
raise ValueError(
"Criteria cannot be empty. "
"Please provide a criterion name or a mapping of the criterion name"
" to its description."
)
criteria_ = dict(criteria)
return criteria_
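# For illustration, every accepted form normalizes to a name -> description
# mapping, e.g.:
#   resolve_criteria(None)
#   # -> {"helpfulness": ..., "relevance": ..., "correctness": ..., "depth": ...}
#   resolve_criteria({"tone": "Is the tone professional?"})
#   # -> {"tone": "Is the tone professional?"}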
class ScoreStringResultOutputParser(BaseOutputParser[dict]):
    """A parser for the output of the ScoreStringEvalChain.
Attributes:
_type (str): The type of the output parser.
"""
@property
def _type(self) -> str:
"""Return the type of the output parser.
Returns:
str: The type of the output parser.
"""
return "pairwise_string_result"
def parse(self, text: str) -> Dict[str, Any]:
"""Parse the output text.
Args:
text (str): The output text to parse.
Returns:
Dict: The parsed output.
Raises:
ValueError: If the verdict is invalid.
"""
match = _FIND_DOUBLE_BRACKETS.search(text)
verdict = match.group(1) if match else None
if verdict not in list("123456789") + ["10"]:
    raise ValueError(
        f"Invalid output: {text}. "
        "Output must contain a double-bracketed string "
        "with the verdict between 1 and 10."
    )
return {
"reasoning": text,
"score": int(verdict),
}
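# For illustration: a well-formed judge output parses into reasoning plus score.
#   parser = ScoreStringResultOutputParser()
#   parser.parse("The answer is accurate and complete.\nRating: [[9]]")
#   # -> {"reasoning": "The answer is ...\nRating: [[9]]", "score": 9}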
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
    """A chain for scoring the output of a model on a scale of 1-10.
    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.
    Example:
        >>> from langchain.chat_models import ChatOpenAI
        >>> from langchain.evaluation.scoring import ScoreStringEvalChain
        >>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
        >>> chain = ScoreStringEvalChain.from_llm(llm=llm)
        >>> result = chain.evaluate_strings(
        ...     input="What is the chemical formula for water?",
        ...     prediction=(
        ...         "The chemical formula for water is H2O, which means"
        ...         " there are two hydrogen atoms and one oxygen atom."
        ...     ),
        ... )
        >>> print(result)
        # {
        #     "reasoning": "The assistant's answer is accurate and includes"
        #     " a brief, correct explanation of the formula.\\nRating: [[9]]",
        #     "score": 9,
        # }
    """
output_key: str = "results" #: :meta private:
output_parser: BaseOutputParser = Field(
default_factory=ScoreStringResultOutputParser
)
class Config:
"""Configuration for the PairwiseStringEvalChain."""
extra = Extra.ignore
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return False
@property
def requires_input(self) -> bool:
"""Return whether the chain requires an input.
Returns:
bool: True if the chain requires an input, False otherwise.
"""
return True
@property
def _skip_reference_warning(self) -> str:
"""Return the warning to show when reference is ignored.
Returns:
str: The warning to show when reference is ignored.
"""
return (
    f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
    "\nTo use a reference, use the LabeledScoringStringEvalChain"
    " (EvaluatorType.LABELED_SCORED_STRING) instead."
)
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> ScoreStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
Args:
llm (BaseChatModel): The LLM to use (GPT-4 recommended).
prompt (PromptTemplate, optional): The prompt to use.
**kwargs (Any): Additional keyword arguments.
Returns:
PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
"""
if not (isinstance(llm, ChatOpenAI) and llm.model_name.startswith("gpt-4")):
logger.warning(
    "This chain was only tested with GPT-4. "
    "Performance may be significantly worse with other models."
)
expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
criteria_ = resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
def _prepare_input(
self,
prediction: str,
input: Optional[str],
reference: Optional[str],
) -> dict:
"""Prepare the input for the chain.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str, optional): The input or task string.
reference (str, optional): The reference string, if any.
Returns:
dict: The prepared input for the chain.
"""
input_ = {
"prediction": prediction,
"input": input,
}
if self.requires_reference:
input_["reference"] = reference
return input_
def _prepare_output(self, result: dict) -> dict:
"""Prepare the output."""
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_strings(
    self,
    *,
    prediction: str,
    input: Optional[str] = None,
    reference: Optional[str] = None,
    callbacks: Callbacks = None,
    tags: Optional[List[str]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    include_run_info: bool = False,
    **kwargs: Any,
) -> dict:
    """Score the output string.
    Args:
        prediction (str): The output string from the model.
        input (str, optional): The input or task string.
        callbacks (Callbacks, optional): The callbacks to use.
        reference (str, optional): The reference string, if any.
        **kwargs (Any): Additional keyword arguments.
    Returns:
        dict: A dictionary containing:
            - reasoning: The reasoning for the score.
            - score: A score between 1 and 10.
    """
    input_ = self._prepare_input(prediction, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
    self,
    *,
    prediction: str,
    reference: Optional[str] = None,
    input: Optional[str] = None,
    callbacks: Callbacks = None,
    tags: Optional[List[str]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    include_run_info: bool = False,
    **kwargs: Any,
) -> dict:
    """Asynchronously score the output string.
    Args:
        prediction (str): The output string from the model.
        input (str, optional): The input or task string.
        callbacks (Callbacks, optional): The callbacks to use.
        reference (str, optional): The reference string, if any.
        **kwargs (Any): Additional keyword arguments.
    Returns:
        dict: A dictionary containing:
            - reasoning: The reasoning for the score.
            - score: A score between 1 and 10.
    """
    input_ = self._prepare_input(prediction, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class LabeledScoringStringEvalChain(ScoreStringEvalChain):
    """A chain for scoring the output of a model on a scale of 1-10,
    based on a ground truth reference label.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
"""
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return True
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> LabeledScoringStringEvalChain:
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
**kwargs (Any): Additional keyword arguments.
Returns:
LabeledPairwiseStringEvalChain: The initialized LabeledPairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
""" # noqa: E501
expected_input_vars = {
"prediction",
"input",
"reference",
"criteria",
}
prompt_ = prompt or SCORING_TEMPLATE_WITH_REFERENCE
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
criteria_ = resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
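As a usage sketch for the labeled variant (assuming GPT-4 access; inputs are illustrative), the reference is passed alongside the prediction:

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.scoring import LabeledScoringStringEvalChain

llm = ChatOpenAI(model_name="gpt-4", temperature=0)
chain = LabeledScoringStringEvalChain.from_llm(llm=llm)
result = chain.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
    reference="The chemical formula for water is H2O.",
)
# result: {"reasoning": "...Rating: [[10]]", "score": 10}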

View File

@@ -0,0 +1,54 @@
"""Prompts for comparing the outputs of two models for a given question.
This prompt is used to compare two responses and evaluate which one best follows the instructions
and answers the question. The prompt is based on the paper from
Zheng, et. al. https://arxiv.org/abs/2306.05685
"""
# flake8: noqa
from langchain.prompts.chat import ChatPromptTemplate
SYSTEM_MESSAGE = 'You are a helpful assistant.'
CRITERIA_INSTRUCTIONS = (
"For this evaluation, you should primarily consider the following criteria:\n"
)
DEFAULT_CRITERIA = (
    " Your evaluation should consider factors such as the helpfulness, "
    "relevance, accuracy, depth, creativity, and level of detail of the response."
)
SCORING_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        (
            "human",
            "[Instruction]\nPlease act as an impartial judge "
            "and evaluate the quality of the response provided by an AI "
            "assistant to the user question displayed below. {criteria}"
            "Begin your evaluation by providing a short explanation. "
            "Be as objective as possible. After providing your explanation, "
            "you must rate the response on a scale of 1 to 10 by strictly "
            'following this format: "[[rating]]", for example: "Rating: [[5]]".'
            "\n\n[Question]\n{input}\n\n"
            "[The Start of Assistant's Answer]\n{prediction}\n"
            "[The End of Assistant's Answer]",
        ),
    ]
)
SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        (
            "human",
            "[Instruction]\nPlease act as an impartial judge "
            "and evaluate the quality of the response provided by an AI "
            "assistant to the user question displayed below. {criteria}"
            "[Ground truth]\n{reference}\n"
            "Begin your evaluation by providing a short explanation. "
            "Be as objective as possible. After providing your explanation, "
            "you must rate the response on a scale of 1 to 10 by strictly "
            'following this format: "[[rating]]", for example: "Rating: [[5]]".'
            "\n\n[Question]\n{input}\n\n"
            "[The Start of Assistant's Answer]\n{prediction}\n"
            "[The End of Assistant's Answer]",
        ),
    ]
)
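To sanity-check the wiring without calling a model, the template can be rendered directly; a small sketch (the criteria string is illustrative):

messages = SCORING_TEMPLATE.format_messages(
    criteria="accuracy: Is the submission correct, accurate, and factual?\n",
    input="What is the chemical formula for water?",
    prediction="H2O",
)
# messages[0] is the system message; messages[1].content interleaves the
# [Instruction], criteria, [Question], and [Answer] sections.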