mirror of https://github.com/hwchase17/langchain.git
@@ -27,6 +27,7 @@ from langchain.evaluation.string_distance.base import (
    StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.evaluation.scoring.eval_chain import LabeledScoringStringEvalChain, ScoreStringEvalChain


def load_dataset(uri: str) -> List[Dict]:
@@ -70,7 +71,9 @@ _EVALUATOR_MAP: Dict[
    EvaluatorType.COT_QA: CotQAEvalChain,
    EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
    EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
    EvaluatorType.SCORED_STRING: ScoreStringEvalChain,
    EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
    EvaluatorType.LABELED_SCORED_STRING: LabeledScoringStringEvalChain,
    EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
    EvaluatorType.CRITERIA: CriteriaEvalChain,
    EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
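With these registry entries, the new scorer should be reachable through the standard load_evaluator helper. A minimal sketch (not part of the commit), assuming an OpenAI API key is configured; the input and prediction strings are purely illustrative:

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation import EvaluatorType, load_evaluator

    # GPT-4 is what the chain is tested with; other models trigger a warning.
    llm = ChatOpenAI(temperature=0, model_name="gpt-4")

    scorer = load_evaluator(EvaluatorType.SCORED_STRING, llm=llm)
    result = scorer.evaluate_strings(
        input="What is the chemical formula for water?",
        prediction="The chemical formula for water is H2O.",
    )
    print(result["score"])  # an integer between 1 and 10, per the output parser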
@@ -31,9 +31,14 @@ class EvaluatorType(str, Enum):
    PAIRWISE_STRING = "pairwise_string"
    """The pairwise string evaluator, which predicts the preferred prediction from
    between two models."""
    SCORED_STRING = "scored_string"
    """The scored string evaluator, which gives a score between 1 and 10 to a prediction."""
    LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
    """The labeled pairwise string evaluator, which predicts the preferred prediction
    from between two models based on a ground truth reference label."""
    LABELED_SCORED_STRING = "labeled_scored_string"
    """The labeled scored string evaluator, which gives a score between 1 and 10
    to a prediction based on a ground truth reference label."""
    AGENT_TRAJECTORY = "trajectory"
    """The agent trajectory evaluator, which grades the agent's intermediate steps."""
    CRITERIA = "criteria"
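Because EvaluatorType subclasses str, the new members compare equal to their string values, which is convenient when evaluator names come from configuration. A small illustrative check (not part of the commit):

    from langchain.evaluation import EvaluatorType

    # The two enum members added by this commit and their string values.
    assert EvaluatorType.SCORED_STRING == "scored_string"
    assert EvaluatorType.LABELED_SCORED_STRING == "labeled_scored_string"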
34  libs/langchain/langchain/evaluation/scoring/__init__.py  Normal file
@@ -0,0 +1,34 @@
"""Scoring evaluators.

This module contains evaluators for scoring the output of models on a scale
of 1-10, be they LLMs, Chains, or otherwise. The score can be based on a
variety of criteria and, optionally, a ground truth reference answer.

Example:
    >>> from langchain.chat_models import ChatOpenAI
    >>> from langchain.evaluation.scoring import ScoreStringEvalChain
    >>> llm = ChatOpenAI(temperature=0)
    >>> chain = ScoreStringEvalChain.from_llm(llm=llm)
    >>> result = chain.evaluate_strings(
    ...     input="What is the chemical formula for water?",
    ...     prediction=(
    ...         "The chemical formula for water is H2O, which means"
    ...         " there are two hydrogen atoms and one oxygen atom."
    ...     ),
    ... )
    >>> print(result)
    # {
    #     "reasoning": "The response is accurate and explains the formula...",
    #     "score": 10,
    # }
"""
from langchain.evaluation.scoring.eval_chain import (
    LabeledScoringStringEvalChain,
    ScoreStringEvalChain,
)

__all__ = ["ScoreStringEvalChain", "LabeledScoringStringEvalChain"]
429  libs/langchain/langchain/evaluation/scoring/eval_chain.py  Normal file
@@ -0,0 +1,429 @@
"""Base classes for scoring the output of a model on a scale of 1-10."""
from __future__ import annotations

import logging
import re
from typing import Any, Dict, List, Optional, Union

from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.criteria.eval_chain import (
    CRITERIA_TYPE,
    Criteria,
)
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.evaluation.scoring.prompt import (
    CRITERIA_INSTRUCTIONS,
    SCORING_TEMPLATE,
    SCORING_TEMPLATE_WITH_REFERENCE,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.pydantic_v1 import Extra, Field
from langchain.schema import RUN_KEY, BaseOutputParser
from langchain.schema.language_model import BaseLanguageModel

logger = logging.getLogger(__name__)

_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")

_SUPPORTED_CRITERIA = {
    Criteria.CONCISENESS: "Is the submission concise and to the point?",
    Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
    Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
    Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
    Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
    Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
    Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
    Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
    Criteria.MISOGYNY: "Is the submission misogynistic? If so, respond Y.",
    Criteria.CRIMINALITY: "Is the submission criminal in any way?",
    Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
    Criteria.DEPTH: "Does the submission demonstrate depth of thought?",
    Criteria.CREATIVITY: "Does the submission demonstrate novelty or unique ideas?",
    Criteria.DETAIL: "Does the submission demonstrate attention to detail?",
}

def resolve_pairwise_criteria(
    criteria: Optional[Union[CRITERIA_TYPE, str, List[CRITERIA_TYPE]]]
) -> dict:
    """Resolve the criteria for the scoring evaluator.

    Args:
        criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.

    Returns:
        dict: The resolved criteria.

    """
    if criteria is None:
        _default_criteria = [
            Criteria.HELPFULNESS,
            Criteria.RELEVANCE,
            Criteria.CORRECTNESS,
            Criteria.DEPTH,
        ]
        return {k.value: _SUPPORTED_CRITERIA[k] for k in _default_criteria}
    elif isinstance(criteria, Criteria):
        criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
    elif isinstance(criteria, str):
        if criteria in _SUPPORTED_CRITERIA:
            criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
        else:
            criteria_ = {criteria: ""}
    elif isinstance(criteria, ConstitutionalPrinciple):
        criteria_ = {criteria.name: criteria.critique_request}
    elif isinstance(criteria, (list, tuple)):
        criteria_ = {
            k: v
            for criterion in criteria
            for k, v in resolve_pairwise_criteria(criterion).items()
        }
    else:
        if not criteria:
            raise ValueError(
                "Criteria cannot be empty. "
                "Please provide a criterion name or a mapping of the criterion name"
                " to its description."
            )
        criteria_ = dict(criteria)
    return criteria_
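To make the branching above concrete, here is a rough sketch (not part of the commit) of what resolve_pairwise_criteria returns for a few input shapes; the custom "clarity" criterion is invented for illustration:

    from langchain.evaluation.criteria.eval_chain import Criteria
    from langchain.evaluation.scoring.eval_chain import resolve_pairwise_criteria

    # None -> the default rubric of helpfulness, relevance, correctness, and depth.
    print(resolve_pairwise_criteria(None))

    # A Criteria member -> a single-entry mapping using its canned description.
    print(resolve_pairwise_criteria(Criteria.CONCISENESS))
    # {'conciseness': 'Is the submission concise and to the point?'}

    # A custom mapping is passed through unchanged.
    print(resolve_pairwise_criteria({"clarity": "Is the submission clear and unambiguous?"}))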
class ScoreStringResultOutputParser(BaseOutputParser[dict]):
    """A parser for the output of the ScoreStringEvalChain.

    Attributes:
        _type (str): The type of the output parser.

    """

    @property
    def _type(self) -> str:
        """Return the type of the output parser.

        Returns:
            str: The type of the output parser.

        """
        return "score_string_result"

    def parse(self, text: str) -> Dict[str, Any]:
        """Parse the output text.

        Args:
            text (str): The output text to parse.

        Returns:
            Dict: The parsed output.

        Raises:
            ValueError: If the verdict is invalid.

        """
        match = _FIND_DOUBLE_BRACKETS.search(text)

        if match:
            verdict = match.group(1)

        if not match or verdict not in list("123456789") + ["10"]:
            raise ValueError(
                f"Invalid output: {text}. "
                "Output must contain a double bracketed string\
 with the verdict between 1 and 10."
            )

        return {
            "reasoning": text,
            "score": int(verdict),
        }
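The parser only needs the model's free-form critique to contain a double-bracketed rating. A quick sketch of its behaviour (not part of the commit), using a made-up critique string:

    from langchain.evaluation.scoring.eval_chain import ScoreStringResultOutputParser

    parser = ScoreStringResultOutputParser()
    text = (
        "The response answers the question accurately but offers little "
        "additional detail. Rating: [[8]]"
    )
    print(parser.parse(text))
    # {'reasoning': '<the full critique text>', 'score': 8}

    # Anything without a bracketed 1-10 verdict raises ValueError.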
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
    """A chain for scoring on a scale of 1-10 the output of a model.

    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.

    Example:
        >>> from langchain.chat_models import ChatOpenAI
        >>> from langchain.evaluation.scoring import ScoreStringEvalChain
        >>> llm = ChatOpenAI(temperature=0)
        >>> chain = ScoreStringEvalChain.from_llm(llm=llm)
        >>> result = chain.evaluate_strings(
        ...     input="What is the chemical formula for water?",
        ...     prediction=(
        ...         "The chemical formula for water is H2O, which means"
        ...         " there are two hydrogen atoms and one oxygen atom."
        ...     ),
        ... )
        >>> print(result["score"])
        # 10

    """

    output_key: str = "results"  #: :meta private:
    output_parser: BaseOutputParser = Field(
        default_factory=ScoreStringResultOutputParser
    )

    class Config:
        """Configuration for the ScoreStringEvalChain."""

        extra = Extra.ignore

    @property
    def requires_reference(self) -> bool:
        """Return whether the chain requires a reference.

        Returns:
            bool: True if the chain requires a reference, False otherwise.

        """
        return False

    @property
    def requires_input(self) -> bool:
        """Return whether the chain requires an input.

        Returns:
            bool: True if the chain requires an input, False otherwise.

        """
        return True

    @property
    def _skip_reference_warning(self) -> str:
        """Return the warning to show when reference is ignored.

        Returns:
            str: The warning to show when reference is ignored.

        """
        return (
            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
            "\nTo use a reference, use the LabeledScoringStringEvalChain"
            " (EvaluatorType.LABELED_SCORED_STRING) instead."
        )

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        *,
        prompt: Optional[PromptTemplate] = None,
        criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
        **kwargs: Any,
    ) -> ScoreStringEvalChain:
        """Initialize the ScoreStringEvalChain from an LLM.

        Args:
            llm (BaseChatModel): The LLM to use (GPT-4 recommended).
            prompt (PromptTemplate, optional): The prompt to use.
            criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            ScoreStringEvalChain: The initialized ScoreStringEvalChain.

        Raises:
            ValueError: If the input variables are not as expected.

        """
        if not (isinstance(llm, ChatOpenAI) and llm.model_name.startswith("gpt-4")):
            logger.warning(
                "This chain was only tested with GPT-4. \
Performance may be significantly worse with other models."
            )

        expected_input_vars = {"prediction", "input", "criteria"}
        prompt_ = prompt or SCORING_TEMPLATE.partial(reference="")
        if expected_input_vars != set(prompt_.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt_.input_variables}"
            )
        criteria_ = resolve_pairwise_criteria(criteria)
        criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
        return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)

    def _prepare_input(
        self,
        prediction: str,
        input: Optional[str],
        reference: Optional[str],
    ) -> dict:
        """Prepare the input for the chain.

        Args:
            prediction (str): The output string from the model.
            input (str, optional): The input or task string.
            reference (str, optional): The reference string, if any.

        Returns:
            dict: The prepared input for the chain.

        """
        input_ = {
            "prediction": prediction,
            "input": input,
        }
        if self.requires_reference:
            input_["reference"] = reference
        return input_

    def _prepare_output(self, result: dict) -> dict:
        """Prepare the output."""
        parsed = result[self.output_key]
        if RUN_KEY in result:
            parsed[RUN_KEY] = result[RUN_KEY]
        return parsed

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Score the output string.

        Args:
            prediction (str): The output string from the model.
            input (str, optional): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the score.
                - score: A score between 1 and 10.

        """
        input_ = self._prepare_input(prediction, input, reference)
        result = self(
            inputs=input_,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously score the output string.

        Args:
            prediction (str): The output string from the model.
            input (str, optional): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the score.
                - score: A score between 1 and 10.

        """
        input_ = self._prepare_input(prediction, input, reference)
        result = await self.acall(
            inputs=input_,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

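Beyond the docstring example, the chain can be built with a custom rubric; criteria accepts a Criteria member, a ConstitutionalPrinciple, or a plain mapping, as resolved above. A hedged sketch (not part of the commit); the "accuracy" description and the sample strings are invented:

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.scoring.eval_chain import ScoreStringEvalChain

    llm = ChatOpenAI(temperature=0, model_name="gpt-4")
    chain = ScoreStringEvalChain.from_llm(
        llm=llm,
        criteria={"accuracy": "Is the submission factually accurate?"},
    )
    result = chain.evaluate_strings(
        input="Name the largest planet in the solar system.",
        prediction="Jupiter is the largest planet in the solar system.",
    )
    print(result["score"], result["reasoning"])

An async path is available through the public aevaluate_strings method, which routes to the _aevaluate_strings coroutine above.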
class LabeledScoringStringEvalChain(ScoreStringEvalChain):
    """A chain for scoring the output of a model on a scale of 1-10,
    using a ground truth reference label to inform the score.

    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.

    """

    @property
    def requires_reference(self) -> bool:
        """Return whether the chain requires a reference.

        Returns:
            bool: True if the chain requires a reference, False otherwise.

        """
        return True

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        *,
        prompt: Optional[PromptTemplate] = None,
        criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
        **kwargs: Any,
    ) -> LabeledScoringStringEvalChain:
        """Initialize the LabeledScoringStringEvalChain from an LLM.

        Args:
            llm (BaseLanguageModel): The LLM to use.
            prompt (PromptTemplate, optional): The prompt to use.
            criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            LabeledScoringStringEvalChain: The initialized LabeledScoringStringEvalChain.

        Raises:
            ValueError: If the input variables are not as expected.

        """  # noqa: E501
        expected_input_vars = {
            "prediction",
            "input",
            "reference",
            "criteria",
        }
        prompt_ = prompt or SCORING_TEMPLATE_WITH_REFERENCE
        if expected_input_vars != set(prompt_.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt_.input_variables}"
            )
        criteria_ = resolve_pairwise_criteria(criteria)
        criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
        return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
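For completeness, a sketch of the labeled variant (not part of the commit); it differs only in requiring a reference, which the reference-aware template folds into the prompt:

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.scoring.eval_chain import LabeledScoringStringEvalChain

    llm = ChatOpenAI(temperature=0, model_name="gpt-4")
    chain = LabeledScoringStringEvalChain.from_llm(llm=llm)
    result = chain.evaluate_strings(
        input="What is the chemical formula for water?",
        prediction="H2O",
        reference="The chemical formula for water is H2O.",
    )
    print(result["score"])

The same chain should be what load_evaluator(EvaluatorType.LABELED_SCORED_STRING, llm=llm) returns, since the loader dispatches to from_llm for LLMEvalChain subclasses.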
54  libs/langchain/langchain/evaluation/scoring/prompt.py  Normal file
@@ -0,0 +1,54 @@
"""Prompts for scoring the outputs of a model for a given question.

This prompt is used to score a response and evaluate how well it follows
the instructions and answers the question. The prompt is based on the paper
from Zheng et al.: https://arxiv.org/abs/2306.05685
"""
# flake8: noqa
from langchain.prompts.chat import ChatPromptTemplate

SYSTEM_MESSAGE = "You are a helpful assistant."

CRITERIA_INSTRUCTIONS = (
    "For this evaluation, you should primarily consider the following criteria:\n"
)

DEFAULT_CRITERIA = (
    " Your evaluation \
should consider factors such as the helpfulness, relevance, accuracy, \
depth, creativity, and level of detail of the response."
)

SCORING_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        (
            "human",
            "[Instruction]\nPlease act as an impartial judge \
and evaluate the quality of the response provided by an AI \
assistant to the user question displayed below. {criteria}Begin your evaluation \
by providing a short explanation. Be as objective as possible. \
After providing your explanation, you must rate the response on a scale of 1 to 10 \
by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
[Question]\n{input}\n\n[The Start of Assistant's Answer]\n{prediction}\n\
[The End of Assistant's Answer]",
        ),
    ]
)

SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        (
            "human",
            "[Instruction]\nPlease act as an impartial judge \
and evaluate the quality of the response provided by an AI \
assistant to the user question displayed below. {criteria}{reference}Begin your evaluation \
by providing a short explanation. Be as objective as possible. \
After providing your explanation, you must rate the response on a scale of 1 to 10 \
by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
[Question]\n{input}\n\n[The Start of Assistant's Answer]\n{prediction}\n\
[The End of Assistant's Answer]",
        ),
    ]
)
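To see exactly what the judge model receives, the template can be rendered directly. A small sketch (not part of the commit), with an invented criteria block:

    from langchain.evaluation.scoring.prompt import CRITERIA_INSTRUCTIONS, SCORING_TEMPLATE

    criteria = CRITERIA_INSTRUCTIONS + "accuracy: Is the submission factually accurate?\n"
    messages = SCORING_TEMPLATE.format_messages(
        criteria=criteria,
        input="What is the chemical formula for water?",
        prediction="H2O",
    )
    for message in messages:
        print(f"{message.type}: {message.content}")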