mirror of https://github.com/hwchase17/langchain.git
doc string changes
@@ -159,7 +159,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
@@ -22,12 +22,15 @@ from langchain.evaluation.parsing.base import (
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
from langchain.evaluation.scoring.eval_chain import (
LabeledScoreStringEvalChain,
ScoreStringEvalChain,
)
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.evaluation.scoring.eval_chain import LabeledScoringStringEvalChain, ScoreStringEvalChain

def load_dataset(uri: str) -> List[Dict]:
@@ -71,9 +74,9 @@ _EVALUATOR_MAP: Dict[
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.SCORED_STRING: ScoreStringEvalChain,
EvaluatorType.SCORE_STRING: ScoreStringEvalChain,
EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
EvaluatorType.LABELED_SCORED_STRING: LabeledScoringStringEvalChain,
EvaluatorType.LABELED_SCORE_STRING: LabeledScoreStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
@@ -31,12 +31,13 @@ class EvaluatorType(str, Enum):
PAIRWISE_STRING = "pairwise_string"
"""The pairwise string evaluator, which predicts the preferred prediction from
between two models."""
SCORED_STRING = "scored_string"
"""The scored string evaluator, which gives a score between 1 and 10 to a prediction."""
SCORE_STRING = "scored_string"
"""The scored string evaluator, which gives a score between 1 and 10
to a prediction."""
LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
"""The labeled pairwise string evaluator, which predicts the preferred prediction
from between two models based on a ground truth reference label."""
LABELED_SCORED_STRING = "labeled_scored_string"
LABELED_SCORE_STRING = "labeled_scored_string"
"""The labeled scored string evaluator, which gives a score between 1 and 10
to a prediction based on a ground truth reference label."""
AGENT_TRAJECTORY = "trajectory"
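A minimal usage sketch of the renamed SCORE_STRING member (not part of this diff; it assumes langchain's load_evaluator helper and a GPT-4 chat model, and the inputs are illustrative):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator

# load_evaluator resolves EvaluatorType.SCORE_STRING to ScoreStringEvalChain
# through the _EVALUATOR_MAP updated in the hunk above.
llm = ChatOpenAI(temperature=0, model_name="gpt-4")
evaluator = load_evaluator(EvaluatorType.SCORE_STRING, llm=llm)
result = evaluator.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
)
print(result["score"])  # an integer from 1 to 10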
@@ -6,29 +6,25 @@ criteria and or a reference answer.
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> from langchain.evaluation.scoring import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
>>> chain = ScoreStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\\n[[B]]"
# "score": 8,
# "comment": "The response accurately states "
# "that the chemical formula for water is H2O."
# "However, it does not provide an explanation of what the formula means."
# }
"""
from langchain.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
from langchain.evaluation.scoring.eval_chain import (
LabeledScoreStringEvalChain,
ScoreStringEvalChain,
)
__all__ = ["PairwiseStringEvalChain", "LabeledPairwiseStringEvalChain"]
__all__ = ["ScoreStringEvalChain", "LabeledScoreStringEvalChain"]
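The module docstring above covers the unlabeled chain; a short sketch of the labeled, reference-based variant exported in __all__ (the model and inputs are illustrative assumptions):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.scoring import LabeledScoreStringEvalChain

llm = ChatOpenAI(temperature=0, model_name="gpt-4")
chain = LabeledScoreStringEvalChain.from_llm(llm=llm)
result = chain.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
    reference="The chemical formula for water is H2O.",
)
print(result["score"])  # graded 1-10 against the reference answer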
@@ -8,17 +8,19 @@ from typing import Any, Dict, List, Optional, Union
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.chat_models.azure_openai import AzureChatOpenAI
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.comparison.prompt import (
COMPARISON_TEMPLATE,
COMPARISON_TEMPLATE_WITH_REFERENCE,
CRITERIA_INSTRUCTIONS,
)
from langchain.evaluation.criteria.eval_chain import (
CRITERIA_TYPE,
Criteria,
)
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.evaluation.scoring.prompt import (
CRITERIA_INSTRUCTIONS,
DEFAULT_CRITERIA,
SCORING_TEMPLATE,
SCORING_TEMPLATE_WITH_REFERENCE,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.pydantic_v1 import Extra, Field
from langchain.schema import RUN_KEY, BaseOutputParser
@@ -93,7 +95,7 @@ def resolve_pairwise_criteria(
class ScoreStringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the PairwiseStringEvalChain.
"""A parser for the output of the ScoreStringEvalChain.
Attributes:
_type (str): The type of the output parser.
@@ -134,7 +136,7 @@ class ScoreStringResultOutputParser(BaseOutputParser[dict]):
"Output must contain a double bracketed string\
with the verdict between 1 and 10."
)
return {
"reasoning": text,
"score": int(verdict),
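The parser above requires a double-bracketed verdict between 1 and 10 and returns the raw text as reasoning; a stand-alone regex sketch of that contract (the helper name and regex are illustrative, not the library's implementation):

import re

def parse_score(text: str) -> dict:
    # The LLM is instructed to end its answer with e.g. "Rating: [[7]]".
    match = re.search(r"\[\[(\d+)\]\]", text)
    if match is None or not 1 <= int(match.group(1)) <= 10:
        raise ValueError(
            "Output must contain a double bracketed string "
            "with the verdict between 1 and 10."
        )
    return {"reasoning": text, "score": int(match.group(1))}

parse_score("The answer is correct but terse.\nRating: [[7]]")
# -> {"reasoning": "...", "score": 7}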
@@ -149,24 +151,20 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> from langchain.evaluation.scoring import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
>>> chain = ScoreStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\\n[[B]]"
# "score": 8,
# "comment": "The response accurately states "
# "that the chemical formula for water is H2O."
# "However, it does not provide an explanation of what the formula means."
# }
"""
@@ -177,7 +175,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
class Config:
"""Configuration for the PairwiseStringEvalChain."""
"""Configuration for the ScoreStringEvalChain."""
extra = Extra.ignore
@@ -211,8 +209,8 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
"""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
"\nTo use a reference, use the LabeledScoringStringEvalChain instead."
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
"\nTo use a reference, use the LabeledScoreStringEvalChain instead."
" (EvaluatorType.LABELED_SCORE_STRING) instead."
)
@classmethod
@@ -224,7 +222,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> ScoreStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
"""Initialize the ScoreStringEvalChain from an LLM.
Args:
llm (BaseChatModel): The LLM to use (GPT-4 recommended).
@@ -238,14 +236,17 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
ValueError: If the input variables are not as expected.
"""
if not (isinstance(llm, ChatOpenAI) and llm.model_name.startswith("gpt-4")):
if not (
isinstance(llm, (ChatOpenAI, AzureChatOpenAI))
and llm.model_name.startswith("gpt-4")
):
logger.warning(
"This chain was only tested with GPT-4. \
Performance may be significantly worse with other models."
)
expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
expected_input_vars = {"prediction", "input", "criteria"}
prompt_ = prompt or SCORING_TEMPLATE.partial(reference="")
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
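Since from_llm now checks a custom prompt against the variables {"prediction", "input", "criteria"} and otherwise falls back to SCORING_TEMPLATE.partial(reference=""), a custom prompt only needs those three variables; the template text below is an illustrative assumption, not part of this commit:

from langchain.prompts.prompt import PromptTemplate

custom_prompt = PromptTemplate.from_template(
    "Score the answer from 1 to 10, ending with 'Rating: [[n]]'.\n"
    "{criteria}\nQuestion: {input}\nAnswer: {prediction}"
)
# Satisfies the input-variable check performed in ScoreStringEvalChain.from_llm.
assert set(custom_prompt.input_variables) == {"prediction", "input", "criteria"}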
@@ -253,7 +254,9 @@ Performance may be significantly worse with other models."
)
criteria_ = resolve_pairwise_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
criteria_str = (
CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else DEFAULT_CRITERIA
)
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
def _prepare_input(
@@ -289,11 +292,10 @@ Performance may be significantly worse with other models."
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_string_pairs(
def _evaluate_strings(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
@@ -302,7 +304,7 @@ Performance may be significantly worse with other models."
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate whether output A is preferred to output B.
"""Score the output string.
Args:
prediction (str): The output string from the first model.
@@ -317,7 +319,7 @@ Performance may be significantly worse with other models."
- score: A score between 1 and 10.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
input_ = self._prepare_input(prediction, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
@@ -331,7 +333,6 @@ Performance may be significantly worse with other models."
self,
*,
prediction: str,
prediction_b: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
@@ -340,11 +341,10 @@ Performance may be significantly worse with other models."
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate whether output A is preferred to output B.
"""Asynchronously score the output string.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str, optional): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
@@ -356,7 +356,7 @@ Performance may be significantly worse with other models."
- score: A score between 1 and 10.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
input_ = self._prepare_input(prediction, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
@@ -367,10 +367,8 @@ Performance may be significantly worse with other models."
return self._prepare_output(result)
class LabeledScoringStringEvalChain(ScoreStringEvalChain):
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs,
with labeled preferences.
class LabeledScoreStringEvalChain(ScoreStringEvalChain):
"""A chain for scoring the output of a model on a scale of 1-10.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
@@ -395,8 +393,8 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> LabeledScoringStringEvalChain:
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
) -> LabeledScoreStringEvalChain:
"""Initialize the LabeledScoreStringEvalChain from an LLM.
Args:
llm (BaseLanguageModel): The LLM to use.
@@ -405,7 +403,7 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
**kwargs (Any): Additional keyword arguments.
Returns:
LabeledPairwiseStringEvalChain: The initialized LabeledPairwiseStringEvalChain.
LabeledScoreStringEvalChain: The initialized LabeledScoreStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
@@ -417,7 +415,7 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
"reference",
"criteria",
}
prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
prompt_ = prompt or SCORING_TEMPLATE_WITH_REFERENCE
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@@ -1,54 +1,52 @@
"""Prompts for comparing the outputs of two models for a given question.
"""Prompts for scoring the output of a model for a given question.
This prompt is used to compare two responses and evaluate which one best follows the instructions
This prompt is used to score the response and evaluate how well it follows the instructions
and answers the question. The prompt is based on the paper from
Zheng, et. al. https://arxiv.org/abs/2306.05685
"""
|
||||
# flake8: noqa
|
||||
from langchain.prompts.chat import ChatPromptTemplate
|
||||
|
||||
SYSTEM_MESSAGE = 'You are a helpful assistant.'
|
||||
SYSTEM_MESSAGE = "You are a helpful assistant."
|
||||
|
||||
CRITERIA_INSTRUCTIONS = (
|
||||
"For this evaluation, you should primarily consider the following criteria:\n"
|
||||
)
|
||||
|
||||
DEFAULT_CRITERIA = (
|
||||
" Your evaluation \
|
||||
DEFAULT_CRITERIA = " Your evaluation \
|
||||
should consider factors such as the helpfulness, relevance, accuracy, \
|
||||
depth, creativity, and level of detail of the response."
|
||||
)
|
||||
|
||||
COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
|
||||
SCORING_TEMPLATE = ChatPromptTemplate.from_messages(
|
||||
[
|
||||
("system", SYSTEM_MESSAGE),
|
||||
(
|
||||
"human",
|
||||
"[Instruction]\nPlease act as an impartial judge \
|
||||
'[Instruction]\nPlease act as an impartial judge \
|
||||
and evaluate the quality of the response provided by an AI \
|
||||
assistant to the user question displayed below. {criteria}Begin your evaluation \
|
||||
by providing a short explanation. Be as objective as possible. \
|
||||
After providing your explanation, you must rate the response on a scale of 1 to 10 \
|
||||
by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
|
||||
[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n\
|
||||
[The End of Assistant's Answer]",
|
||||
by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
|
||||
[Question]\n{question}\n\n[The Start of Assistant\'s Answer]\n{answer}\n\
|
||||
[The End of Assistant\'s Answer]',
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
|
||||
SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
|
||||
[
|
||||
("system", SYSTEM_MESSAGE),
|
||||
(
|
||||
"human",
|
||||
"[Instruction]\nPlease act as an impartial judge \
|
||||
'[Instruction]\nPlease act as an impartial judge \
|
||||
and evaluate the quality of the response provided by an AI \
|
||||
assistant to the user question displayed below. {criteria}{reference}Begin your evaluation \
|
||||
by providing a short explanation. Be as objective as possible. \
|
||||
After providing your explanation, you must rate the response on a scale of 1 to 10 \
|
||||
by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
|
||||
[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n\
|
||||
[The End of Assistant's Answer]",
|
||||
by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
|
||||
[Question]\n{question}\n\n[The Start of Assistant\'s Answer]\n{answer}\n\
|
||||
[The End of Assistant\'s Answer]',
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
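An illustrative rendering of the new SCORING_TEMPLATE as it is wired up in from_llm above (criteria baked in via .partial(), question and answer supplied per call); the values are assumptions for the sketch:

from langchain.evaluation.scoring.prompt import DEFAULT_CRITERIA, SCORING_TEMPLATE

# Fill the criteria slot once, then format the remaining variables per example.
messages = SCORING_TEMPLATE.partial(criteria=DEFAULT_CRITERIA).format_messages(
    question="What is the chemical formula for water?",
    answer="H2O",
)
print(messages[1].content)  # the human turn containing the "[[rating]]" instructions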