From 7edcb50aa21049d0f23e42c004f05c033cd852ed Mon Sep 17 00:00:00 2001
From: CG80499
Date: Wed, 27 Sep 2023 13:58:27 +0000
Subject: [PATCH] doc string changes

---
 .../evaluation/comparison/eval_chain.py       |  2 +-
 .../langchain/langchain/evaluation/loading.py |  9 +-
 libs/langchain/langchain/evaluation/schema.py |  7 +-
 .../langchain/evaluation/scoring/__init__.py  | 26 +++---
 .../evaluation/scoring/eval_chain.py          | 82 +++++++++----------
 .../langchain/evaluation/scoring/prompt.py    | 30 ++++---
 6 files changed, 76 insertions(+), 80 deletions(-)

diff --git a/libs/langchain/langchain/evaluation/comparison/eval_chain.py b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
index 0825417ea9c..dec14f0ba39 100644
--- a/libs/langchain/langchain/evaluation/comparison/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
@@ -159,7 +159,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
     Example:
         >>> from langchain.chat_models import ChatOpenAI
         >>> from langchain.evaluation.comparison import PairwiseStringEvalChain
-        >>> llm = ChatOpenAI(temperature=0)
+        >>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
        >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
         >>> result = chain.evaluate_string_pairs(
         ...     input = "What is the chemical formula for water?",
diff --git a/libs/langchain/langchain/evaluation/loading.py b/libs/langchain/langchain/evaluation/loading.py
index b673810c031..21aacaf6148 100644
--- a/libs/langchain/langchain/evaluation/loading.py
+++ b/libs/langchain/langchain/evaluation/loading.py
@@ -22,12 +22,15 @@ from langchain.evaluation.parsing.base import (
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
 from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator
 from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
+from langchain.evaluation.scoring.eval_chain import (
+    LabeledScoreStringEvalChain,
+    ScoreStringEvalChain,
+)
 from langchain.evaluation.string_distance.base import (
     PairwiseStringDistanceEvalChain,
     StringDistanceEvalChain,
 )
 from langchain.schema.language_model import BaseLanguageModel
-from langchain.evaluation.scoring.eval_chain import LabeledScoringStringEvalChain, ScoreStringEvalChain
 
 
 def load_dataset(uri: str) -> List[Dict]:
@@ -71,9 +74,9 @@ _EVALUATOR_MAP: Dict[
     EvaluatorType.COT_QA: CotQAEvalChain,
     EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
     EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
-    EvaluatorType.SCORED_STRING: ScoreStringEvalChain,
+    EvaluatorType.SCORE_STRING: ScoreStringEvalChain,
     EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
-    EvaluatorType.LABELED_SCORED_STRING: LabeledScoringStringEvalChain,
+    EvaluatorType.LABELED_SCORE_STRING: LabeledScoreStringEvalChain,
     EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
     EvaluatorType.CRITERIA: CriteriaEvalChain,
     EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
diff --git a/libs/langchain/langchain/evaluation/schema.py b/libs/langchain/langchain/evaluation/schema.py
index 54ecf03490d..b7de056bb83 100644
--- a/libs/langchain/langchain/evaluation/schema.py
+++ b/libs/langchain/langchain/evaluation/schema.py
@@ -31,12 +31,13 @@ class EvaluatorType(str, Enum):
     PAIRWISE_STRING = "pairwise_string"
     """The pairwise string evaluator, which predicts the preferred prediction from
     between two models."""
-    SCORED_STRING = "scored_string"
-    """The scored string evaluator, which gives a score between 1 and 10 to a prediction."""
+    SCORE_STRING = "scored_string"
+    """The scored string evaluator, which gives a score between 1 and 10
+    to a prediction."""
     LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
     """The labeled pairwise string evaluator, which predicts the preferred prediction from
     between two models based on a ground truth reference label."""
-    LABELED_SCORED_STRING = "labeled_scored_string"
+    LABELED_SCORE_STRING = "labeled_scored_string"
     """The labeled scored string evaluator, which gives a score between 1 and 10
     to a prediction based on a ground truth reference label."""
     AGENT_TRAJECTORY = "trajectory"
diff --git a/libs/langchain/langchain/evaluation/scoring/__init__.py b/libs/langchain/langchain/evaluation/scoring/__init__.py
index 92e26075534..5ded9edfed1 100644
--- a/libs/langchain/langchain/evaluation/scoring/__init__.py
+++ b/libs/langchain/langchain/evaluation/scoring/__init__.py
@@ -6,29 +6,25 @@ criteria and or a reference answer.
 
 Example:
     >>> from langchain.chat_models import ChatOpenAI
-    >>> from langchain.evaluation.comparison import PairwiseStringEvalChain
-    >>> llm = ChatOpenAI(temperature=0)
-    >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
-    >>> result = chain.evaluate_string_pairs(
+    >>> from langchain.evaluation.scoring import ScoreStringEvalChain
+    >>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
+    >>> chain = ScoreStringEvalChain.from_llm(llm=llm)
+    >>> result = chain.evaluate_strings(
     ...     input = "What is the chemical formula for water?",
     ...     prediction = "H2O",
-    ...     prediction_b = (
-    ...        "The chemical formula for water is H2O, which means"
-    ...        " there are two hydrogen atoms and one oxygen atom."
     ...     reference = "The chemical formula for water is H2O.",
     ...     )
-    >>> print(result["text"])
+    >>> print(result)
     # {
-    #    "value": "B",
-    #    "comment": "Both responses accurately state"
-    #       " that the chemical formula for water is H2O."
-    #       " However, Response B provides additional information"
-    # .     " by explaining what the formula means.\\n[[B]]"
+    #    "score": 8,
+    #    "reasoning": "The response accurately states "
+    #    "that the chemical formula for water is H2O."
+    #    " However, it does not provide an explanation of what the formula means."
     # }
 """

-from langchain.evaluation.comparison.eval_chain import (
-    LabeledPairwiseStringEvalChain,
-    PairwiseStringEvalChain,
+from langchain.evaluation.scoring.eval_chain import (
+    LabeledScoreStringEvalChain,
+    ScoreStringEvalChain,
 )

-__all__ = ["PairwiseStringEvalChain", "LabeledPairwiseStringEvalChain"]
+__all__ = ["ScoreStringEvalChain", "LabeledScoreStringEvalChain"]
diff --git a/libs/langchain/langchain/evaluation/scoring/eval_chain.py b/libs/langchain/langchain/evaluation/scoring/eval_chain.py
index cd83cb4d18c..cf20e100cce 100644
--- a/libs/langchain/langchain/evaluation/scoring/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/scoring/eval_chain.py
@@ -8,17 +8,19 @@ from typing import Any, Dict, List, Optional, Union
 from langchain.callbacks.manager import Callbacks
 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
+from langchain.chat_models.azure_openai import AzureChatOpenAI
 from langchain.chat_models.openai import ChatOpenAI
-from langchain.evaluation.comparison.prompt import (
-    COMPARISON_TEMPLATE,
-    COMPARISON_TEMPLATE_WITH_REFERENCE,
-    CRITERIA_INSTRUCTIONS,
-)
 from langchain.evaluation.criteria.eval_chain import (
     CRITERIA_TYPE,
     Criteria,
 )
 from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
+from langchain.evaluation.scoring.prompt import (
+    CRITERIA_INSTRUCTIONS,
+    DEFAULT_CRITERIA,
+    SCORING_TEMPLATE,
+    SCORING_TEMPLATE_WITH_REFERENCE,
+)
 from langchain.prompts.prompt import PromptTemplate
 from langchain.pydantic_v1 import Extra, Field
 from langchain.schema import RUN_KEY, BaseOutputParser
@@ -93,7 +95,7 @@ def resolve_pairwise_criteria(
 
 
 class ScoreStringResultOutputParser(BaseOutputParser[dict]):
-    """A parser for the output of the PairwiseStringEvalChain.
+    """A parser for the output of the ScoreStringEvalChain.
 
     Attributes:
         _type (str): The type of the output parser.
@@ -134,7 +136,7 @@ class ScoreStringResultOutputParser(BaseOutputParser[dict]):
                 "Output must contain a double bracketed string\
                 with the verdict between 1 and 10."
             )
-        
+
         return {
             "reasoning": text,
             "score": int(verdict),
@@ -149,24 +151,20 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
 
     Example:
         >>> from langchain.chat_models import ChatOpenAI
-        >>> from langchain.evaluation.comparison import PairwiseStringEvalChain
-        >>> llm = ChatOpenAI(temperature=0)
-        >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
-        >>> result = chain.evaluate_string_pairs(
+        >>> from langchain.evaluation.scoring import ScoreStringEvalChain
+        >>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
+        >>> chain = ScoreStringEvalChain.from_llm(llm=llm)
+        >>> result = chain.evaluate_strings(
         ...     input = "What is the chemical formula for water?",
         ...     prediction = "H2O",
-        ...     prediction_b = (
-        ...        "The chemical formula for water is H2O, which means"
-        ...        " there are two hydrogen atoms and one oxygen atom."
         ...     reference = "The chemical formula for water is H2O.",
        ...     )
-        >>> print(result["text"])
+        >>> print(result)
         # {
-        #    "value": "B",
-        #    "comment": "Both responses accurately state"
-        #       " that the chemical formula for water is H2O."
-        #       " However, Response B provides additional information"
-        # .     " by explaining what the formula means.\\n[[B]]"
+        #    "score": 8,
+        #    "reasoning": "The response accurately states "
+        #    "that the chemical formula for water is H2O."
+        #    " However, it does not provide an explanation of what the formula means."
         # }
     """

@@ -177,7 +175,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
     )
 
     class Config:
-        """Configuration for the PairwiseStringEvalChain."""
+        """Configuration for the ScoreStringEvalChain."""
 
         extra = Extra.ignore
 
@@ -211,8 +209,8 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
         """
         return (
             f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
-            "\nTo use a reference, use the LabeledScoringStringEvalChain instead."
-            " (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
+            "\nTo use a reference, use the LabeledScoreStringEvalChain"
+            " (EvaluatorType.LABELED_SCORE_STRING) instead."
         )
 
     @classmethod
@@ -224,7 +222,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
         criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
         **kwargs: Any,
     ) -> ScoreStringEvalChain:
-        """Initialize the PairwiseStringEvalChain from an LLM.
+        """Initialize the ScoreStringEvalChain from an LLM.
 
         Args:
             llm (BaseChatModel): The LLM to use (GPT-4 recommended).
@@ -238,14 +236,17 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             ValueError: If the input variables are not as expected.
 
         """
-        if not (isinstance(llm, ChatOpenAI) and llm.model_name.startswith("gpt-4")):
+        if not (
+            isinstance(llm, (ChatOpenAI, AzureChatOpenAI))
+            and llm.model_name.startswith("gpt-4")
+        ):
             logger.warning(
                 "This chain was only tested with GPT-4. \
Performance may be significantly worse with other models."
             )
 
-        expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
-        prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
+        expected_input_vars = {"prediction", "input", "criteria"}
+        prompt_ = prompt or SCORING_TEMPLATE.partial(reference="")
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -253,7 +254,9 @@ Performance may be significantly worse with other models."
             )
         criteria_ = resolve_pairwise_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
-        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
+        criteria_str = (
+            CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else DEFAULT_CRITERIA
+        )
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
 
     def _prepare_input(
@@ -289,11 +292,10 @@ Performance may be significantly worse with other models."
             parsed[RUN_KEY] = result[RUN_KEY]
         return parsed
 
-    def _evaluate_string_pairs(
+    def _evaluate_strings(
         self,
         *,
         prediction: str,
-        prediction_b: str,
         input: Optional[str] = None,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -302,7 +304,7 @@ Performance may be significantly worse with other models."
         include_run_info: bool = False,
         **kwargs: Any,
     ) -> dict:
-        """Evaluate whether output A is preferred to output B.
+        """Score the output string.
 
         Args:
             prediction (str): The output string from the first model.
@@ -317,7 +319,7 @@ Performance may be significantly worse with other models."
             - score: A score between 1 and 10.
 
         """
-        input_ = self._prepare_input(prediction, prediction_b, input, reference)
+        input_ = self._prepare_input(prediction, input, reference)
         result = self(
             inputs=input_,
             callbacks=callbacks,
@@ -331,7 +333,6 @@ Performance may be significantly worse with other models."
         self,
         *,
         prediction: str,
-        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -340,11 +341,10 @@ Performance may be significantly worse with other models."
         include_run_info: bool = False,
         **kwargs: Any,
     ) -> dict:
-        """Asynchronously evaluate whether output A is preferred to output B.
+        """Asynchronously score the output string.
 
         Args:
             prediction (str): The output string from the first model.
-            prediction_b (str): The output string from the second model.
             input (str, optional): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -356,7 +356,7 @@ Performance may be significantly worse with other models."
             - score: A score between 1 and 10.
 
         """
-        input_ = self._prepare_input(prediction, prediction_b, input, reference)
+        input_ = self._prepare_input(prediction, input, reference)
         result = await self.acall(
             inputs=input_,
             callbacks=callbacks,
@@ -367,10 +367,8 @@ Performance may be significantly worse with other models."
         return self._prepare_output(result)
 
 
-class LabeledScoringStringEvalChain(ScoreStringEvalChain):
-    """A chain for comparing two outputs, such as the outputs
-    of two models, prompts, or outputs of a single model on similar inputs,
-    with labeled preferences.
+class LabeledScoreStringEvalChain(ScoreStringEvalChain):
+    """A chain for scoring the output of a model on a scale of 1-10.
 
     Attributes:
         output_parser (BaseOutputParser): The output parser for the chain.
@@ -395,8 +393,8 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
         prompt: Optional[PromptTemplate] = None,
         criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
         **kwargs: Any,
-    ) -> LabeledScoringStringEvalChain:
-        """Initialize the LabeledPairwiseStringEvalChain from an LLM.
+    ) -> LabeledScoreStringEvalChain:
+        """Initialize the LabeledScoreStringEvalChain from an LLM.
 
         Args:
             llm (BaseLanguageModel): The LLM to use.
@@ -405,7 +403,7 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
             **kwargs (Any): Additional keyword arguments.
 
         Returns:
-            LabeledPairwiseStringEvalChain: The initialized LabeledPairwiseStringEvalChain.
+            LabeledScoreStringEvalChain: The initialized LabeledScoreStringEvalChain.
 
         Raises:
             ValueError: If the input variables are not as expected.
@@ -417,7 +415,7 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
             "reference",
             "criteria",
         }
-        prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
+        prompt_ = prompt or SCORING_TEMPLATE_WITH_REFERENCE
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
diff --git a/libs/langchain/langchain/evaluation/scoring/prompt.py b/libs/langchain/langchain/evaluation/scoring/prompt.py
index 08eea69dd95..972ff142c35 100644
--- a/libs/langchain/langchain/evaluation/scoring/prompt.py
+++ b/libs/langchain/langchain/evaluation/scoring/prompt.py
@@ -1,54 +1,52 @@
-"""Prompts for comparing the outputs of two models for a given question.
+"""Prompts for scoring the output of a model for a given question.
 
-This prompt is used to compare two responses and evaluate which one best follows the instructions
+This prompt is used to score a response and evaluate how well it follows the instructions
 and answers the question. The prompt is based on the paper from
 Zheng, et. al. https://arxiv.org/abs/2306.05685
 """
 # flake8: noqa
 from langchain.prompts.chat import ChatPromptTemplate
 
-SYSTEM_MESSAGE = 'You are a helpful assistant.'
+SYSTEM_MESSAGE = "You are a helpful assistant."
 
 CRITERIA_INSTRUCTIONS = (
     "For this evaluation, you should primarily consider the following criteria:\n"
 )
 
-DEFAULT_CRITERIA = (
-    " Your evaluation \
+DEFAULT_CRITERIA = " Your evaluation \
 should consider factors such as the helpfulness, relevance, accuracy, \
 depth, creativity, and level of detail of the response."
-)
 
-COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
+SCORING_TEMPLATE = ChatPromptTemplate.from_messages(
     [
         ("system", SYSTEM_MESSAGE),
         (
             "human",
-            "[Instruction]\nPlease act as an impartial judge \
+            '[Instruction]\nPlease act as an impartial judge \
 and evaluate the quality of the response provided by an AI \
 assistant to the user question displayed below. {criteria}Begin your evaluation \
 by providing a short explanation. Be as objective as possible. \
 After providing your explanation, you must rate the response on a scale of 1 to 10 \
-by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
-[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n\
-[The End of Assistant's Answer]",
+by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
+[Question]\n{input}\n\n[The Start of Assistant\'s Answer]\n{prediction}\n\
+[The End of Assistant\'s Answer]',
         ),
     ]
 )
 
-COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
+SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
     [
         ("system", SYSTEM_MESSAGE),
         (
             "human",
-            "[Instruction]\nPlease act as an impartial judge \
+            '[Instruction]\nPlease act as an impartial judge \
 and evaluate the quality of the response provided by an AI \
 assistant to the user question displayed below. {criteria}{reference}Begin your evaluation \
 by providing a short explanation. Be as objective as possible. \
 After providing your explanation, you must rate the response on a scale of 1 to 10 \
-by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
-[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n\
-[The End of Assistant's Answer]",
+by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
+[Question]\n{input}\n\n[The Start of Assistant\'s Answer]\n{prediction}\n\
+[The End of Assistant\'s Answer]',
         ),
     ]
 )
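
Example usage of the scoring evaluators introduced in this patch. This is a minimal sketch, not part of the diff above; it assumes an OpenAI API key is configured, GPT-4 access, and that load_evaluator and EvaluatorType remain re-exported from langchain.evaluation (that re-export is not shown in this diff). The input, prediction, and reference values are illustrative.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator

# GPT-4 is the model recommended by the warning check in ScoreStringEvalChain.from_llm.
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

# Reference-free scoring (EvaluatorType.SCORE_STRING -> ScoreStringEvalChain).
evaluator = load_evaluator(EvaluatorType.SCORE_STRING, llm=llm)
result = evaluator.evaluate_strings(
    prediction="H2O",
    input="What is the chemical formula for water?",
)
print(result["score"])      # integer from 1 to 10, parsed from the "[[rating]]" verdict
print(result["reasoning"])  # the judge's explanation, per ScoreStringResultOutputParser

# Scoring against a ground-truth label
# (EvaluatorType.LABELED_SCORE_STRING -> LabeledScoreStringEvalChain).
labeled = load_evaluator(EvaluatorType.LABELED_SCORE_STRING, llm=llm)
labeled_result = labeled.evaluate_strings(
    prediction="H2O",
    input="What is the chemical formula for water?",
    reference="The chemical formula for water is H2O.",
)

Both evaluators return the dictionary produced by ScoreStringResultOutputParser, so callers read result["score"] and result["reasoning"] rather than the raw chain "text" output.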