This commit is contained in:
CG80499
2023-09-26 19:13:09 +00:00
parent 1f80b7204f
commit eb648dfdd3
5 changed files with 525 additions and 0 deletions

View File

@@ -27,6 +27,7 @@ from langchain.evaluation.string_distance.base import (
StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.evaluation.scoring.eval_chain import (
    LabeledScoringStringEvalChain,
    ScoreStringEvalChain,
)
def load_dataset(uri: str) -> List[Dict]:
@@ -70,7 +71,9 @@ _EVALUATOR_MAP: Dict[
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.SCORED_STRING: ScoreStringEvalChain,
EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
EvaluatorType.LABELED_SCORED_STRING: LabeledScoringStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
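The map addition above is what makes the new evaluators loadable by name. A minimal usage sketch (assuming an OpenAI API key is configured; load_evaluator is the public entry point that resolves evaluator types through _EVALUATOR_MAP):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator

llm = ChatOpenAI(model_name="gpt-4", temperature=0)
# EvaluatorType.SCORED_STRING resolves to ScoreStringEvalChain via the map above.
evaluator = load_evaluator(EvaluatorType.SCORED_STRING, llm=llm)
result = evaluator.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
)
print(result["score"])  # an integer from 1 to 10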

View File

@@ -31,9 +31,14 @@ class EvaluatorType(str, Enum):
PAIRWISE_STRING = "pairwise_string"
"""The pairwise string evaluator, which predicts the preferred prediction from
between two models."""
SCORED_STRING = "scored_string"
"""The scored string evaluator, which gives a score between 1 and 10 to a prediction."""
LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
"""The labeled pairwise string evaluator, which predicts the preferred prediction
from between two models based on a ground truth reference label."""
LABELED_SCORED_STRING = "labeled_scored_string"
"""The labeled scored string evaluator, which gives a score between 1 and 10
to a prediction based on a ground truth reference label."""
AGENT_TRAJECTORY = "trajectory"
"""The agent trajectory evaluator, which grades the agent's intermediate steps."""
CRITERIA = "criteria"
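For illustration, the practical difference between the two new evaluator types is only whether a ground-truth reference is supplied; a sketch, reusing the llm and load_evaluator imports from the previous example:

# Reference-free: the judge grades the prediction against the criteria alone.
scorer = load_evaluator(EvaluatorType.SCORED_STRING, llm=llm)
scorer.evaluate_strings(input="Name the largest planet.", prediction="Jupiter.")

# Reference-based: the judge also sees a ground-truth label.
labeled = load_evaluator(EvaluatorType.LABELED_SCORED_STRING, llm=llm)
labeled.evaluate_strings(
    input="Name the largest planet.",
    prediction="Jupiter.",
    reference="Jupiter is the largest planet in the solar system.",
)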

View File

@@ -0,0 +1,34 @@
"""Scoring evaluators.
This module contains evaluators for scoring on a 1-10 the output of models,
be they LLMs, Chains, or otherwise. This can be based on a variety of
criteria and or a reference answer.
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# . " by explaining what the formula means.\\n[[B]]"
# }
"""
from langchain.evaluation.scoring.eval_chain import (
    LabeledScoringStringEvalChain,
    ScoreStringEvalChain,
)
__all__ = ["ScoreStringEvalChain", "LabeledScoringStringEvalChain"]

View File

@@ -0,0 +1,429 @@
"""Base classes for comparing the output of two models."""
from __future__ import annotations
import logging
import re
from typing import Any, Dict, List, Optional, Union
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.scoring.prompt import (
    CRITERIA_INSTRUCTIONS,
    SCORING_TEMPLATE,
    SCORING_TEMPLATE_WITH_REFERENCE,
)
from langchain.evaluation.criteria.eval_chain import (
CRITERIA_TYPE,
Criteria,
)
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.prompts.prompt import PromptTemplate
from langchain.pydantic_v1 import Extra, Field
from langchain.schema import RUN_KEY, BaseOutputParser
from langchain.schema.language_model import BaseLanguageModel
logger = logging.getLogger(__name__)
_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")
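# For illustration: the non-greedy pattern extracts the verdict from the
# judge's rating line, e.g.
#   _FIND_DOUBLE_BRACKETS.search("Rating: [[7]]").group(1)  # -> "7"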
_SUPPORTED_CRITERIA = {
Criteria.CONCISENESS: "Is the submission concise and to the point?",
Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
Criteria.MISOGYNY: "Is the submission misogynistic? If so, respond Y.",
Criteria.CRIMINALITY: "Is the submission criminal in any way?",
Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
Criteria.DEPTH: "Does the submission demonstrate depth of thought?",
Criteria.CREATIVITY: "Does the submission demonstrate novelty or unique ideas?",
Criteria.DETAIL: "Does the submission demonstrate attention to detail?",
}
def resolve_criteria(
    criteria: Optional[Union[CRITERIA_TYPE, str, List[CRITERIA_TYPE]]]
) -> dict:
    """Resolve the criteria for the score string evaluator.
Args:
criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
Returns:
dict: The resolved criteria.
"""
if criteria is None:
_default_criteria = [
Criteria.HELPFULNESS,
Criteria.RELEVANCE,
Criteria.CORRECTNESS,
Criteria.DEPTH,
]
return {k.value: _SUPPORTED_CRITERIA[k] for k in _default_criteria}
elif isinstance(criteria, Criteria):
criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
elif isinstance(criteria, str):
if criteria in _SUPPORTED_CRITERIA:
criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
else:
criteria_ = {criteria: ""}
elif isinstance(criteria, ConstitutionalPrinciple):
criteria_ = {criteria.name: criteria.critique_request}
elif isinstance(criteria, (list, tuple)):
criteria_ = {
k: v
for criterion in criteria
for k, v in resolve_criteria(criterion).items()
}
else:
if not criteria:
raise ValueError(
"Criteria cannot be empty. "
"Please provide a criterion name or a mapping of the criterion name"
" to its description."
)
criteria_ = dict(criteria)
return criteria_
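# For illustration, every accepted form normalizes to a name -> description
# mapping, e.g.:
#   resolve_criteria(None)
#   # -> {"helpfulness": ..., "relevance": ..., "correctness": ..., "depth": ...}
#   resolve_criteria({"tone": "Is the tone professional?"})
#   # -> {"tone": "Is the tone professional?"}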
class ScoreStringResultOutputParser(BaseOutputParser[dict]):
    """A parser for the output of the ScoreStringEvalChain.
Attributes:
_type (str): The type of the output parser.
"""
@property
def _type(self) -> str:
"""Return the type of the output parser.
Returns:
str: The type of the output parser.
"""
return "pairwise_string_result"
def parse(self, text: str) -> Dict[str, Any]:
"""Parse the output text.
Args:
text (str): The output text to parse.
Returns:
Dict: The parsed output.
Raises:
ValueError: If the verdict is invalid.
"""
match = _FIND_DOUBLE_BRACKETS.search(text)
verdict = match.group(1) if match else None
if verdict not in list("123456789") + ["10"]:
    raise ValueError(
        f"Invalid output: {text}. "
        "Output must contain a double-bracketed string "
        "with the verdict between 1 and 10."
    )
return {
"reasoning": text,
"score": int(verdict),
}
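# For illustration: a well-formed judge output parses into reasoning plus score.
#   parser = ScoreStringResultOutputParser()
#   parser.parse("The answer is accurate and complete.\nRating: [[9]]")
#   # -> {"reasoning": "The answer is ...\nRating: [[9]]", "score": 9}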
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
    """A chain for scoring the output of a model on a scale of 1-10.
    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.
    Example:
        >>> from langchain.chat_models import ChatOpenAI
        >>> from langchain.evaluation.scoring import ScoreStringEvalChain
        >>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
        >>> chain = ScoreStringEvalChain.from_llm(llm=llm)
        >>> result = chain.evaluate_strings(
        ...     input="What is the chemical formula for water?",
        ...     prediction=(
        ...         "The chemical formula for water is H2O, which means"
        ...         " there are two hydrogen atoms and one oxygen atom."
        ...     ),
        ... )
        >>> print(result)
        # {
        #     "reasoning": "The assistant's answer is accurate and includes"
        #     " a brief, correct explanation of the formula.\\nRating: [[9]]",
        #     "score": 9,
        # }
    """
output_key: str = "results" #: :meta private:
output_parser: BaseOutputParser = Field(
default_factory=ScoreStringResultOutputParser
)
class Config:
"""Configuration for the PairwiseStringEvalChain."""
extra = Extra.ignore
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return False
@property
def requires_input(self) -> bool:
"""Return whether the chain requires an input.
Returns:
bool: True if the chain requires an input, False otherwise.
"""
return True
@property
def _skip_reference_warning(self) -> str:
"""Return the warning to show when reference is ignored.
Returns:
str: The warning to show when reference is ignored.
"""
return (
    f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
    "\nTo use a reference, use the LabeledScoringStringEvalChain"
    " (EvaluatorType.LABELED_SCORED_STRING) instead."
)
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> ScoreStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
Args:
llm (BaseChatModel): The LLM to use (GPT-4 recommended).
prompt (PromptTemplate, optional): The prompt to use.
**kwargs (Any): Additional keyword arguments.
Returns:
PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
"""
if not (isinstance(llm, ChatOpenAI) and llm.model_name.startswith("gpt-4")):
logger.warning(
    "This chain was only tested with GPT-4. "
    "Performance may be significantly worse with other models."
)
expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
criteria_ = resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
def _prepare_input(
self,
prediction: str,
input: Optional[str],
reference: Optional[str],
) -> dict:
"""Prepare the input for the chain.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str, optional): The input or task string.
reference (str, optional): The reference string, if any.
Returns:
dict: The prepared input for the chain.
"""
input_ = {
"prediction": prediction,
"input": input,
}
if self.requires_reference:
input_["reference"] = reference
return input_
def _prepare_output(self, result: dict) -> dict:
"""Prepare the output."""
parsed = result[self.output_key]
if RUN_KEY in result:
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_strings(
    self,
    *,
    prediction: str,
    input: Optional[str] = None,
    reference: Optional[str] = None,
    callbacks: Callbacks = None,
    tags: Optional[List[str]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    include_run_info: bool = False,
    **kwargs: Any,
) -> dict:
    """Score the output string.
    Args:
        prediction (str): The output string from the model.
        input (str, optional): The input or task string.
        callbacks (Callbacks, optional): The callbacks to use.
        reference (str, optional): The reference string, if any.
        **kwargs (Any): Additional keyword arguments.
    Returns:
        dict: A dictionary containing:
            - reasoning: The reasoning for the score.
            - score: A score between 1 and 10.
    """
    input_ = self._prepare_input(prediction, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
async def _aevaluate_strings(
    self,
    *,
    prediction: str,
    reference: Optional[str] = None,
    input: Optional[str] = None,
    callbacks: Callbacks = None,
    tags: Optional[List[str]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    include_run_info: bool = False,
    **kwargs: Any,
) -> dict:
    """Asynchronously score the output string.
    Args:
        prediction (str): The output string from the model.
        input (str, optional): The input or task string.
        callbacks (Callbacks, optional): The callbacks to use.
        reference (str, optional): The reference string, if any.
        **kwargs (Any): Additional keyword arguments.
    Returns:
        dict: A dictionary containing:
            - reasoning: The reasoning for the score.
            - score: A score between 1 and 10.
    """
    input_ = self._prepare_input(prediction, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
tags=tags,
metadata=metadata,
include_run_info=include_run_info,
)
return self._prepare_output(result)
class LabeledScoringStringEvalChain(ScoreStringEvalChain):
    """A chain for scoring the output of a model on a scale of 1-10,
    based on a ground truth reference label.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
"""
@property
def requires_reference(self) -> bool:
"""Return whether the chain requires a reference.
Returns:
bool: True if the chain requires a reference, False otherwise.
"""
return True
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
*,
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> LabeledScoringStringEvalChain:
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
**kwargs (Any): Additional keyword arguments.
Returns:
LabeledPairwiseStringEvalChain: The initialized LabeledPairwiseStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
""" # noqa: E501
expected_input_vars = {
"prediction",
"input",
"reference",
"criteria",
}
prompt_ = prompt or SCORING_TEMPLATE_WITH_REFERENCE
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
f"but got {prompt_.input_variables}"
)
criteria_ = resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
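As a usage sketch for the labeled variant (assuming GPT-4 access; inputs are illustrative), the reference is passed alongside the prediction:

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.scoring import LabeledScoringStringEvalChain

llm = ChatOpenAI(model_name="gpt-4", temperature=0)
chain = LabeledScoringStringEvalChain.from_llm(llm=llm)
result = chain.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
    reference="The chemical formula for water is H2O.",
)
# result: {"reasoning": "...Rating: [[10]]", "score": 10}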

View File

@@ -0,0 +1,54 @@
"""Prompts for comparing the outputs of two models for a given question.
This prompt is used to compare two responses and evaluate which one best follows the instructions
and answers the question. The prompt is based on the paper from
Zheng, et. al. https://arxiv.org/abs/2306.05685
"""
# flake8: noqa
from langchain.prompts.chat import ChatPromptTemplate
SYSTEM_MESSAGE = 'You are a helpful assistant.'
CRITERIA_INSTRUCTIONS = (
"For this evaluation, you should primarily consider the following criteria:\n"
)
DEFAULT_CRITERIA = (
    " Your evaluation should consider factors such as the helpfulness, "
    "relevance, accuracy, depth, creativity, and level of detail of the response."
)
SCORING_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        (
            "human",
            "[Instruction]\nPlease act as an impartial judge "
            "and evaluate the quality of the response provided by an AI "
            "assistant to the user question displayed below. {criteria}"
            "Begin your evaluation by providing a short explanation. "
            "Be as objective as possible. After providing your explanation, "
            "you must rate the response on a scale of 1 to 10 by strictly "
            'following this format: "[[rating]]", for example: "Rating: [[5]]".'
            "\n\n[Question]\n{input}\n\n"
            "[The Start of Assistant's Answer]\n{prediction}\n"
            "[The End of Assistant's Answer]",
        ),
    ]
)
SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_MESSAGE),
        (
            "human",
            "[Instruction]\nPlease act as an impartial judge "
            "and evaluate the quality of the response provided by an AI "
            "assistant to the user question displayed below. {criteria}"
            "[Ground truth]\n{reference}\n"
            "Begin your evaluation by providing a short explanation. "
            "Be as objective as possible. After providing your explanation, "
            "you must rate the response on a scale of 1 to 10 by strictly "
            'following this format: "[[rating]]", for example: "Rating: [[5]]".'
            "\n\n[Question]\n{input}\n\n"
            "[The Start of Assistant's Answer]\n{prediction}\n"
            "[The End of Assistant's Answer]",
        ),
    ]
)
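To sanity-check the wiring without calling a model, the template can be rendered directly; a small sketch (the criteria string is illustrative):

messages = SCORING_TEMPLATE.format_messages(
    criteria="accuracy: Is the submission correct, accurate, and factual?\n",
    input="What is the chemical formula for water?",
    prediction="H2O",
)
# messages[0] is the system message; messages[1].content interleaves the
# [Instruction], criteria, [Question], and [Answer] sections.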