doc string changes

CG80499
2023-09-27 13:58:27 +00:00
parent eb648dfdd3
commit 7edcb50aa2
6 changed files with 76 additions and 80 deletions

View File

@@ -159,7 +159,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
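
For orientation, here is a minimal sketch (not part of the commit) of the full comparison call that this docstring hunk truncates; it assumes an OpenAI API key is configured and GPT-4 access is available.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.comparison import PairwiseStringEvalChain

# Illustrative usage: the pairwise evaluator compares two candidate answers
# to the same input and returns the judge's preference.
llm = ChatOpenAI(temperature=0, model_name="gpt-4")
chain = PairwiseStringEvalChain.from_llm(llm=llm)
result = chain.evaluate_string_pairs(
    input="What is the chemical formula for water?",
    prediction="H2O",
    prediction_b=(
        "The chemical formula for water is H2O, which means"
        " there are two hydrogen atoms and one oxygen atom."
    ),
)
print(result["value"])  # typically "A" or "B" (or None for a tie)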

View File

@@ -22,12 +22,15 @@ from langchain.evaluation.parsing.base import (
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
from langchain.evaluation.scoring.eval_chain import (
LabeledScoreStringEvalChain,
ScoreStringEvalChain,
)
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistanceEvalChain,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.evaluation.scoring.eval_chain import LabeledScoringStringEvalChain, ScoreStringEvalChain
def load_dataset(uri: str) -> List[Dict]:
@@ -71,9 +74,9 @@ _EVALUATOR_MAP: Dict[
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
EvaluatorType.SCORED_STRING: ScoreStringEvalChain,
EvaluatorType.SCORE_STRING: ScoreStringEvalChain,
EvaluatorType.LABELED_PAIRWISE_STRING: LabeledPairwiseStringEvalChain,
EvaluatorType.LABELED_SCORED_STRING: LabeledScoringStringEvalChain,
EvaluatorType.LABELED_SCORE_STRING: LabeledScoreStringEvalChain,
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
EvaluatorType.CRITERIA: CriteriaEvalChain,
EvaluatorType.LABELED_CRITERIA: LabeledCriteriaEvalChain,
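
A minimal sketch (not part of the diff) of how the renamed map entries are typically reached through langchain's load_evaluator helper; it assumes an OpenAI key is available.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator

llm = ChatOpenAI(temperature=0, model_name="gpt-4")

# EvaluatorType.SCORE_STRING resolves to ScoreStringEvalChain via _EVALUATOR_MAP.
scorer = load_evaluator(EvaluatorType.SCORE_STRING, llm=llm)

# EvaluatorType.LABELED_SCORE_STRING resolves to LabeledScoreStringEvalChain,
# which additionally takes a ground-truth reference at evaluation time.
labeled_scorer = load_evaluator(EvaluatorType.LABELED_SCORE_STRING, llm=llm)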

View File

@@ -31,12 +31,13 @@ class EvaluatorType(str, Enum):
PAIRWISE_STRING = "pairwise_string"
"""The pairwise string evaluator, which predicts the preferred prediction from
between two models."""
SCORED_STRING = "scored_string"
"""The scored string evaluator, which gives a score between 1 and 10 to a prediction."""
SCORE_STRING = "scored_string"
"""The scored string evaluator, which gives a score between 1 and 10
to a prediction."""
LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
"""The labeled pairwise string evaluator, which predicts the preferred prediction
from between two models based on a ground truth reference label."""
LABELED_SCORED_STRING = "labeled_scored_string"
LABELED_SCORE_STRING = "labeled_scored_string"
"""The labeled scored string evaluator, which gives a score between 1 and 10
to a prediction based on a ground truth reference label."""
AGENT_TRAJECTORY = "trajectory"
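
An illustrative check (not part of the diff): only the enum member names change here; the underlying string values stay the same, so string-based lookups keep working.

from langchain.evaluation.schema import EvaluatorType

# EvaluatorType subclasses str, so members compare equal to their values.
assert EvaluatorType.SCORE_STRING == "scored_string"
assert EvaluatorType.LABELED_SCORE_STRING == "labeled_scored_string"
assert EvaluatorType("scored_string") is EvaluatorType.SCORE_STRING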

View File

@@ -6,29 +6,25 @@ criteria and or a reference answer.
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> from langchain.evaluation.scoring import ScoreStringEvalChain
>>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
>>> chain = ScoreStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# " by explaining what the formula means.\\n[[B]]"
# "score": 8,
# "comment": "The response accurately states "
# "that the chemical formula for water is H2O."
# "However, it does not provide an explanation of what the formula means."
# }
"""
from langchain.evaluation.comparison.eval_chain import (
LabeledPairwiseStringEvalChain,
PairwiseStringEvalChain,
from langchain.evaluation.scoring.eval_chain import (
LabeledScoreStringEvalChain,
ScoreStringEvalChain,
)
__all__ = ["PairwiseStringEvalChain", "LabeledPairwiseStringEvalChain"]
__all__ = ["ScoreStringEvalChain", "LabeledScoreStringEvalChain"]

View File

@@ -8,17 +8,19 @@ from typing import Any, Dict, List, Optional, Union
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.chat_models.azure_openai import AzureChatOpenAI
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.comparison.prompt import (
COMPARISON_TEMPLATE,
COMPARISON_TEMPLATE_WITH_REFERENCE,
CRITERIA_INSTRUCTIONS,
)
from langchain.evaluation.criteria.eval_chain import (
CRITERIA_TYPE,
Criteria,
)
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.evaluation.scoring.prompt import (
CRITERIA_INSTRUCTIONS,
DEFAULT_CRITERIA,
SCORING_TEMPLATE,
SCORING_TEMPLATE_WITH_REFERENCE,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.pydantic_v1 import Extra, Field
from langchain.schema import RUN_KEY, BaseOutputParser
@@ -93,7 +95,7 @@ def resolve_pairwise_criteria(
class ScoreStringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the PairwiseStringEvalChain.
"""A parser for the output of the ScoreStringEvalChain.
Attributes:
_type (str): The type of the output parser.
@@ -134,7 +136,7 @@ class ScoreStringResultOutputParser(BaseOutputParser[dict]):
"Output must contain a double bracketed string\
with the verdict between 1 and 10."
)
return {
"reasoning": text,
"score": int(verdict),
@@ -149,24 +151,20 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
Example:
>>> from langchain.chat_models import ChatOpenAI
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
>>> llm = ChatOpenAI(temperature=0)
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
>>> from langchain.evaluation.scoring import ScoreStringEvalChain
>>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
>>> chain = ScoreStringEvalChain.from_llm(llm=llm)
>>> result = chain.evaluate_string_pairs(
... input = "What is the chemical formula for water?",
... prediction = "H2O",
... prediction_b = (
... "The chemical formula for water is H2O, which means"
... " there are two hydrogen atoms and one oxygen atom."
... reference = "The chemical formula for water is H2O.",
... )
>>> print(result["text"])
# {
# "value": "B",
# "comment": "Both responses accurately state"
# " that the chemical formula for water is H2O."
# " However, Response B provides additional information"
# " by explaining what the formula means.\\n[[B]]"
# "score": 8,
# "comment": "The response accurately states "
# "that the chemical formula for water is H2O."
# "However, it does not provide an explanation of what the formula means."
# }
"""
@@ -177,7 +175,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
class Config:
"""Configuration for the PairwiseStringEvalChain."""
"""Configuration for the ScoreStringEvalChain."""
extra = Extra.ignore
@@ -211,8 +209,8 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
"""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
"\nTo use a reference, use the LabeledScoringStringEvalChain instead."
" (EvaluatorType.LABELED_PAIRWISE_STRING) instead."
"\nTo use a reference, use the LabeledScoreStringEvalChain instead."
" (EvaluatorType.LABELED_SCORE_STRING) instead."
)
@classmethod
@@ -224,7 +222,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> ScoreStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
"""Initialize the ScoreStringEvalChain from an LLM.
Args:
llm (BaseChatModel): The LLM to use (GPT-4 recommended).
@@ -238,14 +236,17 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
ValueError: If the input variables are not as expected.
"""
if not (isinstance(llm, ChatOpenAI) and llm.model_name.startswith("gpt-4")):
if not (
isinstance(llm, (ChatOpenAI, AzureChatOpenAI))
and llm.model_name.startswith("gpt-4")
):
logger.warning(
"This chain was only tested with GPT-4. \
Performance may be significantly worse with other models."
)
expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
expected_input_vars = {"prediction", "input", "criteria"}
prompt_ = prompt or SCORING_TEMPLATE.partial(reference="")
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@@ -253,7 +254,9 @@ Performance may be significantly worse with other models."
)
criteria_ = resolve_pairwise_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
criteria_str = (
CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else DEFAULT_CRITERIA
)
return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
def _prepare_input(
@@ -289,11 +292,10 @@ Performance may be significantly worse with other models."
parsed[RUN_KEY] = result[RUN_KEY]
return parsed
def _evaluate_string_pairs(
def _evaluate_strings(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
@@ -302,7 +304,7 @@ Performance may be significantly worse with other models."
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Evaluate whether output A is preferred to output B.
"""Score the output string.
Args:
prediction (str): The output string to score.
@@ -317,7 +319,7 @@ Performance may be significantly worse with other models."
- score: A score between 1 and 10.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
input_ = self._prepare_input(prediction, input, reference)
result = self(
inputs=input_,
callbacks=callbacks,
@@ -331,7 +333,6 @@ Performance may be significantly worse with other models."
self,
*,
prediction: str,
prediction_b: str,
reference: Optional[str] = None,
input: Optional[str] = None,
callbacks: Callbacks = None,
@@ -340,11 +341,10 @@ Performance may be significantly worse with other models."
include_run_info: bool = False,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate whether output A is preferred to output B.
"""Asynchronously score the output string.
Args:
prediction (str): The output string to score.
prediction_b (str): The output string from the second model.
input (str, optional): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
@@ -356,7 +356,7 @@ Performance may be significantly worse with other models."
- score: A score between 1 and 10.
"""
input_ = self._prepare_input(prediction, prediction_b, input, reference)
input_ = self._prepare_input(prediction, input, reference)
result = await self.acall(
inputs=input_,
callbacks=callbacks,
@@ -367,10 +367,8 @@ Performance may be significantly worse with other models."
return self._prepare_output(result)
class LabeledScoringStringEvalChain(ScoreStringEvalChain):
"""A chain for comparing two outputs, such as the outputs
of two models, prompts, or outputs of a single model on similar inputs,
with labeled preferences.
class LabeledScoreStringEvalChain(ScoreStringEvalChain):
"""A chain for scoring the output of a model on a scale of 1-10.
Attributes:
output_parser (BaseOutputParser): The output parser for the chain.
@@ -395,8 +393,8 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
prompt: Optional[PromptTemplate] = None,
criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
**kwargs: Any,
) -> LabeledScoringStringEvalChain:
"""Initialize the LabeledPairwiseStringEvalChain from an LLM.
) -> LabeledScoreStringEvalChain:
"""Initialize the LabeledScoreStringEvalChain from an LLM.
Args:
llm (BaseLanguageModel): The LLM to use.
@@ -405,7 +403,7 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
**kwargs (Any): Additional keyword arguments.
Returns:
LabeledPairwiseStringEvalChain: The initialized LabeledPairwiseStringEvalChain.
LabeledScoreStringEvalChain: The initialized LabeledScoreStringEvalChain.
Raises:
ValueError: If the input variables are not as expected.
@@ -417,7 +415,7 @@ class LabeledScoringStringEvalChain(ScoreStringEvalChain):
"reference",
"criteria",
}
prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
prompt_ = prompt or SCORING_TEMPLATE_WITH_REFERENCE
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "

View File

@@ -1,54 +1,52 @@
"""Prompts for comparing the outputs of two models for a given question.
"""Prompts for scoring the outputs of a models for a given question.
This prompt is used to compare two responses and evaluate which one best follows the instructions
This prompt is used to score the response and evaluate how well it follows the instructions
and answers the question. The prompt is based on the paper from
Zheng, et al. https://arxiv.org/abs/2306.05685
"""
# flake8: noqa
from langchain.prompts.chat import ChatPromptTemplate
SYSTEM_MESSAGE = 'You are a helpful assistant.'
SYSTEM_MESSAGE = "You are a helpful assistant."
CRITERIA_INSTRUCTIONS = (
"For this evaluation, you should primarily consider the following criteria:\n"
)
DEFAULT_CRITERIA = (
" Your evaluation \
DEFAULT_CRITERIA = " Your evaluation \
should consider factors such as the helpfulness, relevance, accuracy, \
depth, creativity, and level of detail of the response."
)
COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
SCORING_TEMPLATE = ChatPromptTemplate.from_messages(
[
("system", SYSTEM_MESSAGE),
(
"human",
"[Instruction]\nPlease act as an impartial judge \
'[Instruction]\nPlease act as an impartial judge \
and evaluate the quality of the response provided by an AI \
assistant to the user question displayed below. {criteria}Begin your evaluation \
by providing a short explanation. Be as objective as possible. \
After providing your explanation, you must rate the response on a scale of 1 to 10 \
by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n\
[The End of Assistant's Answer]",
by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
[Question]\n{question}\n\n[The Start of Assistant\'s Answer]\n{answer}\n\
[The End of Assistant\'s Answer]',
),
]
)
COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
[
("system", SYSTEM_MESSAGE),
(
"human",
"[Instruction]\nPlease act as an impartial judge \
'[Instruction]\nPlease act as an impartial judge \
and evaluate the quality of the response provided by an AI \
assistant to the user question displayed below. {criteria}{reference}Begin your evaluation \
by providing a short explanation. Be as objective as possible. \
After providing your explanation, you must rate the response on a scale of 1 to 10 \
by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n\
[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n\
[The End of Assistant's Answer]",
by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
[Question]\n{question}\n\n[The Start of Assistant\'s Answer]\n{answer}\n\
[The End of Assistant\'s Answer]',
),
]
)