Compare commits

...

5 Commits

Author               SHA1        Message                                        Date
William Fu-Hinthorn  96b6009652  Merge branch 'master' into wfh/criteria_strat  2023-08-14 20:54:22 -07:00
William Fu-Hinthorn  766aff97c3  update                                         2023-08-14 16:54:43 -07:00
William Fu-Hinthorn  72ca44768c  spelling                                       2023-07-28 18:27:40 -07:00
William Fu-Hinthorn  8f602ee7ef  Add other strategies for evaluation            2023-07-28 17:33:58 -07:00
William Fu-Hinthorn  18a4e8fe56  Criteria strategy                              2023-07-28 12:31:31 -07:00
4 changed files with 218 additions and 26 deletions

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import re
from enum import Enum
from typing import Any, Dict, List, Mapping, Optional, Union
@@ -8,7 +9,7 @@ from pydantic_v1 import Extra, Field
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
from langchain.evaluation.criteria.prompt import STRATEGY_TYPE, get_prompt_template
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY, BaseOutputParser, BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
@@ -59,7 +60,7 @@ _SUPPORTED_CRITERIA = {
class CriteriaResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the CriteriaEvalChain."""
"""A parser for the output of the binary strategy of the CriteriaEvalChain."""
@property
def _type(self) -> str:
@@ -74,17 +75,106 @@ class CriteriaResultOutputParser(BaseOutputParser[dict]):
Returns:
Dict: The parsed output.
"""
verdict_search = re.search(r"(?<=\s)[YN](?=[\s\W]*$)", text, re.IGNORECASE)
if verdict_search:
verdict = verdict_search.group().upper()
reasoning_index = verdict_search.start()
reasoning = text[:reasoning_index].strip()
else:
reasoning = text.strip()
verdict = None
score = 1 if verdict == "Y" else (0 if verdict == "N" else None)
return {
"reasoning": reasoning,
"value": verdict,
"score": score,
}
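
A minimal sketch of how the rewritten binary parser behaves, assuming the CriteriaResultOutputParser class exactly as shown in this hunk; the completion strings are illustrative only:

parser = CriteriaResultOutputParser()
# The regex looks for a trailing standalone "Y"/"N"; everything before it is reasoning.
result = parser.parse("The submission is concise and answers the question.\nY")
assert result == {
    "reasoning": "The submission is concise and answers the question.",
    "value": "Y",
    "score": 1,
}
# If no trailing verdict is found, value and score come back as None.
assert parser.parse("No clear verdict here.")["score"] is None
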
class ScoringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the scoring strategy of the CriteriaEvalChain."""
@property
def _type(self) -> str:
return "scoring_result"
def parse(self, text: str) -> Any:
"""Parse the output text.
Args:
text (str): The output text to parse.
Returns:
Any: The parsed output.
"""
parsed = text.strip().rsplit("\n", maxsplit=1)
if len(parsed) == 1:
reasoning = ""
verdict = parsed[0]
score = parsed[0]
else:
reasoning, verdict = parsed
score = 1 if verdict.upper() == "Y" else (0 if verdict.upper() == "N" else None)
reasoning, score = parsed
score_ = int(score) if score.isdigit() and 0 <= int(score) <= 9 else None
scaled_score = score_ / 10 if score_ is not None else score_
return {
"reasoning": reasoning.strip(),
"value": verdict,
"score": score,
"value": score,
"score": scaled_score,
}
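
Because this hunk interleaves removed lines of the old verdict-based parser with the new class, here is a sketch of what the new ScoringResultOutputParser is expected to return for a typical completion (example strings assumed for illustration):

parser = ScoringResultOutputParser()
result = parser.parse("Each criterion is addressed, with minor omissions.\n7")
# The raw 0-9 rating is kept as the value; the score is scaled to the 0-1 range.
assert result == {
    "reasoning": "Each criterion is addressed, with minor omissions.",
    "value": "7",
    "score": 0.7,
}
# Anything that is not a single digit in [0, 9] yields score=None.
assert parser.parse("Inconclusive output")["score"] is None
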
class ConfidenceResultOutputParser(BaseOutputParser[dict]):
"""A parser for output of the confidence strategy of the CriteriaEvalChain.
Used for Likert-scale like scores.
"""
@property
def _type(self) -> str:
return "confidence_result"
def parse(self, text: str) -> Any:
"""Parse the output text.
Args:
text (str): The output text to parse.
Returns:
Any: The parsed output.
"""
confidence_scale = [
"extremely confident no",
"very confident no",
"slightly confident no",
"somewhat confident no",
"unsure",
"somewhat confident yes",
"slightly confident yes",
"very confident yes",
"extremely confident yes",
]
confidence_pattern = (
r"\b(?:" + "|".join(map(re.escape, confidence_scale)) + r")\b"
)
confidence_search = re.findall(confidence_pattern, text, re.IGNORECASE)
confidence = confidence_search[-1] if confidence_search else None
reasoning_index = text.rfind(confidence) if confidence else len(text)
reasoning = text[:reasoning_index].strip()
to_match = confidence.lower() if confidence else None
score = (
confidence_scale.index(to_match) if to_match in confidence_scale else None
)
scaled_score = (
score / (len(confidence_scale) - 1) if score is not None else None
)
return {
"reasoning": reasoning,
"value": confidence,
"score": scaled_score,
}
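
A sketch of the confidence parser on an illustrative completion; the 0-1 score is the index on the nine-point scale above divided by 8:

parser = ConfidenceResultOutputParser()
result = parser.parse(
    "The reference and the submission agree on every point.\nVery confident yes"
)
assert result["value"] == "Very confident yes"
assert result["score"] == 7 / 8  # index 7 on the 9-point scale, scaled to 0-1
assert result["reasoning"] == "The reference and the submission agree on every point."
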
@@ -243,10 +333,14 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
cls,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria"}
prompt_ = prompt or PROMPT
prompt_ = prompt or get_prompt_template(
strategy=strategy, requires_references=False
)
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@@ -254,6 +348,28 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
return prompt_
@classmethod
def _resolve_parser(
cls,
output_parser: Optional[BaseOutputParser] = None,
strategy: STRATEGY_TYPE = "binary",
) -> BaseOutputParser:
if output_parser is not None:
return output_parser
parser_map = {
"binary": CriteriaResultOutputParser(),
"score": ScoringResultOutputParser(),
"confidence": ConfidenceResultOutputParser(),
}
if strategy not in parser_map:
raise ValueError(
f"Evaluation strategy {strategy} not recognized."
"\nPlease select one of 'binary', 'score', or 'confidence'"
" or specify your own output_parser when loading"
" the criteria eval chain"
)
return parser_map[strategy]
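
A small sketch of the resolution rule implemented by _resolve_parser above: an explicit output_parser wins, otherwise the strategy selects a default (class and method names as in this diff):

assert isinstance(
    CriteriaEvalChain._resolve_parser(strategy="score"), ScoringResultOutputParser
)
custom = CriteriaResultOutputParser()
assert CriteriaEvalChain._resolve_parser(output_parser=custom, strategy="confidence") is custom
# An unrecognized strategy raises a ValueError listing the supported options.
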
@classmethod
def resolve_criteria(
cls,
@@ -289,6 +405,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
output_parser: Optional[BaseOutputParser] = None,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `CriteriaEvalChain` instance from an llm and criteria.
@@ -305,6 +423,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt template will be used.
strategy: str ("binary", "score", "confidence") - the scoring strategy.
Default to "binary" (Yes/No)
output_parser: Optional[BaseOutputParser] - the output parser to extract
the score and reasoning from the LLM output.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
@@ -330,7 +452,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria=criteria,
)
"""
prompt_ = cls._resolve_prompt(prompt)
prompt_ = cls._resolve_prompt(prompt, strategy=strategy)
parser = cls._resolve_parser(output_parser=output_parser, strategy=strategy)
if criteria == Criteria.CORRECTNESS:
raise ValueError(
"Correctness should not be used in the reference-free"
@@ -345,6 +468,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
llm=llm,
prompt=prompt_,
criterion_name="-".join(criteria_),
output_parser=parser,
**kwargs,
)
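
A hedged usage sketch of the new strategy argument to from_llm; ChatOpenAI and the "conciseness" criterion are assumed purely for illustration, and the import paths reflect the langchain layout at the time of this change:

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain

llm = ChatOpenAI(temperature=0)
evaluator = CriteriaEvalChain.from_llm(
    llm=llm,
    criteria="conciseness",
    strategy="score",  # "binary" (default), "score", or "confidence"
)
result = evaluator.evaluate_strings(
    prediction="Paris is the capital of France.",
    input="What is the capital of France?",
)
# With strategy="score", result["score"] is the model's 0-9 rating scaled to 0-1.
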
@@ -491,10 +615,14 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
cls,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria", "reference"}
prompt_ = prompt or PROMPT_WITH_REFERENCES
prompt_ = prompt or get_prompt_template(
strategy=strategy, requires_references=True
)
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@@ -509,6 +637,8 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
output_parser: Optional[BaseOutputParser] = None,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `LabeledCriteriaEvalChain` instance from an llm and criteria.
@@ -525,6 +655,8 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt will be used.
strategy : str, default="binary"
The scoring strategy to use: "binary" (Yes/No), "score" (0-9), or "confidence".
output_parser : Optional[BaseOutputParser], default=None
The output parser used to extract the score and reasoning from the LLM output.
If not provided, a parser matching the strategy is chosen.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
@@ -550,13 +682,15 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
criteria=criteria,
)
"""
prompt = cls._resolve_prompt(prompt)
prompt_ = cls._resolve_prompt(prompt, strategy=strategy)
parser = cls._resolve_parser(output_parser=output_parser, strategy=strategy)
criteria_ = cls.resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
prompt_ = prompt.partial(criteria=criteria_str)
prompt_ = prompt_.partial(criteria=criteria_str)
return cls(
llm=llm,
prompt=prompt_,
criterion_name="-".join(criteria_),
output_parser=parser,
**kwargs,
)
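
And the labeled (reference-based) variant with the confidence strategy; again an illustrative sketch rather than a canonical recipe, reusing the llm from the previous sketch:

from langchain.evaluation.criteria.eval_chain import LabeledCriteriaEvalChain

labeled_evaluator = LabeledCriteriaEvalChain.from_llm(
    llm=llm,
    criteria="correctness",
    strategy="confidence",
)
result = labeled_evaluator.evaluate_strings(
    prediction="The answer is 42.",
    input="What is 6 * 7?",
    reference="42",
)
# result["value"] holds the confidence phrase; result["score"] is its 0-1 scaling.
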

View File

@@ -1,9 +1,37 @@
# flake8: noqa
# Credit to https://github.com/openai/evals/tree/main
from typing import Literal, Union
from langchain.prompts import PromptTemplate
template = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
BINARY_STRATEGY = """Does the submission meet the criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter "Y" or "N" again by itself on a new line."""
# Score on a scale from 1 to 10
SCORING_STRATEGY = """How well does the submission meet the criteria? First, write out in a step by step manner your\
reasoning about each criterion to ensure that your conclusion is accurate. Avoid simply stating the scores at the outset.\
After evaluating each criterion, assign a score from 0 to 9 where 0 means the submission does not meet the\
criteria at all or the criteria does not describe the submission at all and 9 means the submission fully meets\
the criteria or the criteria perfectly describes the submission. Print the numeric score\
(from 0 to 9, without quotes or punctuation) on its own line. At the end, repeat just the numeric score again by itself on a new line."""
CONFIDENCE_STRATEGY = """How confident are you that the submission meets the criteria? Think carefully about each\
criterion and your confidence that the submission complies with the criteria. "Extremely confident no" means you are\
certain the submission does not meet the criteria, or the criteria in no way describes the submission.
Reason by thinking step by step, then assign your confidence level using the following scale:
1. [[Extremely confident no]]
2. [[Very confident no]]
3. [[Slightly confident no]]
4. [[Somewhat confident no]]
5. [[Unsure]]
6. [[Somewhat confident yes]]
7. [[Slightly confident yes]]
8. [[Very confident yes]]
9. [[Extremely confident yes]]
Then print the corresponding confidence level in words on its own line. At the end, repeat the confidence level again, in words as it is shown above, by itself on a new line."""
_TEMPLATE = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
@@ -13,13 +41,12 @@ template = """You are assessing a submitted answer on a given task or input base
[Criteria]: {criteria}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line."""
"""
PROMPT = PromptTemplate(
input_variables=["input", "output", "criteria"], template=template
)
PROMPT = PromptTemplate.from_template(_TEMPLATE + BINARY_STRATEGY)
template = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
_LABELED_TEMPLATE = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
@@ -31,8 +58,32 @@ template = """You are assessing a submitted answer on a given task or input base
[Reference]: {reference}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line."""
"""
PROMPT_WITH_REFERENCES = PromptTemplate(
input_variables=["input", "output", "criteria", "reference"], template=template
PROMPT_WITH_REFERENCES = PromptTemplate.from_template(
_LABELED_TEMPLATE + BINARY_STRATEGY
)
STRATEGY_TYPE = Union[Literal["binary"], Literal["score"], Literal["confidence"]]
def get_prompt_template(
requires_references: bool,
strategy: Union[
Literal["binary"], Literal["score"], Literal["confidence"]
] = "binary",
) -> PromptTemplate:
"""Get the prompt template for the specified strategy and model type."""
strat_map = {
"binary": BINARY_STRATEGY,
"score": SCORING_STRATEGY,
"confidence": CONFIDENCE_STRATEGY,
}
if strategy not in strat_map:
raise ValueError(
f"Unrecognized evaluation strategy {strategy}"
f"\nMust be one of {list(strat_map.keys())}"
)
template = _LABELED_TEMPLATE if requires_references else _TEMPLATE
suffix = strat_map[strategy]
return PromptTemplate.from_template(template + suffix + "\nReasoning:")
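
A quick sketch of the new prompt factory; the variable sets follow from the _TEMPLATE and _LABELED_TEMPLATE strings above:

prompt = get_prompt_template(requires_references=False, strategy="score")
assert set(prompt.input_variables) == {"input", "output", "criteria"}

labeled_prompt = get_prompt_template(requires_references=True, strategy="confidence")
assert set(labeled_prompt.input_variables) == {"input", "output", "criteria", "reference"}
# An unrecognized strategy raises a ValueError listing the supported names.
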

View File

@@ -7,6 +7,7 @@ from pydantic_v1 import BaseModel, Field
from langchain.embeddings.base import Embeddings
from langchain.evaluation.criteria.eval_chain import CRITERIA_TYPE
from langchain.evaluation.criteria.prompt import STRATEGY_TYPE
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistance as EmbeddingDistanceEnum,
)
@@ -15,6 +16,7 @@ from langchain.evaluation.string_distance.base import (
StringDistance as StringDistanceEnum,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.output_parser import BaseOutputParser
from langchain.schema.prompt_template import BasePromptTemplate
@@ -124,6 +126,11 @@ class RunEvalConfig(BaseModel):
criteria: Optional[CRITERIA_TYPE] = None
llm: Optional[BaseLanguageModel] = None
evaluator_type: EvaluatorType = EvaluatorType.CRITERIA
strategy: STRATEGY_TYPE = "binary"
output_parser: Optional[BaseOutputParser] = None
class Config:
allow_extra = False
def __init__(
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any

View File

@@ -1099,11 +1099,11 @@ def _run_on_examples(
wrapped_model, examples, evaluation, data_type
)
examples = _validate_example_inputs(examples, wrapped_model, input_mapper)
evalution_handler = EvaluatorCallbackHandler(
evaluation_handler = EvaluatorCallbackHandler(
evaluators=run_evaluators or [],
client=client,
)
callbacks: List[BaseCallbackHandler] = [tracer, evalution_handler]
callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
for i, example in enumerate(examples):
result = _run_llm_or_chain(
example,
@@ -1117,7 +1117,7 @@ def _run_on_examples(
print(f"{i+1} processed", flush=True, end="\r")
results[str(example.id)] = result
tracer.wait_for_futures()
evalution_handler.wait_for_futures()
evaluation_handler.wait_for_futures()
return results