Compare commits

...

5 Commits

Author               SHA1        Message                                        Date
William Fu-Hinthorn  96b6009652  Merge branch 'master' into wfh/criteria_strat  2023-08-14 20:54:22 -07:00
William Fu-Hinthorn  766aff97c3  update                                         2023-08-14 16:54:43 -07:00
William Fu-Hinthorn  72ca44768c  spelling                                       2023-07-28 18:27:40 -07:00
William Fu-Hinthorn  8f602ee7ef  Add other strategies for evaluation            2023-07-28 17:33:58 -07:00
William Fu-Hinthorn  18a4e8fe56  Criteria strategy                              2023-07-28 12:31:31 -07:00
4 changed files with 218 additions and 26 deletions

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import re
from enum import Enum
from typing import Any, Dict, List, Mapping, Optional, Union
@@ -8,7 +9,7 @@ from pydantic_v1 import Extra, Field
from langchain.callbacks.manager import Callbacks
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
from langchain.evaluation.criteria.prompt import STRATEGY_TYPE, get_prompt_template
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY, BaseOutputParser, BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
@@ -59,7 +60,7 @@ _SUPPORTED_CRITERIA = {
class CriteriaResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the CriteriaEvalChain."""
"""A parser for the output of the binary strategy of the CriteriaEvalChain."""
@property
def _type(self) -> str:
@@ -74,17 +75,106 @@ class CriteriaResultOutputParser(BaseOutputParser[dict]):
Returns:
Dict: The parsed output.
"""
verdict_search = re.search(r"(?<=\s)[YN](?=[\s\W]*$)", text, re.IGNORECASE)
if verdict_search:
verdict = verdict_search.group().upper()
reasoning_index = verdict_search.start()
reasoning = text[:reasoning_index].strip()
else:
reasoning = text.strip()
verdict = None
score = 1 if verdict == "Y" else (0 if verdict == "N" else None)
return {
"reasoning": reasoning,
"value": verdict,
"score": score,
}
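
A minimal sketch of how the rewritten binary parser behaves, assuming the CriteriaResultOutputParser class exactly as shown in this hunk; the completion strings are illustrative only:

parser = CriteriaResultOutputParser()
# The regex looks for a trailing standalone "Y"/"N"; everything before it is reasoning.
result = parser.parse("The submission is concise and answers the question.\nY")
assert result == {
    "reasoning": "The submission is concise and answers the question.",
    "value": "Y",
    "score": 1,
}
# If no trailing verdict is found, value and score come back as None.
assert parser.parse("No clear verdict here.")["score"] is None
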
class ScoringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the scoring strategy of the CriteriaEvalChain."""
@property
def _type(self) -> str:
return "scoring_result"
def parse(self, text: str) -> Any:
"""Parse the output text.
Args:
text (str): The output text to parse.
Returns:
Any: The parsed output.
"""
parsed = text.strip().rsplit("\n", maxsplit=1)
if len(parsed) == 1:
reasoning = ""
verdict = parsed[0]
score = parsed[0]
else:
reasoning, verdict = parsed
score = 1 if verdict.upper() == "Y" else (0 if verdict.upper() == "N" else None)
reasoning, score = parsed
score_ = int(score) if score.isdigit() and 0 <= int(score) <= 9 else None
scaled_score = score_ / 10 if score_ is not None else score_
return {
"reasoning": reasoning.strip(),
"value": verdict,
"score": score,
"value": score,
"score": scaled_score,
}
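
Because this hunk interleaves removed lines of the old verdict-based parser with the new class, here is a sketch of what the new ScoringResultOutputParser is expected to return for a typical completion (example strings assumed for illustration):

parser = ScoringResultOutputParser()
result = parser.parse("Each criterion is addressed, with minor omissions.\n7")
# The raw 0-9 rating is kept as the value; the score is scaled to the 0-1 range.
assert result == {
    "reasoning": "Each criterion is addressed, with minor omissions.",
    "value": "7",
    "score": 0.7,
}
# Anything that is not a single digit in [0, 9] yields score=None.
assert parser.parse("Inconclusive output")["score"] is None
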
class ConfidenceResultOutputParser(BaseOutputParser[dict]):
"""A parser for output of the confidence strategy of the CriteriaEvalChain.
Used for Likert-scale like scores.
"""
@property
def _type(self) -> str:
return "confidence_result"
def parse(self, text: str) -> Any:
"""Parse the output text.
Args:
text (str): The output text to parse.
Returns:
Any: The parsed output.
"""
confidence_scale = [
"extremely confident no",
"very confident no",
"slightly confident no",
"somewhat confident no",
"unsure",
"somewhat confident yes",
"slightly confident yes",
"very confident yes",
"extremely confident yes",
]
confidence_pattern = (
r"\b(?:" + "|".join(map(re.escape, confidence_scale)) + r")\b"
)
confidence_search = re.findall(confidence_pattern, text, re.IGNORECASE)
confidence = confidence_search[-1] if confidence_search else None
reasoning_index = text.rfind(confidence) if confidence else len(text)
reasoning = text[:reasoning_index].strip()
to_match = confidence.lower() if confidence else None
score = (
confidence_scale.index(to_match) if to_match in confidence_scale else None
)
scaled_score = (
score / (len(confidence_scale) - 1) if score is not None else None
)
return {
"reasoning": reasoning,
"value": confidence,
"score": scaled_score,
}
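
A sketch of the confidence parser on an illustrative completion; the 0-1 score is the index on the nine-point scale above divided by 8:

parser = ConfidenceResultOutputParser()
result = parser.parse(
    "The reference and the submission agree on every point.\nVery confident yes"
)
assert result["value"] == "Very confident yes"
assert result["score"] == 7 / 8  # index 7 on the 9-point scale, scaled to 0-1
assert result["reasoning"] == "The reference and the submission agree on every point."
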
@@ -243,10 +333,14 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
cls,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria"}
prompt_ = prompt or PROMPT
prompt_ = prompt or get_prompt_template(
strategy=strategy, requires_references=False
)
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@@ -254,6 +348,28 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
return prompt_
@classmethod
def _resolve_parser(
cls,
output_parser: Optional[BaseOutputParser] = None,
strategy: STRATEGY_TYPE = "binary",
) -> BaseOutputParser:
if output_parser is not None:
return output_parser
parser_map = {
"binary": CriteriaResultOutputParser(),
"score": ScoringResultOutputParser(),
"confidence": ConfidenceResultOutputParser(),
}
if strategy not in parser_map:
raise ValueError(
f"Evaluation strategy {strategy} not recognized."
"\nPlease select one of 'binary', 'score', or 'confidence'"
" or specify your own output_parser when loading"
" the criteria eval chain"
)
return parser_map[strategy]
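
A small sketch of the resolution rule implemented by _resolve_parser above: an explicit output_parser wins, otherwise the strategy selects a default (class and method names as in this diff):

assert isinstance(
    CriteriaEvalChain._resolve_parser(strategy="score"), ScoringResultOutputParser
)
custom = CriteriaResultOutputParser()
assert CriteriaEvalChain._resolve_parser(output_parser=custom, strategy="confidence") is custom
# An unrecognized strategy raises a ValueError listing the supported options.
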
@classmethod
def resolve_criteria(
cls,
@@ -289,6 +405,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
output_parser: Optional[BaseOutputParser] = None,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `CriteriaEvalChain` instance from an llm and criteria.
@@ -305,6 +423,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt template will be used.
strategy: str ("binary", "score", "confidence") - the scoring strategy.
Default to "binary" (Yes/No)
output_parser: Optional[BaseOutputParser] - the output parser to extract
the score and reasoning from the LLM output.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
@@ -330,7 +452,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
criteria=criteria,
)
"""
prompt_ = cls._resolve_prompt(prompt)
prompt_ = cls._resolve_prompt(prompt, strategy=strategy)
parser = cls._resolve_parser(output_parser=output_parser, strategy=strategy)
if criteria == Criteria.CORRECTNESS:
raise ValueError(
"Correctness should not be used in the reference-free"
@@ -345,6 +468,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
llm=llm,
prompt=prompt_,
criterion_name="-".join(criteria_),
output_parser=parser,
**kwargs,
)
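
A hedged usage sketch of the new strategy argument to from_llm; ChatOpenAI and the "conciseness" criterion are assumed purely for illustration, and the import paths reflect the langchain layout at the time of this change:

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain

llm = ChatOpenAI(temperature=0)
evaluator = CriteriaEvalChain.from_llm(
    llm=llm,
    criteria="conciseness",
    strategy="score",  # "binary" (default), "score", or "confidence"
)
result = evaluator.evaluate_strings(
    prediction="Paris is the capital of France.",
    input="What is the capital of France?",
)
# With strategy="score", result["score"] is the model's 0-9 rating scaled to 0-1.
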
@@ -491,10 +615,14 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
@classmethod
def _resolve_prompt(
cls, prompt: Optional[BasePromptTemplate] = None
cls,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
) -> BasePromptTemplate:
expected_input_vars = {"input", "output", "criteria", "reference"}
prompt_ = prompt or PROMPT_WITH_REFERENCES
prompt_ = prompt or get_prompt_template(
strategy=strategy, requires_references=True
)
if expected_input_vars != set(prompt_.input_variables):
raise ValueError(
f"Input variables should be {expected_input_vars}, "
@@ -509,6 +637,8 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
criteria: Optional[CRITERIA_TYPE] = None,
*,
prompt: Optional[BasePromptTemplate] = None,
strategy: STRATEGY_TYPE = "binary",
output_parser: Optional[BaseOutputParser] = None,
**kwargs: Any,
) -> CriteriaEvalChain:
"""Create a `LabeledCriteriaEvalChain` instance from an llm and criteria.
@@ -525,6 +655,8 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
prompt : Optional[BasePromptTemplate], default=None
The prompt template to use for generating prompts. If not provided,
a default prompt will be used.
strategy : str, default="binary"
The scoring strategy to use: "binary" (Yes/No), "score" (0-9), or "confidence".
output_parser : Optional[BaseOutputParser], default=None
The output parser used to extract the score and reasoning from the LLM output.
If not provided, a parser matching the strategy is chosen.
**kwargs : Any
Additional keyword arguments to pass to the `LLMChain`
constructor.
@@ -550,13 +682,15 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
criteria=criteria,
)
"""
prompt = cls._resolve_prompt(prompt)
prompt_ = cls._resolve_prompt(prompt, strategy=strategy)
parser = cls._resolve_parser(output_parser=output_parser, strategy=strategy)
criteria_ = cls.resolve_criteria(criteria)
criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
prompt_ = prompt.partial(criteria=criteria_str)
prompt_ = prompt_.partial(criteria=criteria_str)
return cls(
llm=llm,
prompt=prompt_,
criterion_name="-".join(criteria_),
output_parser=parser,
**kwargs,
)
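
And the labeled (reference-based) variant with the confidence strategy; again an illustrative sketch rather than a canonical recipe, reusing the llm from the previous sketch:

from langchain.evaluation.criteria.eval_chain import LabeledCriteriaEvalChain

labeled_evaluator = LabeledCriteriaEvalChain.from_llm(
    llm=llm,
    criteria="correctness",
    strategy="confidence",
)
result = labeled_evaluator.evaluate_strings(
    prediction="The answer is 42.",
    input="What is 6 * 7?",
    reference="42",
)
# result["value"] holds the confidence phrase; result["score"] is its 0-1 scaling.
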

View File

@@ -1,9 +1,37 @@
# flake8: noqa
# Credit to https://github.com/openai/evals/tree/main
from typing import Literal, Union
from langchain.prompts import PromptTemplate
template = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
BINARY_STRATEGY = """Does the submission meet the criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter "Y" or "N" again by itself on a new line."""
# Score on a scale from 1 to 10
SCORING_STRATEGY = """How well does the submission meet the criteria? First, write out in a step by step manner your\
reasoning about each criterion to ensure that your conclusion is accurate. Avoid simply stating the scores at the outset.\
After evaluating each criterion, assign a score from 0 to 9 where 0 means the submission does not meet the\
criteria at all or the criteria does not describe the submission at all and 9 means the submission fully meets\
the criteria or the criteria perfectly describes the submission. Print the numeric score\
(from 0 to 9, without quotes or punctuation) on its own line. At the end, repeat just the numeric score again by itself on a new line."""
CONFIDENCE_STRATEGY = """How confident are you that the submission meets the criteria? Think carefully about each\
criterion and your confidence that the submission complies with the criteria. "Extremely confident no" means you are\
certain the submission does not meet the criteria, or the criteria in no way describes the submission.
Reason by thinking step by step, then assign your confidence level using the following scale:
1. [[Extremely confident no]]
2. [[Very confident no]]
3. [[Slightly confident no]]
4. [[Somewhat confident no]]
5. [[Unsure]]
6. [[Somewhat confident yes]]
7. [[Slightly confident yes]]
8. [[Very confident yes]]
9. [[Extremely confident yes]]
Then print the corresponding confidence level in words on its own line. At the end, repeat the confidence level again, in words as it is shown above, by itself on a new line."""
_TEMPLATE = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
@@ -13,13 +41,12 @@ template = """You are assessing a submitted answer on a given task or input base
[Criteria]: {criteria}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line."""
"""
PROMPT = PromptTemplate(
input_variables=["input", "output", "criteria"], template=template
)
PROMPT = PromptTemplate.from_template(_TEMPLATE + BINARY_STRATEGY)
template = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
_LABELED_TEMPLATE = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
[BEGIN DATA]
***
[Input]: {input}
@@ -31,8 +58,32 @@ template = """You are assessing a submitted answer on a given task or input base
[Reference]: {reference}
***
[END DATA]
Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line."""
"""
PROMPT_WITH_REFERENCES = PromptTemplate(
input_variables=["input", "output", "criteria", "reference"], template=template
PROMPT_WITH_REFERENCES = PromptTemplate.from_template(
_LABELED_TEMPLATE + BINARY_STRATEGY
)
STRATEGY_TYPE = Union[Literal["binary"], Literal["score"], Literal["confidence"]]
def get_prompt_template(
requires_references: bool,
strategy: Union[
Literal["binary"], Literal["score"], Literal["confidence"]
] = "binary",
) -> PromptTemplate:
"""Get the prompt template for the specified strategy and model type."""
strat_map = {
"binary": BINARY_STRATEGY,
"score": SCORING_STRATEGY,
"confidence": CONFIDENCE_STRATEGY,
}
if strategy not in strat_map:
raise ValueError(
f"Unrecognized evaluation strategy {strategy}"
f"\nMust be one of {list(strat_map.keys())}"
)
template = _LABELED_TEMPLATE if requires_references else _TEMPLATE
suffix = strat_map[strategy]
return PromptTemplate.from_template(template + suffix + "\nReasoning:")
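
A quick sketch of the new prompt factory; the variable sets follow from the _TEMPLATE and _LABELED_TEMPLATE strings above:

prompt = get_prompt_template(requires_references=False, strategy="score")
assert set(prompt.input_variables) == {"input", "output", "criteria"}

labeled_prompt = get_prompt_template(requires_references=True, strategy="confidence")
assert set(labeled_prompt.input_variables) == {"input", "output", "criteria", "reference"}
# An unrecognized strategy raises a ValueError listing the supported names.
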

View File

@@ -7,6 +7,7 @@ from pydantic_v1 import BaseModel, Field
from langchain.embeddings.base import Embeddings
from langchain.evaluation.criteria.eval_chain import CRITERIA_TYPE
from langchain.evaluation.criteria.prompt import STRATEGY_TYPE
from langchain.evaluation.embedding_distance.base import (
EmbeddingDistance as EmbeddingDistanceEnum,
)
@@ -15,6 +16,7 @@ from langchain.evaluation.string_distance.base import (
StringDistance as StringDistanceEnum,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.output_parser import BaseOutputParser
from langchain.schema.prompt_template import BasePromptTemplate
@@ -124,6 +126,11 @@ class RunEvalConfig(BaseModel):
criteria: Optional[CRITERIA_TYPE] = None
llm: Optional[BaseLanguageModel] = None
evaluator_type: EvaluatorType = EvaluatorType.CRITERIA
strategy: STRATEGY_TYPE = "binary"
output_parser: Optional[BaseOutputParser] = None
class Config:
allow_extra = False
def __init__(
self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any

View File

@@ -1099,11 +1099,11 @@ def _run_on_examples(
wrapped_model, examples, evaluation, data_type
)
examples = _validate_example_inputs(examples, wrapped_model, input_mapper)
evalution_handler = EvaluatorCallbackHandler(
evaluation_handler = EvaluatorCallbackHandler(
evaluators=run_evaluators or [],
client=client,
)
callbacks: List[BaseCallbackHandler] = [tracer, evalution_handler]
callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
for i, example in enumerate(examples):
result = _run_llm_or_chain(
example,
@@ -1117,7 +1117,7 @@ def _run_on_examples(
print(f"{i+1} processed", flush=True, end="\r")
results[str(example.id)] = result
tracer.wait_for_futures()
evalution_handler.wait_for_futures()
evaluation_handler.wait_for_futures()
return results