Mirror of https://github.com/hwchase17/langchain.git (synced 2026-02-05 00:30:18 +00:00)

Compare commits: cc/0.4/doc ... wfh/criter (5 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 96b6009652 | |
| | 766aff97c3 | |
| | 72ca44768c | |
| | 8f602ee7ef | |
| | 18a4e8fe56 | |
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from enum import Enum
 from typing import Any, Dict, List, Mapping, Optional, Union
 
@@ -8,7 +9,7 @@ from pydantic_v1 import Extra, Field
 from langchain.callbacks.manager import Callbacks
 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
-from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
+from langchain.evaluation.criteria.prompt import STRATEGY_TYPE, get_prompt_template
 from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
 from langchain.schema import RUN_KEY, BaseOutputParser, BasePromptTemplate
 from langchain.schema.language_model import BaseLanguageModel
@@ -59,7 +60,7 @@ _SUPPORTED_CRITERIA = {
 
 
 class CriteriaResultOutputParser(BaseOutputParser[dict]):
-    """A parser for the output of the CriteriaEvalChain."""
+    """A parser for the output of the binary strategy of the CriteriaEvalChain."""
 
     @property
     def _type(self) -> str:
@@ -74,17 +75,106 @@ class CriteriaResultOutputParser(BaseOutputParser[dict]):
         Returns:
             Dict: The parsed output.
         """
+        verdict_search = re.search(r"(?<=\s)[YN](?=[\s\W]*$)", text, re.IGNORECASE)
+
+        if verdict_search:
+            verdict = verdict_search.group().upper()
+            reasoning_index = verdict_search.start()
+            reasoning = text[:reasoning_index].strip()
+        else:
+            reasoning = text.strip()
+            verdict = None
+
+        score = 1 if verdict == "Y" else (0 if verdict == "N" else None)
+        return {
+            "reasoning": reasoning,
+            "value": verdict,
+            "score": score,
+        }
+
+
+class ScoringResultOutputParser(BaseOutputParser[dict]):
+    """A parser for the output of the scoring strategy of the CriteriaEvalChain."""
+
+    @property
+    def _type(self) -> str:
+        return "scoring_result"
+
+    def parse(self, text: str) -> Any:
+        """Parse the output text.
+
+        Args:
+            text (str): The output text to parse.
+
+        Returns:
+            Any: The parsed output.
+        """
         parsed = text.strip().rsplit("\n", maxsplit=1)
         if len(parsed) == 1:
             reasoning = ""
-            verdict = parsed[0]
+            score = parsed[0]
         else:
-            reasoning, verdict = parsed
-        score = 1 if verdict.upper() == "Y" else (0 if verdict.upper() == "N" else None)
+            reasoning, score = parsed
+        score_ = int(score) if score.isdigit() and 0 <= int(score) <= 9 else None
+        scaled_score = score_ / 10 if score_ is not None else score_
         return {
             "reasoning": reasoning.strip(),
-            "value": verdict,
-            "score": score,
+            "value": score,
+            "score": scaled_score,
         }
 
 
+class ConfidenceResultOutputParser(BaseOutputParser[dict]):
+    """A parser for output of the confidence strategy of the CriteriaEvalChain.
+
+    Used for Likert-scale like scores.
+    """
+
+    @property
+    def _type(self) -> str:
+        return "confidence_result"
+
+    def parse(self, text: str) -> Any:
+        """Parse the output text.
+
+        Args:
+            text (str): The output text to parse.
+
+        Returns:
+            Any: The parsed output.
+        """
+        confidence_scale = [
+            "extremely confident no",
+            "very confident no",
+            "slightly confident no",
+            "somewhat confident no",
+            "unsure",
+            "somewhat confident yes",
+            "slightly confident yes",
+            "very confident yes",
+            "extremely confident yes",
+        ]
+
+        confidence_pattern = (
+            r"\b(?:" + "|".join(map(re.escape, confidence_scale)) + r")\b"
+        )
+        confidence_search = re.findall(confidence_pattern, text, re.IGNORECASE)
+
+        confidence = confidence_search[-1] if confidence_search else None
+        reasoning_index = text.rfind(confidence) if confidence else len(text)
+        reasoning = text[:reasoning_index].strip()
+
+        to_match = confidence.lower() if confidence else None
+        score = (
+            confidence_scale.index(to_match) if to_match in confidence_scale else None
+        )
+        scaled_score = (
+            score / (len(confidence_scale) - 1) if score is not None else None
+        )
+        return {
+            "reasoning": reasoning,
+            "value": confidence,
+            "score": scaled_score,
+        }
+
+
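As a quick illustration of the three parsers added above, the following sketch shows the dicts each one produces for a typical completion. The import path is an assumption inferred from the imports elsewhere in this diff, not stated explicitly.

```python
# Hedged sketch: the import path below is inferred, not confirmed by the diff.
from langchain.evaluation.criteria.eval_chain import (
    ConfidenceResultOutputParser,
    CriteriaResultOutputParser,
    ScoringResultOutputParser,
)

# Binary strategy: a trailing "Y"/"N" becomes a 1/0 score.
CriteriaResultOutputParser().parse(
    "The submission is concise and directly answers the question. Y"
)
# {'reasoning': 'The submission is concise and directly answers the question.',
#  'value': 'Y', 'score': 1}

# Scoring strategy: a final 0-9 digit on its own line is scaled to [0, 1].
ScoringResultOutputParser().parse(
    "The submission meets most criteria but misses one edge case.\n7"
)
# {'reasoning': 'The submission meets most criteria but misses one edge case.',
#  'value': '7', 'score': 0.7}

# Confidence strategy: the last matching phrase is mapped onto a 9-point scale.
ConfidenceResultOutputParser().parse(
    "The criteria are clearly satisfied.\nVery confident yes"
)
# {'reasoning': 'The criteria are clearly satisfied.',
#  'value': 'Very confident yes', 'score': 0.875}
```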
@@ -243,10 +333,14 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
 
     @classmethod
     def _resolve_prompt(
-        cls, prompt: Optional[BasePromptTemplate] = None
+        cls,
+        prompt: Optional[BasePromptTemplate] = None,
+        strategy: STRATEGY_TYPE = "binary",
     ) -> BasePromptTemplate:
         expected_input_vars = {"input", "output", "criteria"}
-        prompt_ = prompt or PROMPT
+        prompt_ = prompt or get_prompt_template(
+            strategy=strategy, requires_references=False
+        )
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -254,6 +348,28 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             )
         return prompt_
 
+    @classmethod
+    def _resolve_parser(
+        cls,
+        output_parser: Optional[BaseOutputParser] = None,
+        strategy: STRATEGY_TYPE = "binary",
+    ) -> BaseOutputParser:
+        if output_parser is not None:
+            return output_parser
+        parser_map = {
+            "binary": CriteriaResultOutputParser(),
+            "score": ScoringResultOutputParser(),
+            "confidence": ConfidenceResultOutputParser(),
+        }
+        if strategy not in parser_map:
+            raise ValueError(
+                f"Evaluation strategy {strategy} not recognized."
+                "\nPlease select one of 'binary', 'score', or 'confidence'"
+                " or specify your own output_parser when loading"
+                " the criteria eval chain"
+            )
+        return parser_map[strategy]
+
     @classmethod
     def resolve_criteria(
         cls,
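A small sketch of the `_resolve_parser` behavior introduced above, calling the classmethod directly purely for illustration (import path again inferred from this diff):

```python
# Hedged sketch: import path is an assumption.
from langchain.evaluation.criteria.eval_chain import (
    CriteriaEvalChain,
    ScoringResultOutputParser,
)

# An explicitly supplied output_parser always wins over the strategy.
custom = ScoringResultOutputParser()
assert CriteriaEvalChain._resolve_parser(output_parser=custom) is custom

# Otherwise the strategy name selects one of the three built-in parsers.
parser = CriteriaEvalChain._resolve_parser(strategy="confidence")
assert parser._type == "confidence_result"

# Anything outside "binary", "score", "confidence" raises a ValueError.
try:
    CriteriaEvalChain._resolve_parser(strategy="likert")
except ValueError as exc:
    print(exc)
```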
@@ -289,6 +405,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
         criteria: Optional[CRITERIA_TYPE] = None,
         *,
         prompt: Optional[BasePromptTemplate] = None,
+        strategy: STRATEGY_TYPE = "binary",
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> CriteriaEvalChain:
         """Create a `CriteriaEvalChain` instance from an llm and criteria.
@@ -305,6 +423,10 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
         prompt : Optional[BasePromptTemplate], default=None
             The prompt template to use for generating prompts. If not provided,
             a default prompt template will be used.
+        strategy: str ("binary", "score", "confidence") - the scoring strategy.
+            Default to "binary" (Yes/No)
+        output_parser: Optional[BaseOutputParser] - the output parser to extract
+            the score and reasoning from the LLM output.
         **kwargs : Any
             Additional keyword arguments to pass to the `LLMChain`
             constructor.
@@ -330,7 +452,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
                 criteria=criteria,
             )
         """
-        prompt_ = cls._resolve_prompt(prompt)
+        prompt_ = cls._resolve_prompt(prompt, strategy=strategy)
+        parser = cls._resolve_parser(output_parser=output_parser, strategy=strategy)
         if criteria == Criteria.CORRECTNESS:
             raise ValueError(
                 "Correctness should not be used in the reference-free"
@@ -345,6 +468,7 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             llm=llm,
             prompt=prompt_,
             criterion_name="-".join(criteria_),
+            output_parser=parser,
             **kwargs,
         )
 
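Taken together, the `from_llm` changes let the strategy be chosen at construction time. A hedged end-to-end sketch follows; the model choice and the `evaluate_strings` call come from the existing string-evaluator API, not from this diff.

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain

chain = CriteriaEvalChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    criteria="conciseness",
    strategy="score",  # "binary" (default), "score", or "confidence"
)
result = chain.evaluate_strings(
    input="What is 2 + 2?",
    prediction="The answer you are looking for is four, that is, 4.",
)
# With strategy="score", ScoringResultOutputParser supplies "reasoning",
# the raw "value" (a 0-9 digit), and "score" scaled to [0, 1].
print(result)
```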
@@ -491,10 +615,14 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
 
     @classmethod
     def _resolve_prompt(
-        cls, prompt: Optional[BasePromptTemplate] = None
+        cls,
+        prompt: Optional[BasePromptTemplate] = None,
+        strategy: STRATEGY_TYPE = "binary",
     ) -> BasePromptTemplate:
         expected_input_vars = {"input", "output", "criteria", "reference"}
-        prompt_ = prompt or PROMPT_WITH_REFERENCES
+        prompt_ = prompt or get_prompt_template(
+            strategy=strategy, requires_references=True
+        )
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -509,6 +637,8 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
         criteria: Optional[CRITERIA_TYPE] = None,
         *,
         prompt: Optional[BasePromptTemplate] = None,
+        strategy: STRATEGY_TYPE = "binary",
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> CriteriaEvalChain:
         """Create a `LabeledCriteriaEvalChain` instance from an llm and criteria.
@@ -525,6 +655,8 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
         prompt : Optional[BasePromptTemplate], default=None
             The prompt template to use for generating prompts. If not provided,
             a default prompt will be used.
+        output_parser: Optional[BaseOutputParser] - the output parser to extract
+            the score and reasoning from the LLM output.
         **kwargs : Any
             Additional keyword arguments to pass to the `LLMChain`
             constructor.
@@ -550,13 +682,15 @@ class LabeledCriteriaEvalChain(CriteriaEvalChain):
                 criteria=criteria,
             )
         """
-        prompt = cls._resolve_prompt(prompt)
+        prompt_ = cls._resolve_prompt(prompt, strategy=strategy)
+        parser = cls._resolve_parser(output_parser=output_parser, strategy=strategy)
         criteria_ = cls.resolve_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
-        prompt_ = prompt.partial(criteria=criteria_str)
+        prompt_ = prompt_.partial(criteria=criteria_str)
         return cls(
             llm=llm,
             prompt=prompt_,
             criterion_name="-".join(criteria_),
+            output_parser=parser,
             **kwargs,
         )
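The labeled variant mirrors this. A hedged sketch of the reference-requiring chain with the confidence strategy (again, the surrounding calls follow the existing evaluator API rather than this diff):

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.criteria.eval_chain import LabeledCriteriaEvalChain

chain = LabeledCriteriaEvalChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    criteria="correctness",
    strategy="confidence",
)
result = chain.evaluate_strings(
    input="Who wrote 'Pride and Prejudice'?",
    prediction="It was written by Jane Austen.",
    reference="Jane Austen wrote 'Pride and Prejudice'.",
)
# "value" is one of the nine confidence phrases; "score" is its index on the
# scale divided by 8 (so "extremely confident yes" -> 1.0, "unsure" -> 0.5).
```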
@@ -1,9 +1,37 @@
 # flake8: noqa
 # Credit to https://github.com/openai/evals/tree/main
 
+from typing import Literal, Union
 from langchain.prompts import PromptTemplate
 
-template = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
+
+BINARY_STRATEGY = """Does the submission meet the criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter "Y" or "N" again by itself on a new line."""
+# Score on a scale from 1 to 10
+SCORING_STRATEGY = """How well does the submission meet the criteria? First, write out in a step by step manner your\
+reasoning about each criterion to ensure that your conclusion is accurate. Avoid simply stating the scores at the outset.\
+After evaluating each criterion, assign a score from 0 to 9 where 0 means the submission does not meet the\
+criteria at all or the criteria does not describe the submission at all and 9 means the submission fully meets\
+the criteria or the criteria perfectly describes the submission. Print the numeric score\
+(from 0 to 9, without quotes or punctuation) on its own line. At the end, repeat just the numeric score again by itself on a new line."""
+
+CONFIDENCE_STRATEGY = """How confident are you that the submission meets the criteria? Think carefully about each\
+criterion and your confidence that the submission complies with the criteria. "Extremely confident no" means you are\
+certain the submission does not meet the criteria, or the criteria in no way describes the submission.
+Reason by thinking step by step, then assign your confidence level using the following scale:
+
+1. [[Extremely confident no]]
+2. [[Very confident no]]
+3. [[Slightly confident no]]
+4. [[Somewhat confident no]]
+5. [[Unsure]]
+6. [[Somewhat confident yes]]
+7. [[Slightly confident yes]]
+8. [[Very confident yes]]
+9. [[Extremely confident yes]]
+
+Then print the corresponding confidence level in words on its own line. At the end, repeat the confidence level again, in words as it is shown above, by itself on a new line."""
+
+_TEMPLATE = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
 [BEGIN DATA]
 ***
 [Input]: {input}
@@ -13,13 +41,12 @@ template = """You are assessing a submitted answer on a given task or input base
 [Criteria]: {criteria}
 ***
 [END DATA]
-Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line."""
+"""
 
-PROMPT = PromptTemplate(
-    input_variables=["input", "output", "criteria"], template=template
-)
+PROMPT = PromptTemplate.from_template(_TEMPLATE + BINARY_STRATEGY)
 
-template = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
+
+_LABELED_TEMPLATE = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
 [BEGIN DATA]
 ***
 [Input]: {input}
@@ -31,8 +58,32 @@ template = """You are assessing a submitted answer on a given task or input base
 [Reference]: {reference}
 ***
 [END DATA]
-Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line."""
+"""
 
-PROMPT_WITH_REFERENCES = PromptTemplate(
-    input_variables=["input", "output", "criteria", "reference"], template=template
+PROMPT_WITH_REFERENCES = PromptTemplate.from_template(
+    _LABELED_TEMPLATE + BINARY_STRATEGY
 )
+
+STRATEGY_TYPE = Union[Literal["binary"], Literal["score"], Literal["confidence"]]
+
+
+def get_prompt_template(
+    requires_references: bool,
+    strategy: Union[
+        Literal["binary"], Literal["score"], Literal["confidence"]
+    ] = "binary",
+) -> PromptTemplate:
+    """Get the prompt template for the specified strategy and model type."""
+    strat_map = {
+        "binary": BINARY_STRATEGY,
+        "score": SCORING_STRATEGY,
+        "confidence": CONFIDENCE_STRATEGY,
+    }
+    if strategy not in strat_map:
+        raise ValueError(
+            f"Unrecognized evaluation strategy {strategy}"
+            f"\nMust be one of {list(strat_map.keys())}"
+        )
+    template = _LABELED_TEMPLATE if requires_references else _TEMPLATE
+    suffix = strat_map[strategy]
+    return PromptTemplate.from_template(template + suffix + "\nReasoning:")
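A brief sketch of the new `get_prompt_template` helper, based directly on the hunk above:

```python
from langchain.evaluation.criteria.prompt import get_prompt_template

prompt = get_prompt_template(requires_references=True, strategy="score")

# PromptTemplate.from_template infers the variables from the {placeholders},
# so the labeled template exposes all four inputs.
assert set(prompt.input_variables) == {"input", "output", "criteria", "reference"}

# Every generated template ends with the chosen strategy instructions plus a
# trailing "Reasoning:" cue for the model to continue from.
assert prompt.template.endswith("\nReasoning:")
```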
@@ -7,6 +7,7 @@ from pydantic_v1 import BaseModel, Field
 
 from langchain.embeddings.base import Embeddings
 from langchain.evaluation.criteria.eval_chain import CRITERIA_TYPE
+from langchain.evaluation.criteria.prompt import STRATEGY_TYPE
 from langchain.evaluation.embedding_distance.base import (
     EmbeddingDistance as EmbeddingDistanceEnum,
 )
@@ -15,6 +16,7 @@ from langchain.evaluation.string_distance.base import (
     StringDistance as StringDistanceEnum,
 )
 from langchain.schema.language_model import BaseLanguageModel
+from langchain.schema.output_parser import BaseOutputParser
 from langchain.schema.prompt_template import BasePromptTemplate
 
 
@@ -124,6 +126,11 @@ class RunEvalConfig(BaseModel):
         criteria: Optional[CRITERIA_TYPE] = None
         llm: Optional[BaseLanguageModel] = None
         evaluator_type: EvaluatorType = EvaluatorType.CRITERIA
+        strategy: STRATEGY_TYPE = "binary"
+        output_parser: Optional[BaseOutputParser] = None
+
+        class Config:
+            allow_extra = False
 
         def __init__(
             self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
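Assuming the fields above sit on the nested criteria config inside `RunEvalConfig` (the enclosing class body is not shown in this hunk), the strategy could then be selected when configuring a LangSmith test run, roughly as sketched here:

```python
from langchain.smith import RunEvalConfig

# Hedged sketch: placement of the new fields on RunEvalConfig.Criteria is an
# assumption; only the field declarations appear in the diff.
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.Criteria(criteria="helpfulness", strategy="score"),
    ]
)
```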
@@ -1099,11 +1099,11 @@ def _run_on_examples(
         wrapped_model, examples, evaluation, data_type
     )
     examples = _validate_example_inputs(examples, wrapped_model, input_mapper)
-    evalution_handler = EvaluatorCallbackHandler(
+    evaluation_handler = EvaluatorCallbackHandler(
         evaluators=run_evaluators or [],
         client=client,
     )
-    callbacks: List[BaseCallbackHandler] = [tracer, evalution_handler]
+    callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
     for i, example in enumerate(examples):
         result = _run_llm_or_chain(
             example,
@@ -1117,7 +1117,7 @@
         print(f"{i+1} processed", flush=True, end="\r")
         results[str(example.id)] = result
     tracer.wait_for_futures()
-    evalution_handler.wait_for_futures()
+    evaluation_handler.wait_for_futures()
     return results
 
 