Compare commits

...

2 Commits

Author               SHA1        Message     Date
William Fu-Hinthorn  40cbac7b29  tmp         2023-07-14 06:49:42 -07:00
William Fu-Hinthorn  c9d44f0ba3  automapper  2023-07-13 23:44:36 -07:00
3 changed files with 129 additions and 83 deletions

View File

@@ -4,18 +4,23 @@ from typing import Any, Dict, List, Optional, Union
from langsmith import RunEvaluator
from pydantic import BaseModel, Field
from langchain.chat_models.openai import ChatOpenAI
from langchain.embeddings.base import Embeddings
from langchain.evaluation.criteria.eval_chain import CRITERIA_TYPE
from langchain.evaluation.embedding_distance.base import (
    EmbeddingDistance as EmbeddingDistanceEnum,
)
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.evaluation.string_distance.base import (
    StringDistance as StringDistanceEnum,
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.prompt_template import BasePromptTemplate
from langsmith.schemas import DataType, RunTypeEnum
from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain
class EvalConfig(BaseModel):
@@ -226,3 +231,90 @@ class RunEvalConfig(BaseModel):
        prompt: Optional[BasePromptTemplate] = None

    # TODO: Trajectory

    def get_run_evaluators(
        self,
        run_type: Optional[RunTypeEnum] = None,
        data_type: Optional[DataType] = None,
        example_outputs: Optional[List[str]] = None,
        reference_key: Optional[str] = None,
        input_key: Optional[str] = None,
        prediction_key: Optional[str] = None,
    ) -> List[RunEvaluator]:
        eval_llm = self.eval_llm or ChatOpenAI(model="gpt-4", temperature=0.0)
        run_evaluators: List[RunEvaluator] = []
        for eval_config in self.evaluators:
            run_evaluator = self._construct_run_evaluator(
                eval_config,
                eval_llm,
                run_type,
                data_type,
                example_outputs,
                reference_key,
                input_key,
                prediction_key,
            )
            run_evaluators.append(run_evaluator)
        custom_evaluators = self.custom_evaluators or []
        for custom_evaluator in custom_evaluators:
            if isinstance(custom_evaluator, RunEvaluator):
                run_evaluators.append(custom_evaluator)
            elif isinstance(custom_evaluator, StringEvaluator):
                run_evaluators.append(
                    StringRunEvaluatorChain.from_string_evaluator(
                        custom_evaluator,
                        run_type=run_type,
                        data_type=data_type,
                        input_key=input_key,
                        prediction_key=prediction_key,
                        reference_key=reference_key,
                    )
                )
            else:
                raise ValueError(
                    f"Unsupported custom evaluator: {custom_evaluator}."
                    f" Expected RunEvaluator or StringEvaluator."
                )
        return run_evaluators

    def _construct_run_evaluator(
        self,
        eval_config: Union[EvaluatorType, EvalConfig],
        eval_llm: BaseLanguageModel,
        run_type: RunTypeEnum,
        data_type: DataType,
        example_outputs: Optional[List[str]],
        reference_key: Optional[str],
        input_key: Optional[str],
        prediction_key: Optional[str],
    ) -> RunEvaluator:
        if isinstance(eval_config, EvaluatorType):
            evaluator_ = load_evaluator(eval_config, llm=eval_llm)
            eval_type_tag = eval_config.value
        else:
            evaluator_ = load_evaluator(
                eval_config.evaluator_type, llm=eval_llm, **eval_config.get_kwargs()
            )
            eval_type_tag = eval_config.evaluator_type.value
        if isinstance(evaluator_, StringEvaluator):
            if evaluator_.requires_reference and reference_key is None:
                raise ValueError(
                    f"Must specify reference_key in RunEvalConfig to use"
                    f" evaluator of type {eval_type_tag} with"
                    f" dataset with multiple output keys: {example_outputs}."
                )
            run_evaluator = StringRunEvaluatorChain.from_string_evaluator(
                evaluator_,
                run_type=run_type,
                data_type=data_type,
                input_key=input_key,
                prediction_key=prediction_key,
                reference_key=reference_key,
                tags=[eval_type_tag],
            )
        else:
            raise NotImplementedError(
                f"Run evaluator for {eval_type_tag} is not implemented"
            )
        return run_evaluator
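
A minimal usage sketch of the new RunEvalConfig.get_run_evaluators method above (not part of the diff). The public import path langchain.smith.RunEvalConfig and the dataset key names ("question"/"answer") are assumptions for illustration; the default judge model falls back to ChatOpenAI gpt-4, so running this needs an OpenAI API key.

from langchain.evaluation.schema import EvaluatorType
from langchain.smith import RunEvalConfig
from langsmith.schemas import DataType, RunTypeEnum

# Hypothetical config: one built-in "qa" evaluator, no custom evaluators.
config = RunEvalConfig(evaluators=[EvaluatorType.QA])

# Build RunEvaluator instances straight from the config. The key names below
# are placeholders for a dataset with "question" inputs and "answer" outputs.
evaluators = config.get_run_evaluators(
    run_type=RunTypeEnum.chain,
    data_type=DataType.kv,
    example_outputs=["answer"],
    reference_key="answer",
    input_key="question",
    prediction_key="output",
)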

View File

@@ -316,7 +316,7 @@ def _setup_evaluation(
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    examples: Iterator[Example],
    evaluation: Optional[RunEvalConfig],
    data_type: DataType,
    data_type: Optional[DataType],
) -> Tuple[Optional[List[RunEvaluator]], Iterator[Example]]:
    """Configure the evaluators to run on the results of the chain."""
    if evaluation:
@@ -411,52 +411,12 @@ def _determine_reference_key(
    return reference_key


def _construct_run_evaluator(
    eval_config: Union[EvaluatorType, EvalConfig],
    eval_llm: BaseLanguageModel,
    run_type: RunTypeEnum,
    data_type: DataType,
    example_outputs: Optional[List[str]],
    reference_key: Optional[str],
    input_key: Optional[str],
    prediction_key: Optional[str],
) -> RunEvaluator:
    if isinstance(eval_config, EvaluatorType):
        evaluator_ = load_evaluator(eval_config, llm=eval_llm)
        eval_type_tag = eval_config.value
    else:
        evaluator_ = load_evaluator(
            eval_config.evaluator_type, llm=eval_llm, **eval_config.get_kwargs()
        )
        eval_type_tag = eval_config.evaluator_type.value
    if isinstance(evaluator_, StringEvaluator):
        if evaluator_.requires_reference and reference_key is None:
            raise ValueError(
                f"Must specify reference_key in RunEvalConfig to use"
                f" evaluator of type {eval_type_tag} with"
                f" dataset with multiple output keys: {example_outputs}."
            )
        run_evaluator = StringRunEvaluatorChain.from_run_and_data_type(
            evaluator_,
            run_type,
            data_type,
            input_key=input_key,
            prediction_key=prediction_key,
            reference_key=reference_key,
            tags=[eval_type_tag],
        )
    else:
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented"
        )
    return run_evaluator


def _load_run_evaluators(
    config: RunEvalConfig,
    run_type: RunTypeEnum,
    data_type: DataType,
    data_type: Optional[DataType],
    example_outputs: Optional[List[str]],
    run_inputs: Optional[List[str]],
    run_outputs: Optional[List[str]],
@@ -470,45 +430,11 @@ def _load_run_evaluators(
    Returns:
        A list of run evaluators.
    """
    eval_llm = config.eval_llm or ChatOpenAI(model="gpt-4", temperature=0.0)
    run_evaluators = []
    input_key = _determine_input_key(config, run_inputs, run_type)
    prediction_key = _determine_prediction_key(config, run_outputs, run_type)
    reference_key = _determine_reference_key(config, example_outputs)
    for eval_config in config.evaluators:
        run_evaluator = _construct_run_evaluator(
            eval_config,
            eval_llm,
            run_type,
            data_type,
            example_outputs,
            reference_key,
            input_key,
            prediction_key,
        )
        run_evaluators.append(run_evaluator)
    custom_evaluators = config.custom_evaluators or []
    for custom_evaluator in custom_evaluators:
        if isinstance(custom_evaluator, RunEvaluator):
            run_evaluators.append(custom_evaluator)
        elif isinstance(custom_evaluator, StringEvaluator):
            run_evaluators.append(
                StringRunEvaluatorChain.from_run_and_data_type(
                    custom_evaluator,
                    run_type,
                    data_type,
                    input_key=input_key,
                    prediction_key=prediction_key,
                    reference_key=reference_key,
                )
            )
        else:
            raise ValueError(
                f"Unsupported custom evaluator: {custom_evaluator}."
                f" Expected RunEvaluator or StringEvaluator."
            )
    return run_evaluators


### Async Helpers
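
The next file renames StringRunEvaluatorChain.from_run_and_data_type to from_string_evaluator, makes run_type and data_type optional keyword arguments, and adds an AutoStringRunMapper that dispatches on each run's type. A short usage sketch, not part of the diff; the "conciseness" criterion and the explicit gpt-4 judge are illustrative choices:

from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import EvaluatorType
from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain

# Load an off-the-shelf string evaluator; criteria and judge model are illustrative.
conciseness = load_evaluator(
    EvaluatorType.CRITERIA, criteria="conciseness", llm=ChatOpenAI(model="gpt-4")
)

# Leaving run_type unset should route mapping through AutoStringRunMapper,
# which picks the llm/chain/tool mapper per run at evaluation time.
run_evaluator = StringRunEvaluatorChain.from_string_evaluator(conciseness)

The result implements RunEvaluator, so it can also be supplied via RunEvalConfig(custom_evaluators=[...]) and built through get_run_evaluators as in the first file.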

View File

@@ -223,6 +223,31 @@ class StringExampleMapper(Serializable):
        return self.map(example)


class AutoStringRunMapper(StringRunMapper):
    """Automatically select the appropriate StringRunMapper based on the run type."""

    # Declared as a model field so the assignment in __init__ is accepted by pydantic.
    mappers: Dict[str, StringRunMapper] = {}

    def __init__(
        self,
        input_key: Optional[str] = None,
        prediction_key: Optional[str] = None,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.mappers = {
            "llm": LLMStringRunMapper(),
            "chain": ChainStringRunMapper(
                input_key=input_key, prediction_key=prediction_key
            ),
            "tool": ToolStringRunMapper(),
        }

    def map(self, run: Run) -> Dict[str, str]:
        mapper = self.mappers.get(run.run_type)
        if mapper is None:
            raise ValueError(f"Unsupported run type: {run.run_type}")
        return mapper(run)


class StringRunEvaluatorChain(Chain, RunEvaluator):
    """Evaluate Run and optional examples."""
@@ -325,11 +350,12 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
        return self._prepare_evaluator_output(result)

    @classmethod
    def from_run_and_data_type(
    def from_string_evaluator(
        cls,
        evaluator: StringEvaluator,
        run_type: RunTypeEnum,
        data_type: DataType,
        *,
        run_type: Optional[RunTypeEnum] = None,
        data_type: DataType = DataType.kv,
        input_key: Optional[str] = None,
        prediction_key: Optional[str] = None,
        reference_key: Optional[str] = None,
@@ -344,7 +370,7 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
        Args:
            evaluator (StringEvaluator): The string evaluator to use.
            run_type (RunTypeEnum): The type of run being evaluated.
            run_type (RunTypeEnum, optional): The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type (DataType): The type of dataset used in the run.
            input_key (str, optional): The key used to map the input from the run.
@@ -362,8 +388,10 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
        """  # noqa: E501
        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == RunTypeEnum.llm:
            run_mapper: StringRunMapper = LLMStringRunMapper()
        if run_type is None:
            run_mapper: StringRunMapper = AutoStringRunMapper()
        elif run_type == RunTypeEnum.llm:
            run_mapper = LLMStringRunMapper()
        elif run_type == RunTypeEnum.chain:
            run_mapper = ChainStringRunMapper(
                input_key=input_key, prediction_key=prediction_key