Mirror of https://github.com/hwchase17/langchain.git
synced 2025-08-14 07:07:34 +00:00
Add Schema Evals (#9228)
Simple eval checks for whether a generation is valid JSON and whether it matches an expected dict.
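For orientation, a minimal sketch of the two checks this commit adds, using only the classes introduced in the diff below:

import json  # stdlib only; evaluators come from the new module below

from langchain.evaluation.parsing.base import (
    JsonEqualityEvaluator,
    JsonValidityEvaluator,
)

# Validity: is the generation parseable JSON at all?
validity = JsonValidityEvaluator()
print(validity.evaluate_strings(prediction='{"a": 1}'))  # {'score': 1}

# Equality: does the parsed generation match a reference dict?
equality = JsonEqualityEvaluator()
print(equality.evaluate_strings(prediction='{"a": 1}', reference='{"a": 1}'))
# {'score': True}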
This commit is contained in:
parent 74a64cfbab
commit 2519580994
@@ -14,8 +14,12 @@ from langchain.evaluation.embedding_distance.base import (
     EmbeddingDistanceEvalChain,
     PairwiseEmbeddingDistanceEvalChain,
 )
+from langchain.evaluation.parsing.base import (
+    JsonEqualityEvaluator,
+    JsonValidityEvaluator,
+)
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
-from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
+from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
 from langchain.evaluation.string_distance.base import (
     PairwiseStringDistanceEvalChain,
     StringDistanceEvalChain,
@@ -57,7 +61,9 @@ def load_dataset(uri: str) -> List[Dict]:
     return [d for d in dataset["train"]]


-_EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
+_EVALUATOR_MAP: Dict[
+    EvaluatorType, Union[Type[LLMEvalChain], Type[Chain], Type[StringEvaluator]]
+] = {
     EvaluatorType.QA: QAEvalChain,
     EvaluatorType.COT_QA: CotQAEvalChain,
     EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
@@ -70,6 +76,8 @@ _EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
     EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
     EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,
     EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
+    EvaluatorType.JSON_VALIDITY: JsonValidityEvaluator,
+    EvaluatorType.JSON_EQUALITY: JsonEqualityEvaluator,
 }


@@ -78,7 +86,7 @@ def load_evaluator(
     *,
     llm: Optional[BaseLanguageModel] = None,
     **kwargs: Any,
-) -> Chain:
+) -> Union[Chain, StringEvaluator]:
     """Load the requested evaluation chain specified by a string.

     Parameters
@@ -119,7 +127,7 @@ def load_evaluators(
     llm: Optional[BaseLanguageModel] = None,
     config: Optional[dict] = None,
     **kwargs: Any,
-) -> List[Chain]:
+) -> List[Union[Chain, StringEvaluator]]:
     """Load evaluators specified by a list of evaluator types.

     Parameters
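With the JSON evaluators registered in _EVALUATOR_MAP, load_evaluator can now hand back a plain StringEvaluator rather than a Chain, which is why both return annotations widen to a Union. A minimal sketch of the call site, assuming the module paths in this diff:

from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import EvaluatorType

# The JSON evaluators need no LLM, so llm can stay None.
evaluator = load_evaluator(EvaluatorType.JSON_VALIDITY)
print(evaluator.evaluate_strings(prediction='{"a": 1}'))  # {'score': 1}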
153  libs/langchain/langchain/evaluation/parsing/base.py  Normal file
@@ -0,0 +1,153 @@
"""Evaluators for parsing strings."""
from operator import eq
from typing import Any, Callable, Optional, Union, cast

from langchain.evaluation.schema import StringEvaluator
from langchain.output_parsers.json import parse_json_markdown


class JsonValidityEvaluator(StringEvaluator):
    """Evaluates whether the prediction is valid JSON.

    This evaluator checks if the prediction is a valid JSON string. It does not
    require any input or reference.

    Attributes:
        requires_input (bool): Whether this evaluator requires an input
            string. Always False.
        requires_reference (bool): Whether this evaluator requires a
            reference string. Always False.
        evaluation_name (str): The name of the evaluation metric.
            Always "json_validity".

    Examples:
        >>> evaluator = JsonValidityEvaluator()
        >>> prediction = '{"name": "John", "age": 30, "city": "New York"}'
        >>> evaluator.evaluate_strings(prediction=prediction)
        {'score': 1}

        >>> prediction = '{"name": "John", "age": 30, "city": "New York",}'
        >>> evaluator.evaluate_strings(prediction=prediction)
        {'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes'}
    """

    def __init__(self, **kwargs: Any) -> None:
        super().__init__()

    @property
    def requires_input(self) -> bool:
        return False

    @property
    def requires_reference(self) -> bool:
        return False

    @property
    def evaluation_name(self) -> str:
        return "json_validity"

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any
    ) -> dict:
        """Evaluate the prediction string.

        Args:
            prediction (str): The prediction string to evaluate.
            input (str, optional): Not used in this evaluator. Defaults to None.
            reference (str, optional): Not used in this evaluator. Defaults to None.

        Returns:
            dict: A dictionary containing the evaluation score. The score is 1 if
                the prediction is valid JSON, and 0 otherwise.
                If the prediction is not valid JSON, the dictionary also contains
                a "reasoning" field with the error message.

        """
        try:
            parse_json_markdown(prediction)
            return {"score": 1}
        except Exception as e:
            return {"score": 0, "reasoning": str(e)}


class JsonEqualityEvaluator(StringEvaluator):
    """Evaluates whether the prediction is equal to the reference after
    parsing both as JSON.

    This evaluator checks if the prediction, after parsing as JSON, is equal
    to the reference, which is also parsed as JSON. It does not require an
    input string.

    Attributes:
        requires_input (bool): Whether this evaluator requires an
            input string. Always False.
        requires_reference (bool): Whether this evaluator requires
            a reference string. Always True.
        evaluation_name (str): The name of the evaluation metric.
            Always "json_equality".

    Examples:
        >>> evaluator = JsonEqualityEvaluator()
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 1}')
        {'score': True}
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 2}')
        {'score': False}

        >>> evaluator = JsonEqualityEvaluator(operator=lambda x, y: x['a'] == y['a'])
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 1}')
        {'score': True}
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 2}')
        {'score': False}

    """

    def __init__(self, operator: Optional[Callable] = None, **kwargs: Any) -> None:
        super().__init__()
        self.operator = operator or eq

    @property
    def requires_input(self) -> bool:
        return False

    @property
    def requires_reference(self) -> bool:
        return True

    @property
    def evaluation_name(self) -> str:
        return "json_equality"

    def _parse_json(
        self, string: str
    ) -> Union[dict, list, None, float, bool, int, str]:
        return parse_json_markdown(string)

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any
    ) -> dict:
        """Evaluate the prediction string.

        Args:
            prediction (str): The prediction string to evaluate.
            input (str, optional): Not used in this evaluator.
            reference (str): The reference string to compare against.

        Returns:
            dict: A dictionary containing the evaluation score.
        """
        parsed = self._parse_json(prediction)
        label = self._parse_json(cast(str, reference))
        if isinstance(label, list):
            if not isinstance(parsed, list):
                return {"score": 0}
            parsed = sorted(parsed, key=lambda x: str(x))
            label = sorted(label, key=lambda x: str(x))
        return {"score": self.operator(parsed, label)}
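One subtlety in _evaluate_strings above: when the reference parses to a list, both sides are sorted by their string representation before comparison, so element order does not affect the score. A quick illustration of that trick on plain parsed JSON:

import json

a = json.loads('[{"a": 1, "b": 2}, {"a": 2, "b": 3}]')
b = json.loads('[{"a": 2, "b": 3}, {"a": 1, "b": 2}]')

# Same elements, different order: plain list equality is order-sensitive,
# but sorting by str() normalizes both sides first.
assert a != b
assert sorted(a, key=str) == sorted(b, key=str)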
@@ -1,9 +1,11 @@
 """Interfaces to be implemented by general evaluators."""
 from __future__ import annotations

+import asyncio
 import logging
 from abc import ABC, abstractmethod
 from enum import Enum
+from functools import partial
 from typing import Any, Optional, Sequence, Tuple
 from warnings import warn

@@ -48,6 +50,10 @@ class EvaluatorType(str, Enum):
     """Compare a prediction to a reference label using embedding distance."""
     PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
     """Compare two predictions using embedding distance."""
+    JSON_VALIDITY = "json_validity"
+    """Check if a prediction is valid JSON."""
+    JSON_EQUALITY = "json_equality"
+    """Check if a prediction is equal to a reference JSON."""


 class LLMEvalChain(Chain):
@@ -115,7 +121,7 @@ class StringEvaluator(_EvalArgsMixin, ABC):
     @property
     def evaluation_name(self) -> str:
         """The name of the evaluation."""
-        raise NotImplementedError()
+        return self.__class__.__name__

     @property
     def requires_reference(self) -> bool:
@@ -168,9 +174,15 @@ class StringEvaluator(_EvalArgsMixin, ABC):
             - value: the string value of the evaluation, if applicable.
             - reasoning: the reasoning for the evaluation, if applicable.
         """  # noqa: E501
-        raise NotImplementedError(
-            f"{self.__class__.__name__} hasn't implemented an async "
-            "aevaluate_strings method."
+        return await asyncio.get_running_loop().run_in_executor(
+            None,
+            partial(
+                self._evaluate_strings,
+                prediction=prediction,
+                reference=reference,
+                input=input,
+                **kwargs,
+            ),
         )

     def evaluate_strings(
@@ -265,9 +277,16 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
         Returns:
             dict: A dictionary containing the preference, scores, and/or other information.
         """  # noqa: E501
-        raise NotImplementedError(
-            f"{self.__class__.__name__} hasn't implemented an async "
-            "aevaluate_string_pairs method."
+        return await asyncio.get_running_loop().run_in_executor(
+            None,
+            partial(
+                self._evaluate_string_pairs,
+                prediction=prediction,
+                prediction_b=prediction_b,
+                reference=reference,
+                input=input,
+                **kwargs,
+            ),
         )

     def evaluate_string_pairs(
@@ -381,9 +400,16 @@ class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
         Returns:
             dict: The evaluation result.
         """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} hasn't implemented an async "
-            "aevaluate_agent_trajectory method."
+        return await asyncio.get_running_loop().run_in_executor(
+            None,
+            partial(
+                self._evaluate_agent_trajectory,
+                prediction=prediction,
+                agent_trajectory=agent_trajectory,
+                reference=reference,
+                input=input,
+                **kwargs,
+            ),
         )

     def evaluate_agent_trajectory(
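The three async fallbacks above share one pattern: instead of raising NotImplementedError, delegate the sync implementation to the default thread-pool executor, so every subclass gets a working aevaluate_* for free. A standalone sketch of that pattern, with a hypothetical blocking _evaluate standing in for the real methods:

import asyncio
from functools import partial


class Evaluator:
    def _evaluate(self, prediction: str) -> dict:
        # Blocking, CPU- or IO-bound work (hypothetical stand-in).
        return {"score": 1}

    async def aevaluate(self, prediction: str) -> dict:
        # Run the sync method on the default executor so the event loop
        # is not blocked while it works.
        return await asyncio.get_running_loop().run_in_executor(
            None, partial(self._evaluate, prediction=prediction)
        )


print(asyncio.run(Evaluator().aevaluate('{"a": 1}')))  # {'score': 1}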
@@ -83,7 +83,9 @@ class RunEvalConfig(BaseModel):
     The language model to pass to any evaluators that use a language model.
     """  # noqa: E501

-    evaluators: List[Union[EvaluatorType, EvalConfig]] = Field(default_factory=list)
+    evaluators: List[Union[EvaluatorType, str, EvalConfig]] = Field(
+        default_factory=list
+    )
     """Configurations for which evaluators to apply to the dataset run.
     Each can be the string of an
     :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
@@ -239,4 +241,22 @@ class RunEvalConfig(BaseModel):
         llm: Optional[BaseLanguageModel] = None
         prompt: Optional[BasePromptTemplate] = None

+    class JsonValidity(EvalConfig):
+        """Configuration for a json validity evaluator.
+
+        Parameters
+        ----------
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.JSON_VALIDITY
+
+    class JsonEqualityEvaluator(EvalConfig):
+        """Configuration for a json equality evaluator.
+
+        Parameters
+        ----------
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY
+
     # TODO: Trajectory
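Because evaluators now also accepts raw strings, a run config can name the new checks directly. A sketch, assuming RunEvalConfig is importable from langchain.smith (the import path is not shown in this diff):

from langchain.smith import RunEvalConfig  # import path assumed, not shown in diff

config = RunEvalConfig(
    evaluators=[
        "json_validity",               # plain string, coerced to EvaluatorType
        RunEvalConfig.JsonValidity(),  # or the equivalent EvalConfig class
    ]
)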
@@ -462,7 +462,7 @@ def _determine_reference_key(


 def _construct_run_evaluator(
-    eval_config: Union[EvaluatorType, EvalConfig],
+    eval_config: Union[EvaluatorType, str, EvalConfig],
     eval_llm: BaseLanguageModel,
     run_type: str,
     data_type: DataType,
@@ -471,7 +471,9 @@ def _construct_run_evaluator(
     input_key: Optional[str],
     prediction_key: Optional[str],
 ) -> RunEvaluator:
-    if isinstance(eval_config, EvaluatorType):
+    if isinstance(eval_config, (EvaluatorType, str)):
+        if not isinstance(eval_config, EvaluatorType):
+            eval_config = EvaluatorType(eval_config)
         evaluator_ = load_evaluator(eval_config, llm=eval_llm)
         eval_type_tag = eval_config.value
     else:
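The string branch works because EvaluatorType subclasses both str and Enum, so a bare value coerces straight to a member:

from enum import Enum


class EvaluatorType(str, Enum):  # mirrors the enum in evaluation/schema.py
    JSON_VALIDITY = "json_validity"


# Calling the enum with a value looks up the matching member.
assert EvaluatorType("json_validity") is EvaluatorType.JSON_VALIDITY
assert EvaluatorType.JSON_VALIDITY.value == "json_validity"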
@@ -1310,7 +1312,7 @@ def _handle_coroutine(coro: Coroutine) -> Any:
     except RuntimeError:  # No event loop
         return asyncio.run(coro)
     if loop.is_running():
-        return loop.create_task(coro)
+        return loop.run_until_complete(coro)
     else:
         return asyncio.run(coro)

177  libs/langchain/tests/unit_tests/evaluation/parsing/test_base.py  Normal file
@@ -0,0 +1,177 @@
import random

import pytest

from langchain.evaluation.parsing.base import (
    JsonEqualityEvaluator,
    JsonValidityEvaluator,
)


@pytest.fixture
def json_validity_evaluator() -> JsonValidityEvaluator:
    return JsonValidityEvaluator()


def test_json_validity_evaluator_requires_input(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    assert json_validity_evaluator.requires_input is False


def test_json_validity_evaluator_requires_reference(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    assert json_validity_evaluator.requires_reference is False


def test_json_validity_evaluator_evaluation_name(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    assert json_validity_evaluator.evaluation_name == "json_validity"


def test_json_validity_evaluator_evaluate_valid_json(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    prediction = '{"name": "John", "age": 30, "city": "New York"}'
    result = json_validity_evaluator.evaluate_strings(prediction=prediction)
    assert result == {"score": 1}


def test_json_validity_evaluator_evaluate_invalid_json(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    prediction = '{"name": "John", "age": 30, "city": "New York",}'
    result = json_validity_evaluator.evaluate_strings(prediction=prediction)
    assert result["score"] == 0
    assert result["reasoning"].startswith(
        "Expecting property name enclosed in double quotes"
    )


@pytest.fixture
def json_equality_evaluator() -> JsonEqualityEvaluator:
    return JsonEqualityEvaluator()


def test_json_equality_evaluator_requires_input(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    assert json_equality_evaluator.requires_input is False


def test_json_equality_evaluator_requires_reference(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    assert json_equality_evaluator.requires_reference is True


def test_json_equality_evaluator_evaluation_name(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    assert json_equality_evaluator.evaluation_name == "json_equality"


def test_json_equality_evaluator_parse_json(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    string = '{"a": 1}'
    result = json_equality_evaluator._parse_json(string)
    assert result == {"a": 1}


def test_json_equality_evaluator_evaluate_strings_equal(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    prediction = '{"a": 1}'
    reference = '{"a": 1}'
    result = json_equality_evaluator.evaluate_strings(
        prediction=prediction, reference=reference
    )
    assert result == {"score": True}


def test_json_equality_evaluator_evaluate_strings_not_equal(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    prediction = '{"a": 1}'
    reference = '{"a": 2}'
    result = json_equality_evaluator.evaluate_strings(
        prediction=prediction, reference=reference
    )
    assert result == {"score": False}


def test_json_equality_evaluator_evaluate_strings_custom_operator_equal() -> None:
    def operator(x: dict, y: dict) -> bool:
        return x["a"] == y["a"]

    evaluator = JsonEqualityEvaluator(operator=operator)
    prediction = '{"a": 1, "b": 2}'
    reference = '{"a": 1, "c": 3}'
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": True}


def test_json_equality_evaluator_evaluate_strings_custom_operator_not_equal() -> None:
    def operator(x: dict, y: dict) -> bool:
        return x["a"] == y["a"]

    evaluator = JsonEqualityEvaluator(operator=operator)
    prediction = '{"a": 1}'
    reference = '{"a": 2}'
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": False}


def test_json_equality_evaluator_evaluate_lists_permutation_invariant() -> None:
    evaluator = JsonEqualityEvaluator()
    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
    reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 2}]'
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": True}

    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
    reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 4}]'
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": False}

    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
    reference = '[{"a": 2, "b": 3}]'
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": False}

    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
    reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 2}, {"a": 3, "b": 4}]'
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": False}

    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
    reference = '[{"a": 2, "b": 3}, {"b": 2,"a": 1}, {"a": 3, "b": 4}]'
    result = evaluator.evaluate_strings(prediction=reference, reference=prediction)
    assert result == {"score": False}

    # Limit tests
    prediction = (
        "[" + ",".join([f'{{"a": {i}, "b": {i+1}}}' for i in range(1000)]) + "]"
    )
    rlist = [f'{{"a": {i}, "b": {i+1}}}' for i in range(1000)]
    random.shuffle(rlist)
    reference = "[" + ",".join(rlist) + "]"
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": True}

    prediction = (
        "[" + ",".join([f'{{"b": {i+1}, "a": {i}}}' for i in range(1000)]) + "]"
    )
    reference = (
        "["
        + ",".join(
            [f'{{"a": {i+1}, "b": {i+2}}}' for i in range(999)]
            + ['{"a": 1000, "b": 1001}']
        )
        + "]"
    )
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": False}
@@ -40,6 +40,7 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
             EvaluatorType.LABELED_CRITERIA,
             EvaluatorType.LABELED_PAIRWISE_STRING,
         ],
+        [EvaluatorType.JSON_EQUALITY],
     ],
 )
 def test_eval_chain_requires_references(evaluator_types: List[EvaluatorType]) -> None: