From 2519580994d95f325ae80cb56f77f22a7157b710 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Tue, 15 Aug 2023 17:17:32 -0700
Subject: [PATCH] Add Schema Evals (#9228)

Simple eval checks for whether a generation is valid json and whether it
matches an expected dict
---
 .../langchain/langchain/evaluation/loading.py |  16 +-
 .../langchain/evaluation/parsing/__init__.py  |   0
 .../langchain/evaluation/parsing/base.py      | 153 +++++++++++++++
 libs/langchain/langchain/evaluation/schema.py |  46 ++++-
 .../langchain/smith/evaluation/config.py      |  22 ++-
 .../smith/evaluation/runner_utils.py          |   8 +-
 .../unit_tests/evaluation/parsing/__init__.py |   0
 .../evaluation/parsing/test_base.py           | 177 ++++++++++++++++++
 .../unit_tests/evaluation/test_loading.py     |   1 +
 9 files changed, 405 insertions(+), 18 deletions(-)
 create mode 100644 libs/langchain/langchain/evaluation/parsing/__init__.py
 create mode 100644 libs/langchain/langchain/evaluation/parsing/base.py
 create mode 100644 libs/langchain/tests/unit_tests/evaluation/parsing/__init__.py
 create mode 100644 libs/langchain/tests/unit_tests/evaluation/parsing/test_base.py

diff --git a/libs/langchain/langchain/evaluation/loading.py b/libs/langchain/langchain/evaluation/loading.py
index c608c259a34..81ffec6d797 100644
--- a/libs/langchain/langchain/evaluation/loading.py
+++ b/libs/langchain/langchain/evaluation/loading.py
@@ -14,8 +14,12 @@ from langchain.evaluation.embedding_distance.base import (
     EmbeddingDistanceEvalChain,
     PairwiseEmbeddingDistanceEvalChain,
 )
+from langchain.evaluation.parsing.base import (
+    JsonEqualityEvaluator,
+    JsonValidityEvaluator,
+)
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
-from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
+from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
 from langchain.evaluation.string_distance.base import (
     PairwiseStringDistanceEvalChain,
     StringDistanceEvalChain,
@@ -57,7 +61,9 @@ def load_dataset(uri: str) -> List[Dict]:
     return [d for d in dataset["train"]]
 
 
-_EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
+_EVALUATOR_MAP: Dict[
+    EvaluatorType, Union[Type[LLMEvalChain], Type[Chain], Type[StringEvaluator]]
+] = {
     EvaluatorType.QA: QAEvalChain,
     EvaluatorType.COT_QA: CotQAEvalChain,
     EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
@@ -70,6 +76,8 @@ _EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
     EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
     EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,
     EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
+    EvaluatorType.JSON_VALIDITY: JsonValidityEvaluator,
+    EvaluatorType.JSON_EQUALITY: JsonEqualityEvaluator,
 }
 
 
@@ -78,7 +86,7 @@ def load_evaluator(
     *,
     llm: Optional[BaseLanguageModel] = None,
     **kwargs: Any,
-) -> Chain:
+) -> Union[Chain, StringEvaluator]:
     """Load the requested evaluation chain specified by a string.
 
     Parameters
@@ -119,7 +127,7 @@ def load_evaluators(
     llm: Optional[BaseLanguageModel] = None,
     config: Optional[dict] = None,
     **kwargs: Any,
-) -> List[Chain]:
+) -> List[Union[Chain, StringEvaluator]]:
     """Load evaluators specified by a list of evaluator types.
 
     Parameters
diff --git a/libs/langchain/langchain/evaluation/parsing/__init__.py b/libs/langchain/langchain/evaluation/parsing/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/libs/langchain/langchain/evaluation/parsing/base.py b/libs/langchain/langchain/evaluation/parsing/base.py
new file mode 100644
index 00000000000..d2233b51481
--- /dev/null
+++ b/libs/langchain/langchain/evaluation/parsing/base.py
@@ -0,0 +1,153 @@
+"""Evaluators for parsing strings."""
+from operator import eq
+from typing import Any, Callable, Optional, Union, cast
+
+from langchain.evaluation.schema import StringEvaluator
+from langchain.output_parsers.json import parse_json_markdown
+
+
+class JsonValidityEvaluator(StringEvaluator):
+    """Evaluates whether the prediction is valid JSON.
+
+    This evaluator checks if the prediction is a valid JSON string. It does not
+    require any input or reference.
+
+    Attributes:
+        requires_input (bool): Whether this evaluator requires an input
+            string. Always False.
+        requires_reference (bool): Whether this evaluator requires a
+            reference string. Always False.
+        evaluation_name (str): The name of the evaluation metric.
+            Always "json_validity".
+
+    Examples:
+        >>> evaluator = JsonValidityEvaluator()
+        >>> prediction = '{"name": "John", "age": 30, "city": "New York"}'
+        >>> evaluator.evaluate_strings(prediction=prediction)
+        {'score': 1}
+
+        >>> prediction = '{"name": "John", "age": 30, "city": "New York",}'
+        >>> evaluator.evaluate_strings(prediction=prediction)
+        {'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes'}
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__()
+
+    @property
+    def requires_input(self) -> bool:
+        return False
+
+    @property
+    def requires_reference(self) -> bool:
+        return False
+
+    @property
+    def evaluation_name(self) -> str:
+        return "json_validity"
+
+    def _evaluate_strings(
+        self,
+        prediction: str,
+        input: Optional[str] = None,
+        reference: Optional[str] = None,
+        **kwargs: Any
+    ) -> dict:
+        """Evaluate the prediction string.
+
+        Args:
+            prediction (str): The prediction string to evaluate.
+            input (str, optional): Not used in this evaluator. Defaults to None.
+            reference (str, optional): Not used in this evaluator. Defaults to None.
+
+        Returns:
+            dict: A dictionary containing the evaluation score. The score is 1 if
+                the prediction is valid JSON, and 0 otherwise.
+                If the prediction is not valid JSON, the dictionary also contains
+                a "reasoning" field with the error message.
+
+        """
+        try:
+            parse_json_markdown(prediction)
+            return {"score": 1}
+        except Exception as e:
+            return {"score": 0, "reasoning": str(e)}
+
+
+class JsonEqualityEvaluator(StringEvaluator):
+    """Evaluates whether the prediction is equal to the reference after
+    parsing both as JSON.
+
+    This evaluator checks if the prediction, after parsing as JSON, is equal
+    to the reference, which is also parsed as JSON. It does not require an
+    input string.
+
+    Attributes:
+        requires_input (bool): Whether this evaluator requires an
+            input string. Always False.
+        requires_reference (bool): Whether this evaluator requires
+            a reference string. Always True.
+        evaluation_name (str): The name of the evaluation metric.
+            Always "json_equality".
+
+    Examples:
+        >>> evaluator = JsonEqualityEvaluator()
+        >>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 1}')
+        {'score': True}
+        >>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 2}')
+        {'score': False}
+
+        >>> evaluator = JsonEqualityEvaluator(operator=lambda x, y: x['a'] == y['a'])
+        >>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 1}')
+        {'score': True}
+        >>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 2}')
+        {'score': False}
+
+    """
+
+    def __init__(self, operator: Optional[Callable] = None, **kwargs: Any) -> None:
+        super().__init__()
+        self.operator = operator or eq
+
+    @property
+    def requires_input(self) -> bool:
+        return False
+
+    @property
+    def requires_reference(self) -> bool:
+        return True
+
+    @property
+    def evaluation_name(self) -> str:
+        return "json_equality"
+
+    def _parse_json(
+        self, string: str
+    ) -> Union[dict, list, None, float, bool, int, str]:
+        return parse_json_markdown(string)
+
+    def _evaluate_strings(
+        self,
+        prediction: str,
+        input: Optional[str] = None,
+        reference: Optional[str] = None,
+        **kwargs: Any
+    ) -> dict:
+        """Evaluate the prediction string.
+
+        Args:
+            prediction (str): The prediction string to evaluate.
+            input (str, optional): Not used in this evaluator.
+            reference (str): The reference string to compare against.
+
+        Returns:
+            dict: A dictionary containing the evaluation score.
+        """
+        parsed = self._parse_json(prediction)
+        label = self._parse_json(cast(str, reference))
+        if isinstance(label, list):
+            if not isinstance(parsed, list):
+                return {"score": 0}
+            parsed = sorted(parsed, key=lambda x: str(x))
+            label = sorted(label, key=lambda x: str(x))
+        return {"score": self.operator(parsed, label)}
diff --git a/libs/langchain/langchain/evaluation/schema.py b/libs/langchain/langchain/evaluation/schema.py
index ef6344fd460..f08402c8c28 100644
--- a/libs/langchain/langchain/evaluation/schema.py
+++ b/libs/langchain/langchain/evaluation/schema.py
@@ -1,9 +1,11 @@
 """Interfaces to be implemented by general evaluators."""
 from __future__ import annotations
 
+import asyncio
 import logging
 from abc import ABC, abstractmethod
 from enum import Enum
+from functools import partial
 from typing import Any, Optional, Sequence, Tuple
 from warnings import warn
 
@@ -48,6 +50,10 @@ class EvaluatorType(str, Enum):
     """Compare a prediction to a reference label using embedding distance."""
     PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
     """Compare two predictions using embedding distance."""
+    JSON_VALIDITY = "json_validity"
+    """Check if a prediction is valid JSON."""
+    JSON_EQUALITY = "json_equality"
+    """Check if a prediction is equal to a reference JSON."""
 
 
 class LLMEvalChain(Chain):
@@ -115,7 +121,7 @@ class StringEvaluator(_EvalArgsMixin, ABC):
     @property
     def evaluation_name(self) -> str:
         """The name of the evaluation."""
-        raise NotImplementedError()
+        return self.__class__.__name__
 
     @property
     def requires_reference(self) -> bool:
@@ -168,9 +174,15 @@ class StringEvaluator(_EvalArgsMixin, ABC):
             - value: the string value of the evaluation, if applicable.
             - reasoning: the reasoning for the evaluation, if applicable.
         """  # noqa: E501
-        raise NotImplementedError(
-            f"{self.__class__.__name__} hasn't implemented an async "
-            "aevaluate_strings method."
+        return await asyncio.get_running_loop().run_in_executor(
+            None,
+            partial(
+                self._evaluate_strings,
+                prediction=prediction,
+                reference=reference,
+                input=input,
+                **kwargs,
+            ),
         )
 
     def evaluate_strings(
@@ -265,9 +277,16 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
         Returns:
             dict: A dictionary containing the preference, scores, and/or other information.
         """  # noqa: E501
-        raise NotImplementedError(
-            f"{self.__class__.__name__} hasn't implemented an async "
-            "aevaluate_string_pairs method."
+        return await asyncio.get_running_loop().run_in_executor(
+            None,
+            partial(
+                self._evaluate_string_pairs,
+                prediction=prediction,
+                prediction_b=prediction_b,
+                reference=reference,
+                input=input,
+                **kwargs,
+            ),
         )
 
     def evaluate_string_pairs(
@@ -381,9 +400,16 @@ class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
         Returns:
             dict: The evaluation result.
         """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} hasn't implemented an async "
-            "aevaluate_agent_trajectory method."
+        return await asyncio.get_running_loop().run_in_executor(
+            None,
+            partial(
+                self._evaluate_agent_trajectory,
+                prediction=prediction,
+                agent_trajectory=agent_trajectory,
+                reference=reference,
+                input=input,
+                **kwargs,
+            ),
         )
 
     def evaluate_agent_trajectory(
diff --git a/libs/langchain/langchain/smith/evaluation/config.py b/libs/langchain/langchain/smith/evaluation/config.py
index 3bddffd2cbe..f369a20769b 100644
--- a/libs/langchain/langchain/smith/evaluation/config.py
+++ b/libs/langchain/langchain/smith/evaluation/config.py
@@ -83,7 +83,9 @@ class RunEvalConfig(BaseModel):
         The language model to pass to any evaluators that use a language model.
     """  # noqa: E501
 
-    evaluators: List[Union[EvaluatorType, EvalConfig]] = Field(default_factory=list)
+    evaluators: List[Union[EvaluatorType, str, EvalConfig]] = Field(
+        default_factory=list
+    )
     """Configurations for which evaluators to apply to the dataset run.
     Each can be the string of an :class:`EvaluatorType`, such
@@ -239,4 +241,22 @@ class RunEvalConfig(BaseModel):
         llm: Optional[BaseLanguageModel] = None
         prompt: Optional[BasePromptTemplate] = None
 
+    class JsonValidity(EvalConfig):
+        """Configuration for a json validity evaluator.
+
+        Parameters
+        ----------
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.JSON_VALIDITY
+
+    class JsonEqualityEvaluator(EvalConfig):
+        """Configuration for a json equality evaluator.
+
+        Parameters
+        ----------
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY
+
 # TODO: Trajectory
diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index 88cd3f89860..64139f95e30 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -462,7 +462,7 @@ def _determine_reference_key(
 
 
 def _construct_run_evaluator(
-    eval_config: Union[EvaluatorType, EvalConfig],
+    eval_config: Union[EvaluatorType, str, EvalConfig],
     eval_llm: BaseLanguageModel,
     run_type: str,
     data_type: DataType,
@@ -471,7 +471,9 @@ def _construct_run_evaluator(
     input_key: Optional[str],
     prediction_key: Optional[str],
 ) -> RunEvaluator:
-    if isinstance(eval_config, EvaluatorType):
+    if isinstance(eval_config, (EvaluatorType, str)):
+        if not isinstance(eval_config, EvaluatorType):
+            eval_config = EvaluatorType(eval_config)
         evaluator_ = load_evaluator(eval_config, llm=eval_llm)
         eval_type_tag = eval_config.value
     else:
@@ -1310,7 +1312,7 @@ def _handle_coroutine(coro: Coroutine) -> Any:
     except RuntimeError:  # No event loop
         return asyncio.run(coro)
     if loop.is_running():
-        return loop.create_task(coro)
+        return loop.run_until_complete(coro)
     else:
         return asyncio.run(coro)
 
diff --git a/libs/langchain/tests/unit_tests/evaluation/parsing/__init__.py b/libs/langchain/tests/unit_tests/evaluation/parsing/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/libs/langchain/tests/unit_tests/evaluation/parsing/test_base.py b/libs/langchain/tests/unit_tests/evaluation/parsing/test_base.py
new file mode 100644
index 00000000000..9a10224f160
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/evaluation/parsing/test_base.py
@@ -0,0 +1,177 @@
+import random
+
+import pytest
+
+from langchain.evaluation.parsing.base import (
+    JsonEqualityEvaluator,
+    JsonValidityEvaluator,
+)
+
+
+@pytest.fixture
+def json_validity_evaluator() -> JsonValidityEvaluator:
+    return JsonValidityEvaluator()
+
+
+def test_json_validity_evaluator_requires_input(
+    json_validity_evaluator: JsonValidityEvaluator,
+) -> None:
+    assert json_validity_evaluator.requires_input is False
+
+
+def test_json_validity_evaluator_requires_reference(
+    json_validity_evaluator: JsonValidityEvaluator,
+) -> None:
+    assert json_validity_evaluator.requires_reference is False
+
+
+def test_json_validity_evaluator_evaluation_name(
+    json_validity_evaluator: JsonValidityEvaluator,
+) -> None:
+    assert json_validity_evaluator.evaluation_name == "json_validity"
+
+
+def test_json_validity_evaluator_evaluate_valid_json(
+    json_validity_evaluator: JsonValidityEvaluator,
+) -> None:
+    prediction = '{"name": "John", "age": 30, "city": "New York"}'
+    result = json_validity_evaluator.evaluate_strings(prediction=prediction)
+    assert result == {"score": 1}
+
+
+def test_json_validity_evaluator_evaluate_invalid_json(
+    json_validity_evaluator: JsonValidityEvaluator,
+) -> None:
+    prediction = '{"name": "John", "age": 30, "city": "New York",}'
+    result = json_validity_evaluator.evaluate_strings(prediction=prediction)
+    assert result["score"] == 0
+    assert result["reasoning"].startswith(
+        "Expecting property name enclosed in double quotes"
+    )
+
+
+@pytest.fixture
+def json_equality_evaluator() -> JsonEqualityEvaluator:
+    return JsonEqualityEvaluator()
+
+
+def test_json_equality_evaluator_requires_input(
+    json_equality_evaluator: JsonEqualityEvaluator,
+) -> None:
+    assert json_equality_evaluator.requires_input is False
+
+
+def test_json_equality_evaluator_requires_reference(
+    json_equality_evaluator: JsonEqualityEvaluator,
+) -> None:
+    assert json_equality_evaluator.requires_reference is True
+
+
+def test_json_equality_evaluator_evaluation_name(
+    json_equality_evaluator: JsonEqualityEvaluator,
+) -> None:
+    assert json_equality_evaluator.evaluation_name == "json_equality"
+
+
+def test_json_equality_evaluator_parse_json(
+    json_equality_evaluator: JsonEqualityEvaluator,
+) -> None:
+    string = '{"a": 1}'
+    result = json_equality_evaluator._parse_json(string)
+    assert result == {"a": 1}
+
+
+def test_json_equality_evaluator_evaluate_strings_equal(
+    json_equality_evaluator: JsonEqualityEvaluator,
+) -> None:
+    prediction = '{"a": 1}'
+    reference = '{"a": 1}'
+    result = json_equality_evaluator.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result == {"score": True}
+
+
+def test_json_equality_evaluator_evaluate_strings_not_equal(
+    json_equality_evaluator: JsonEqualityEvaluator,
+) -> None:
+    prediction = '{"a": 1}'
+    reference = '{"a": 2}'
+    result = json_equality_evaluator.evaluate_strings(
+        prediction=prediction, reference=reference
+    )
+    assert result == {"score": False}
+
+
+def test_json_equality_evaluator_evaluate_strings_custom_operator_equal() -> None:
+    def operator(x: dict, y: dict) -> bool:
+        return x["a"] == y["a"]
+
+    evaluator = JsonEqualityEvaluator(operator=operator)
+    prediction = '{"a": 1, "b": 2}'
+    reference = '{"a": 1, "c": 3}'
+    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
+    assert result == {"score": True}
+
+
+def test_json_equality_evaluator_evaluate_strings_custom_operator_not_equal() -> None:
+    def operator(x: dict, y: dict) -> bool:
+        return x["a"] == y["a"]
+
+    evaluator = JsonEqualityEvaluator(operator=operator)
+    prediction = '{"a": 1}'
+    reference = '{"a": 2}'
+    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
+    assert result == {"score": False}
+
+
+def test_json_equality_evaluator_evaluate_lists_permutation_invariant() -> None:
+    evaluator = JsonEqualityEvaluator()
+    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
+    reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 2}]'
+    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
+    assert result == {"score": True}
+
+    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
+    reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 4}]'
+    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
+    assert result == {"score": False}
+
+    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
+    reference = '[{"a": 2, "b": 3}]'
+    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
+    assert result == {"score": False}
+
+    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
+    reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 2}, {"a": 3, "b": 4}]'
+    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
+    assert result == {"score": False}
+
+    prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
+    reference = '[{"a": 2, "b": 3}, {"b": 2,"a": 1}, {"a": 3, "b": 4}]'
+    result = evaluator.evaluate_strings(prediction=reference, reference=prediction)
+    assert result == {"score": False}
+
+    # Limit tests
+    prediction = (
+        "[" + ",".join([f'{{"a": {i}, "b": {i+1}}}' for i in range(1000)]) + "]"
+    )
+    rlist = [f'{{"a": {i}, "b": {i+1}}}' for i in range(1000)]
+    random.shuffle(rlist)
+    reference = "[" + ",".join(rlist) + "]"
"]" + result = evaluator.evaluate_strings(prediction=prediction, reference=reference) + assert result == {"score": True} + + prediction = ( + "[" + ",".join([f'{{"b": {i+1}, "a": {i}}}' for i in range(1000)]) + "]" + ) + reference = ( + "[" + + ",".join( + [f'{{"a": {i+1}, "b": {i+2}}}' for i in range(999)] + + ['{"a": 1000, "b": 1001}'] + ) + + "]" + ) + result = evaluator.evaluate_strings(prediction=prediction, reference=reference) + assert result == {"score": False} diff --git a/libs/langchain/tests/unit_tests/evaluation/test_loading.py b/libs/langchain/tests/unit_tests/evaluation/test_loading.py index 11715e92503..8b1e5938919 100644 --- a/libs/langchain/tests/unit_tests/evaluation/test_loading.py +++ b/libs/langchain/tests/unit_tests/evaluation/test_loading.py @@ -40,6 +40,7 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None: EvaluatorType.LABELED_CRITERIA, EvaluatorType.LABELED_PAIRWISE_STRING, ], + [EvaluatorType.JSON_EQUALITY], ], ) def test_eval_chain_requires_references(evaluator_types: List[EvaluatorType]) -> None: