diff --git a/libs/langchain/langchain/evaluation/comparison/eval_chain.py b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
index d308300942e..3ca215324b5 100644
--- a/libs/langchain/langchain/evaluation/comparison/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
@@ -100,14 +100,14 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
         """
         return "pairwise_string_result"
 
-    def parse(self, text: str) -> Any:
+    def parse(self, text: str) -> Dict[str, Any]:
         """Parse the output text.
 
         Args:
             text (str): The output text to parse.
 
         Returns:
-            Any: The parsed output.
+            Dict: The parsed output.
 
         Raises:
             ValueError: If the verdict is invalid.
diff --git a/libs/langchain/langchain/evaluation/criteria/eval_chain.py b/libs/langchain/langchain/evaluation/criteria/eval_chain.py
index 4de0dc43ab5..09359cf79ed 100644
--- a/libs/langchain/langchain/evaluation/criteria/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/criteria/eval_chain.py
@@ -65,14 +65,14 @@ class CriteriaResultOutputParser(BaseOutputParser[dict]):
     def _type(self) -> str:
         return "criteria_result"
 
-    def parse(self, text: str) -> Any:
+    def parse(self, text: str) -> Dict[str, Any]:
         """Parse the output text.
 
         Args:
             text (str): The output text to parse.
 
         Returns:
-            Any: The parsed output.
+            Dict: The parsed output.
         """
         parsed = text.strip().rsplit("\n", maxsplit=1)
         if len(parsed) == 1:
diff --git a/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py b/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
index 2839a7208ff..269d3caf2b7 100644
--- a/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
+++ b/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
@@ -8,6 +8,7 @@ import pytest
 from langchain.evaluation.comparison.eval_chain import (
     LabeledPairwiseStringEvalChain,
     PairwiseStringEvalChain,
+    PairwiseStringResultOutputParser,
     resolve_pairwise_criteria,
 )
 from langchain.evaluation.criteria.eval_chain import Criteria
@@ -27,6 +28,45 @@ def test_resolve_criteria_list_enum() -> None:
     assert set(val.keys()) == set(c.value for c in list(Criteria))
 
 
+def test_PairwiseStringResultOutputParser_parse() -> None:
+    output_parser = PairwiseStringResultOutputParser()
+    text = """I like pie better than cake.
+[[A]]"""
+    got = output_parser.parse(text)
+    want = {
+        "reasoning": "I like pie better than cake.",
+        "value": "A",
+        "score": 1,
+    }
+    assert got.get("reasoning") == want["reasoning"]
+    assert got.get("value") == want["value"]
+    assert got.get("score") == want["score"]
+
+    text = """I like cake better than pie.
+[[B]]"""
+    got = output_parser.parse(text)
+    want = {
+        "reasoning": "I like cake better than pie.",
+        "value": "B",
+        "score": 0,
+    }
+    assert got.get("reasoning") == want["reasoning"]
+    assert got.get("value") == want["value"]
+    assert got.get("score") == want["score"]
+
+    text = """I like cake and pie.
+[[C]]"""
+    got = output_parser.parse(text)
+    want = {
+        "reasoning": "I like cake and pie.",
+        "value": None,
+        "score": 0.5,
+    }
+    assert got.get("reasoning") == want["reasoning"]
+    assert got.get("value") == want["value"]
+    assert got.get("score") == want["score"]
+
+
 def test_pairwise_string_comparison_chain() -> None:
     llm = FakeLLM(
         queries={
diff --git a/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py b/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py
index d0ea4731a94..cc1833f42b5 100644
--- a/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py
+++ b/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py
@@ -7,6 +7,7 @@ from langchain.evaluation.criteria.eval_chain import (
     _SUPPORTED_CRITERIA,
     Criteria,
     CriteriaEvalChain,
+    CriteriaResultOutputParser,
     LabeledCriteriaEvalChain,
 )
 from langchain.evaluation.schema import StringEvaluator
@@ -23,6 +24,34 @@ def test_resolve_criteria_str() -> None:
     }
 
 
+def test_CriteriaResultOutputParser_parse() -> None:
+    output_parser = CriteriaResultOutputParser()
+    text = """Here is my step-by-step reasoning for the given criteria:
+The criterion is: "Do you like cake?" I like cake.
+Y"""
+    got = output_parser.parse(text)
+    want = {
+        "reasoning": """Here is my step-by-step reasoning for the given criteria:
+The criterion is: "Do you like cake?" I like cake.""",
+        "value": "Y",
+        "score": 1,
+    }
+    assert got.get("reasoning") == want["reasoning"]
+    assert got.get("value") == want["value"]
+    assert got.get("score") == want["score"]
+
+    text = "Y"
+    got = output_parser.parse(text)
+    want = {
+        "reasoning": "",
+        "value": "Y",
+        "score": 1,
+    }
+    assert got.get("reasoning") == want["reasoning"]
+    assert got.get("value") == want["value"]
+    assert got.get("score") == want["score"]
+
+
 @pytest.mark.parametrize("criterion", list(Criteria))
 def test_resolve_criteria_enum(criterion: Criteria) -> None:
     assert CriteriaEvalChain.resolve_criteria(criterion) == {