Make pairwise comparison chain more like LLM as a judge (#11013)
- **Description:** Adds LLM-as-a-judge as an eval chain (pairwise comparison).
- **Tag maintainer:** @hwchase17

---------

Co-authored-by: William FH <13333726+hinthornw@users.noreply.github.com>
commit 64385c4eae (parent 175ef0a55d)
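For context, here is a minimal usage sketch (illustrative, not part of this commit's diff) of the pairwise string evaluator that this change turns into an LLM-as-a-judge chain. It assumes the standard `load_evaluator` entry point, an `OPENAI_API_KEY` in the environment, and GPT-4 access; the example question and answers are made up.

```python
# Sketch only: typical invocation of the pairwise "LLM as a judge" evaluator.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

judge = ChatOpenAI(model="gpt-4", temperature=0)  # GPT-4 is the model this chain was tested with
evaluator = load_evaluator("pairwise_string", llm=judge)

result = evaluator.evaluate_string_pairs(
    prediction="Paris is the capital of France.",   # output of model A
    prediction_b="The capital of France is Lyon.",  # output of model B
    input="What is the capital of France?",
)

# After this change, "reasoning" carries the judge's full output, while
# "value"/"score" encode the [[A]]/[[B]]/[[C]] verdict.
print(result["value"], result["score"])
print(result["reasoning"])
```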
langchain/evaluation/comparison/eval_chain.py

@@ -1,12 +1,20 @@
 """Base classes for comparing the output of two models."""
 from __future__ import annotations
 
+import logging
+import re
 from typing import Any, Dict, List, Optional, Union
 
 from langchain.callbacks.manager import Callbacks
 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
-from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
+from langchain.chat_models.azure_openai import AzureChatOpenAI
+from langchain.chat_models.openai import ChatOpenAI
+from langchain.evaluation.comparison.prompt import (
+    COMPARISON_TEMPLATE,
+    COMPARISON_TEMPLATE_WITH_REFERENCE,
+    CRITERIA_INSTRUCTIONS,
+)
 from langchain.evaluation.criteria.eval_chain import (
     CRITERIA_TYPE,
     Criteria,
@@ -17,6 +25,10 @@ from langchain.pydantic_v1 import Extra, Field
 from langchain.schema import RUN_KEY, BaseOutputParser
 from langchain.schema.language_model import BaseLanguageModel
 
+logger = logging.getLogger(__name__)
+
+_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")
+
 _SUPPORTED_CRITERIA = {
     Criteria.CONCISENESS: "Is the submission concise and to the point?",
     Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
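As a quick illustration (not taken from the diff), the new `_FIND_DOUBLE_BRACKETS` pattern pulls the verdict out of a double-bracketed token anywhere in the judge's output; the sample text below is invented.

```python
import re

_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")

text = "Assistant A answers the question directly and accurately.\n\n[[A]]"
match = _FIND_DOUBLE_BRACKETS.search(text)
print(match.group(1) if match else None)  # -> "A"
```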
@@ -112,27 +124,26 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
             ValueError: If the verdict is invalid.
 
         """
-        parsed = text.strip().rsplit("\n", maxsplit=1)
-        if len(parsed) == 1:
-            reasoning = ""
-            verdict = parsed[0]
-        else:
-            reasoning, verdict = parsed
-        verdict = verdict.strip("[").strip("]")
-        if verdict not in {"A", "B", "C"}:
+        match = _FIND_DOUBLE_BRACKETS.search(text)
+
+        if match:
+            verdict = match.group(1)
+
+        if not match or verdict not in {"A", "B", "C"}:
             raise ValueError(
-                f"Invalid verdict: {verdict}. "
-                "Verdict must be one of 'A', 'B', or 'C'."
+                f"Invalid output: {text}. "
+                "Output must contain a double bracketed string\
+                with the verdict 'A', 'B', or 'C'."
             )
         # C means the models are tied. Return 'None' meaning no preference
         verdict_ = None if verdict == "C" else verdict
         score = {
             "A": 1,
             "B": 0,
-            None: 0.5,
-        }.get(verdict_)
+            "C": 0.5,
+        }[verdict]
         return {
-            "reasoning": reasoning,
+            "reasoning": text,
             "value": verdict_,
             "score": score,
         }
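A small sketch of the parser's new behavior (illustrative, mirroring the unit-test changes further down): the entire judge output is returned as `reasoning`, and the double-bracketed verdict is mapped to `value` and `score`.

```python
from langchain.evaluation.comparison.eval_chain import PairwiseStringResultOutputParser

parser = PairwiseStringResultOutputParser()

text = "Both answers are correct, but Assistant A is more concise.\n\n[[A]]"
result = parser.parse(text)
# result["reasoning"] is the full judge output, result["value"] is "A",
# and result["score"] is 1. A tie ("[[C]]") maps to value=None and score=0.5;
# output without a double-bracketed A/B/C verdict raises a ValueError.
print(result["value"], result["score"])
```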
@@ -225,7 +236,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
         """Initialize the PairwiseStringEvalChain from an LLM.
 
         Args:
-            llm (BaseLanguageModel): The LLM to use.
+            llm (BaseChatModel): The LLM to use (GPT-4 recommended).
             prompt (PromptTemplate, optional): The prompt to use.
             **kwargs (Any): Additional keyword arguments.
 
@@ -236,8 +247,17 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
             ValueError: If the input variables are not as expected.
 
         """
+        if not (
+            isinstance(llm, (ChatOpenAI, AzureChatOpenAI))
+            and llm.model_name.startswith("gpt-4")
+        ):
+            logger.warning(
+                "This chain was only tested with GPT-4. \
+Performance may be significantly worse with other models."
+            )
+
         expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
-        prompt_ = prompt or PROMPT
+        prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -245,6 +265,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
             )
         criteria_ = resolve_pairwise_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
+        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
 
     def _prepare_input(
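For reference, a hedged sketch of constructing the chain directly via `from_llm` with custom criteria; the criteria value and example strings are invented, and GPT-4 access is assumed (a non-GPT-4 model would only trigger the new warning, not an error).

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain

# The criteria string is resolved via resolve_pairwise_criteria, joined, and
# prefixed with CRITERIA_INSTRUCTIONS before being partially applied to the prompt.
chain = PairwiseStringEvalChain.from_llm(
    llm=ChatOpenAI(model="gpt-4", temperature=0),
    criteria="conciseness",
)
result = chain.evaluate_string_pairs(
    prediction="Use `dict.get(key, default)`.",
    prediction_b="You could perhaps consider maybe using the get method that dictionaries have.",
    input="How do I read a dict key with a fallback value in Python?",
)
print(result["value"])  # "A" or "B" (None on a tie)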
@@ -418,7 +439,7 @@ class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
             "reference",
             "criteria",
         }
-        prompt_ = prompt or PROMPT_WITH_REFERENCE
+        prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -426,4 +447,5 @@ class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
             )
         criteria_ = resolve_pairwise_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
+        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
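Similarly, a sketch (illustrative only, with invented example data) of the reference-aware variant, which now routes through `COMPARISON_TEMPLATE_WITH_REFERENCE` and is reachable via `load_evaluator("labeled_pairwise_string")`.

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(
    "labeled_pairwise_string",
    llm=ChatOpenAI(model="gpt-4", temperature=0),
)
result = evaluator.evaluate_string_pairs(
    prediction="The Eiffel Tower was completed in 1889.",
    prediction_b="The Eiffel Tower was completed in 1901.",
    input="When was the Eiffel Tower completed?",
    reference="1889",  # ground-truth answer injected into the judge's human message
)
print(result["score"])  # likely 1, since Assistant A matches the reference
```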
langchain/evaluation/comparison/prompt.py

@@ -5,64 +5,55 @@ and answers the question. The prompt is based on the paper from
 Zheng, et. al. https://arxiv.org/abs/2306.05685
 """
 # flake8: noqa
-from langchain.prompts import PromptTemplate
+from langchain.prompts.chat import ChatPromptTemplate
 
-template = """Act as a fair judge and rate the two responses to the question below.\
-Choose the response that best followed the instructions and answered the question.\
-Your assessment should weigh the following criteria:
-{criteria}\
-Start by comparing both responses and give a brief rationale.\
-Avoid bias from the order of presentation or response length.
-After giving your rationale, make your final decision using this format:\
-"[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
-and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.
+SYSTEM_MESSAGE = 'Please act as an impartial judge and evaluate the quality \
+of the responses provided by two AI assistants to the user question displayed below. \
+You should choose the assistant that follows the user\'s instructions \
+and answers \the user\'s question better. \
+Your evaluation should consider factors such as the \
+helpfulness, relevance, accuracy, depth, creativity, \
+and level of detail of their responses. \
+Begin your evaluation by comparing the two responses and provide a short explanation. \
+Avoid any position biases and ensure that the order in which \
+the responses were presented does not influence your decision. \
+Do not allow the length of the responses to influence your evaluation. \
+Do not favor certain names of the assistants. Be as objective as possible. \
+After providing your explanation, output your final verdict by strictly following \
+this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, \
+and "[[C]]" for a tie.'
 
-[QUESTION]
-{input}
-[/QUESTION]
-
-[RESPONSE A]
-{prediction}
-[/RESPONSE A]
-
-[RESPONSE B]
-{prediction_b}
-[/RESPONSE B]"""
-PROMPT = PromptTemplate(
-    input_variables=["input", "prediction", "prediction_b", "criteria"],
-    template=template,
+CRITERIA_INSTRUCTIONS = (
+    "For this evaluation, you should primarily consider the following criteria:\n"
 )
 
-template = """Act as a fair judge and rate the two responses to the question below.\
-Choose the response that best followed the instructions and answered the question.\
-Your assessment should weigh the following criteria:
-{criteria}\
-Start by comparing both responses and give a brief rationale.\
-Avoid bias from the order of presentation or response length.\
-Weigh accuracy based on the following ground truth reference\
-answer to the question:
-
-[REFERENCE]
-{reference}
-[/REFERENCE]
-
-After giving your rationale, make your final decision using this format:\
-"[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
-and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.
-
-[QUESTION]
-{input}
-[/QUESTION]
-
-[RESPONSE A]
-{prediction}
-[/RESPONSE A]
-
-[RESPONSE B]
-{prediction_b}
-[/RESPONSE B]"""
-
-PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "prediction", "prediction_b", "reference", "criteria"],
-    template=template,
+COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
+    [
+        ("system", SYSTEM_MESSAGE),
+        (
+            "human",
+            "{criteria}[User Question]\n{input}\n\n\
+[The Start of Assistant A's Answer]\n{prediction}\n\
+[The End of Assistant A's Answer]\
+\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
+[The End of Assistant B's Answer]",
+        ),
+    ]
 )
+
+COMPARISON_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
+    [
+        ("system", SYSTEM_MESSAGE),
+        (
+            "human",
+            "{criteria}\n\nTo help you evaluate the responses, \
+here is a reference answer to the user's question:\n\
+{reference}\
+[User Question]\n{input}\n\n\
+[The Start of Assistant A's Answer]\n{prediction}\n\
+[The End of Assistant A's Answer]\
+\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
+[The End of Assistant B's Answer]",
+        ),
+    ]
+)
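A quick sketch (not part of the diff) of what the new chat template renders to; the exact whitespace follows from the string continuations above, and the sample inputs are invented.

```python
from langchain.evaluation.comparison.prompt import COMPARISON_TEMPLATE

messages = COMPARISON_TEMPLATE.format_messages(
    criteria="",  # from_llm fills this with CRITERIA_INSTRUCTIONS plus the criteria, or ""
    input="What is 2 + 2?",
    prediction="4",
    prediction_b="22",
)
for message in messages:
    print(type(message).__name__)   # SystemMessage, then HumanMessage
    print(message.content[:80])
```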
tests/unit_tests/evaluation/comparison/test_eval_chain.py

@@ -34,7 +34,7 @@ def test_PairwiseStringResultOutputParser_parse() -> None:
 [[A]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like pie better than cake.",
+        "reasoning": text,
         "value": "A",
         "score": 1,
     }
@@ -46,7 +46,7 @@ def test_PairwiseStringResultOutputParser_parse() -> None:
 [[B]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like cake better than pie.",
+        "reasoning": text,
         "value": "B",
         "score": 0,
     }
@@ -58,7 +58,7 @@ def test_PairwiseStringResultOutputParser_parse() -> None:
 [[C]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like cake and pie.",
+        "reasoning": text,
         "value": None,
         "score": 0.5,
     }
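Putting the parser edge cases together (an illustrative aside, consistent with the tests above and the updated `parse` logic):

```python
from langchain.evaluation.comparison.eval_chain import PairwiseStringResultOutputParser

parser = PairwiseStringResultOutputParser()

tie = parser.parse("I like cake and pie.\n[[C]]")
assert tie["value"] is None and tie["score"] == 0.5  # ties map to no preference

try:
    parser.parse("No verdict was given.")
except ValueError:
    pass  # output without a [[A]]/[[B]]/[[C]] token is rejected
```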
@@ -84,7 +84,7 @@ def test_pairwise_string_comparison_chain() -> None:
     )
     assert res["value"] is None
     assert res["score"] == 0.5
-    assert res["reasoning"] == "The values are the same."
+    assert res["reasoning"] == "The values are the same.\n[[C]]"
     res = chain.evaluate_string_pairs(
         prediction="I like pie.",
         prediction_b="I like pie.",