Add a Pairwise Comparison Chain (#6703)

Notebook shows preference scoring between two chains and reports the Wilson score interval and p-value. I think I'll add the option to insert ground-truth labels, but that doesn't have to be in this PR.
2025-09-13 13:36:15 +00:00 · 2023-06-26 20:47:41 -07:00
parent 2928b080f6
commit cc60fed3be
8 changed files with 888 additions and 5 deletions
--- a/tests/unit_tests/evaluation/comparison/__init__.py
+++ b/tests/unit_tests/evaluation/comparison/__init__.py
--- a/tests/unit_tests/evaluation/comparison/test_eval_chain.py
+++ b/tests/unit_tests/evaluation/comparison/test_eval_chain.py
@@ -0,0 +1,39 @@
+"""Test the comparison chains."""
+
+
+from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain
+from tests.unit_tests.llms.fake_llm import FakeLLM
+
+
+def test_pairwise_string_comparison_chain() -> None:  # checks verdict tag -> value/score
+    llm = FakeLLM(
+        queries={  # NOTE(review): keys look like placeholders - confirm they are unused
+            "a": "The values are the same.\n[[C]]",  # expected by the 1st call below
+            "b": "A is clearly better than b.\n[[A]]",  # expected by the 2nd call below
+            "c": "B is clearly better than a.\n[[B]]",  # expected by the 3rd call below
+        },
+        sequential_responses=True,  # presumably replays replies in order - TODO confirm
+    )
+    chain = PairwiseStringEvalChain.from_llm(llm=llm)
+    res = chain.evaluate_string_pairs(
+        output_a="I like pie.",
+        output_b="I love pie.",
+        input="What is your favorite food?",
+    )
+    assert res["value"] is None  # a [[C]] verdict names no winner
+    assert res["score"] == 0.5  # and is scored as a tie
+    assert res["reasoning"] == "The values are the same."  # text preceding the tag
+    res = chain.evaluate_string_pairs(
+        output_a="I like pie.",
+        output_b="I like pie.",  # identical outputs; the canned reply decides the result
+        input="What is your favorite food?",
+    )
+    assert res["value"] == "A"  # [[A]] verdict
+    assert res["score"] == 1  # A preferred
+    res = chain.evaluate_string_pairs(
+        output_a="I like pie.",
+        output_b="I hate pie.",
+        input="What is your favorite food?",
+    )
+    assert res["value"] == "B"  # [[B]] verdict
+    assert res["score"] == 0  # B preferred