Adding an in-context QA evaluation chain + chain of thought reasoning chain for improved accuracy (#2444)

Right now, eval chains require a ground-truth answer for every question. Collecting that ground truth is cumbersome, so this works around the issue in two ways:

* Adding a `context` param in `ContextQAEvalChain` and simply evaluating whether the question is answered accurately from the context (see the usage sketch below)
* Adding chain-of-thought explanation prompting to improve the accuracy of this evaluation without ground truth

This also gets to feature parity with openai/evals, which has the same contextual eval without ground truth.
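
Below is a minimal usage sketch (not part of this diff) of how the new chains are meant to be called. It assumes an `OpenAI` LLM with a configured API key and relies on the default `query`/`context`/`result` keys seen in the tests; the exact grading string returned by the LLM may vary.

```python
from langchain.evaluation.qa.eval_chain import ContextQAEvalChain, CotQAEvalChain
from langchain.llms import OpenAI

# Examples only need the question and the context it should be answered from;
# no ground-truth answer is required.
examples = [
    {
        "query": "What is this person's name?",
        "context": "The name of this person is John Doe.",
    }
]
# Predictions come from whatever QA chain is being evaluated.
predictions = [{"result": "John Doe"}]

eval_chain = ContextQAEvalChain.from_llm(OpenAI(temperature=0))
graded = eval_chain.evaluate(examples, predictions)
print(graded[0]["text"])  # LLM's verdict, e.g. "CORRECT"

# CotQAEvalChain is called the same way, but its prompt asks the model to
# explain its reasoning step by step before grading, which is what improves
# accuracy in the absence of ground truth.
cot_chain = CotQAEvalChain.from_llm(OpenAI(temperature=0))
cot_graded = cot_chain.evaluate(examples, predictions)
```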

TODO in follow-up:
* Better prompt inheritance. There is no need for a separate prompt for CoT reasoning; figure out how to merge the two prompts.

---------

Co-authored-by: Vashisht Madhavan <vashishtmadhavan@Vashs-MacBook-Pro.local>
Commit aa439ac2ff by Vashisht Madhavan, 2023-04-07 01:32:41 -04:00, committed via GitHub (parent e131156805).
7 changed files with 250 additions and 4 deletions.


@@ -0,0 +1 @@
"""New unit tests for the evaluation module."""


@@ -0,0 +1 @@
"""Tests for QA evaluation chains."""


@@ -0,0 +1,46 @@
"""Test LLM Bash functionality."""
import sys
from typing import Type
import pytest
from langchain.evaluation.qa.eval_chain import (
ContextQAEvalChain,
CotQAEvalChain,
QAEvalChain,
)
from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Test not supported on Windows"
)
def test_eval_chain() -> None:
"""Test a simple eval chain."""
example = {"query": "What's my name", "answer": "John Doe"}
prediction = {"result": "John Doe"}
fake_qa_eval_chain = QAEvalChain.from_llm(FakeLLM())
outputs = fake_qa_eval_chain.evaluate([example, example], [prediction, prediction])
assert outputs[0] == outputs[1]
assert "text" in outputs[0]
assert outputs[0]["text"] == "foo"
@pytest.mark.skipif(
sys.platform.startswith("win"), reason="Test not supported on Windows"
)
@pytest.mark.parametrize("chain_cls", [ContextQAEvalChain, CotQAEvalChain])
def test_context_eval_chain(chain_cls: Type[ContextQAEvalChain]) -> None:
"""Test a simple eval chain."""
example = {
"query": "What's my name",
"context": "The name of this person is John Doe",
}
prediction = {"result": "John Doe"}
fake_qa_eval_chain = chain_cls.from_llm(FakeLLM())
outputs = fake_qa_eval_chain.evaluate([example, example], [prediction, prediction])
assert outputs[0] == outputs[1]
assert "text" in outputs[0]
assert outputs[0]["text"] == "foo"