Return feedback (#9629)

Return the feedback values in an eval run result

Also added a helper method to display the results as a dataframe, though it may be overkill.
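
Rough usage sketch of the new result shape (a minimal example under assumptions: the client, dataset name, chain factory, and evaluator config below are illustrative, not part of this change):

from langchain.smith import RunEvalConfig, run_on_dataset

results = run_on_dataset(
    client,                              # an existing langsmith Client
    dataset_name="my-dataset",           # illustrative dataset name
    llm_or_chain_factory=chain_factory,  # illustrative chain/LLM factory
    evaluation=RunEvalConfig(evaluators=["qa"]),
)
# results is now a TestResult (a dict): {"project_name": ..., "results": {example_id: {"output": ..., "feedback": [...]}}}
df = results.to_dataframe()  # optional; requires pandas. One row per example, one column per feedback key plus "output"
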
William FH 2023-08-28 09:15:05 -07:00 committed by GitHub
parent 5e2d0cf54e
commit cb642ef658
3 changed files with 86 additions and 45 deletions

View File

@@ -3,10 +3,11 @@ from __future__ import annotations
import logging
from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import Any, List, Optional, Sequence, Set, Union
from typing import Any, Dict, List, Optional, Sequence, Set, Union
from uuid import UUID
from langsmith import Client, RunEvaluator
import langsmith
from langsmith import schemas as langsmith_schemas
from langchain.callbacks.manager import tracing_v2_enabled
from langchain.callbacks.tracers.base import BaseTracer
@@ -62,13 +63,13 @@ class EvaluatorCallbackHandler(BaseTracer):
The LangSmith project name to organize eval chain runs under.
"""
name: str = "evaluator_callback_handler"
name = "evaluator_callback_handler"
def __init__(
self,
evaluators: Sequence[RunEvaluator],
evaluators: Sequence[langsmith.RunEvaluator],
max_workers: Optional[int] = None,
client: Optional[Client] = None,
client: Optional[langsmith.Client] = None,
example_id: Optional[Union[UUID, str]] = None,
skip_unfinished: bool = True,
project_name: Optional[str] = "evaluators",
@@ -86,10 +87,11 @@ class EvaluatorCallbackHandler(BaseTracer):
self.futures: Set[Future] = set()
self.skip_unfinished = skip_unfinished
self.project_name = project_name
self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {}
global _TRACERS
_TRACERS.append(self)
def _evaluate_in_project(self, run: Run, evaluator: RunEvaluator) -> None:
def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None:
"""Evaluate the run in the project.
Parameters
@@ -102,11 +104,11 @@ class EvaluatorCallbackHandler(BaseTracer):
"""
try:
if self.project_name is None:
self.client.evaluate_run(run, evaluator)
feedback = self.client.evaluate_run(run, evaluator)
with tracing_v2_enabled(
project_name=self.project_name, tags=["eval"], client=self.client
):
self.client.evaluate_run(run, evaluator)
feedback = self.client.evaluate_run(run, evaluator)
except Exception as e:
logger.error(
f"Error evaluating run {run.id} with "
@@ -114,6 +116,8 @@ class EvaluatorCallbackHandler(BaseTracer):
exc_info=True,
)
raise e
example_id = str(run.reference_example_id)
self.logged_feedback.setdefault(example_id, []).append(feedback)
def _persist_run(self, run: Run) -> None:
"""Run the evaluator on the run.

View File

@@ -11,6 +11,7 @@ import uuid
import warnings
from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Callable,
Coroutine,
@@ -44,6 +45,9 @@ from langchain.schema.runnable import Runnable, RunnableConfig, RunnableLambda
from langchain.smith.evaluation.config import EvalConfig, RunEvalConfig
from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain
if TYPE_CHECKING:
import pandas as pd
logger = logging.getLogger(__name__)
MODEL_OR_CHAIN_FACTORY = Union[
@@ -63,6 +67,31 @@ class InputFormatError(Exception):
## Shared Utilities
class TestResult(dict):
"""A dictionary of the results of a single test run."""
def to_dataframe(self) -> pd.DataFrame:
"""Convert the results to a dataframe."""
try:
import pandas as pd
except ImportError as e:
raise ImportError(
"Pandas is required to convert the results to a dataframe."
" to install pandas, run `pip install pandas`."
) from e
indices = []
records = []
for example_id, result in self["results"].items():
feedback = result["feedback"]
records.append(
{**{f.key: f.score for f in feedback}, "output": result["output"]}
)
indices.append(example_id)
return pd.DataFrame(records, index=indices)
def _get_eval_project_url(api_url: str, project_id: str) -> str:
"""Get the project url from the api url."""
parsed = urlparse(api_url)
@@ -667,7 +696,7 @@ async def _arun_llm_or_chain(
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
) -> Union[dict, str, LLMResult, ChatResult]:
"""Asynchronously run the Chain or language model.
Args:
@@ -689,10 +718,10 @@ async def _arun_llm_or_chain(
tracer.example_id = example.id
else:
previous_example_ids = None
outputs = []
chain_or_llm = (
"LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
)
result = None
try:
if isinstance(llm_or_chain_factory, BaseLanguageModel):
output: Any = await _arun_llm(
@@ -711,15 +740,15 @@ async def _arun_llm_or_chain(
callbacks=callbacks,
input_mapper=input_mapper,
)
outputs.append(output)
result = output
except Exception as e:
logger.warning(f"{chain_or_llm} failed for example {example.id}. Error: {e}")
outputs.append({"Error": str(e)})
result = {"Error": str(e)}
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
return result
async def _gather_with_concurrency(
@@ -856,7 +885,7 @@ async def _arun_on_examples(
wrapped_model, examples, evaluation, data_type
)
examples = _validate_example_inputs(examples, wrapped_model, input_mapper)
results: Dict[str, List[Any]] = {}
results: Dict[str, dict] = {}
async def process_example(
example: Example, callbacks: List[BaseCallbackHandler], job_state: dict
@@ -869,7 +898,7 @@ async def _arun_on_examples(
callbacks=callbacks,
input_mapper=input_mapper,
)
results[str(example.id)] = result
results[str(example.id)] = {"output": result}
job_state["num_processed"] += 1
if verbose:
print(
@@ -890,8 +919,14 @@ async def _arun_on_examples(
),
*(functools.partial(process_example, e) for e in examples),
)
all_feedback = {}
for handler in evaluation_handlers:
handler.wait_for_futures()
all_feedback.update(handler.logged_feedback)
# join the results and feedback on the example id
for example_id, output_dict in results.items():
feedback = all_feedback.get(example_id, [])
output_dict["feedback"] = feedback
return results
@@ -978,7 +1013,7 @@ def _run_llm_or_chain(
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
) -> Union[dict, str, LLMResult, ChatResult]:
"""
Run the Chain or language model synchronously.
@@ -1001,10 +1036,10 @@ def _run_llm_or_chain(
tracer.example_id = example.id
else:
previous_example_ids = None
outputs = []
chain_or_llm = (
"LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
)
result = None
try:
if isinstance(llm_or_chain_factory, BaseLanguageModel):
output: Any = _run_llm(
@@ -1023,18 +1058,18 @@ def _run_llm_or_chain(
tags=tags,
input_mapper=input_mapper,
)
outputs.append(output)
result = output
except Exception as e:
logger.warning(
f"{chain_or_llm} failed for example {example.id} with inputs:"
f" {example.inputs}.\nError: {e}",
)
outputs.append({"Error": str(e)})
result = {"Error": str(e)}
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
return result
def _run_on_examples(
@@ -1075,7 +1110,7 @@ def _run_on_examples(
Returns:
A dictionary mapping example ids to the model outputs.
"""
results: Dict[str, Any] = {}
results: Dict[str, dict] = {}
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
project_name = _get_project_name(project_name, wrapped_model)
tracer = LangChainTracer(
@@ -1085,11 +1120,11 @@ def _run_on_examples(
wrapped_model, examples, evaluation, data_type
)
examples = _validate_example_inputs(examples, wrapped_model, input_mapper)
evalution_handler = EvaluatorCallbackHandler(
evaluation_handler = EvaluatorCallbackHandler(
evaluators=run_evaluators or [],
client=client,
)
callbacks: List[BaseCallbackHandler] = [tracer, evalution_handler]
callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
for i, example in enumerate(examples):
result = _run_llm_or_chain(
example,
@@ -1100,9 +1135,14 @@ def _run_on_examples(
)
if verbose:
print(f"{i+1} processed", flush=True, end="\r")
results[str(example.id)] = result
results[str(example.id)] = {"output": result}
tracer.wait_for_futures()
evalution_handler.wait_for_futures()
evaluation_handler.wait_for_futures()
all_feedback = evaluation_handler.logged_feedback
# join the results and feedback on the example id
for example_id, output_dict in results.items():
feedback = all_feedback.get(example_id, [])
output_dict["feedback"] = feedback
return results
@@ -1276,10 +1316,10 @@ async def arun_on_dataset(
input_mapper=input_mapper,
data_type=dataset.data_type,
)
return {
"project_name": project_name,
"results": results,
}
return TestResult(
project_name=project_name,
results=results,
)
def _handle_coroutine(coro: Coroutine) -> Any:
@@ -1461,7 +1501,7 @@ def run_on_dataset(
data_type=dataset.data_type,
)
results = _handle_coroutine(coro)
return {
"project_name": project_name,
"results": results,
}
return TestResult(
project_name=project_name,
results=results,
)

View File

@@ -182,14 +182,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
return {"the right input": inputs["the wrong input"]}
result = _run_llm_or_chain(example, lambda: mock_chain, input_mapper=input_mapper)
assert len(result) == 1
assert result[0] == {"output": "2", "the right input": "1"}
assert result == {"output": "2", "the right input": "1"}
bad_result = _run_llm_or_chain(
example,
lambda: mock_chain,
)
assert len(bad_result) == 1
assert "Error" in bad_result[0]
assert "Error" in bad_result
# Try with LLM
def llm_input_mapper(inputs: dict) -> str:
@@ -197,9 +195,7 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
return "the right input"
mock_llm = FakeLLM(queries={"the right input": "somenumber"})
result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
assert len(result) == 1
llm_result = result[0]
llm_result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
assert isinstance(llm_result, str)
assert llm_result == "somenumber"
@@ -300,8 +296,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
tags: Optional[List[str]] = None,
callbacks: Optional[Any] = None,
**kwargs: Any,
) -> List[Dict[str, Any]]:
return [{"result": f"Result for example {example.id}"}]
) -> Dict[str, Any]:
return {"result": f"Result for example {example.id}"}
def mock_create_project(*args: Any, **kwargs: Any) -> Any:
proj = mock.MagicMock()
@@ -328,9 +324,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
)
expected = {
uuid_: [
{"result": f"Result for example {uuid.UUID(uuid_)}"} for _ in range(1)
]
uuid_: {
"output": {"result": f"Result for example {uuid.UUID(uuid_)}"},
"feedback": [],
}
for uuid_ in uuids
}
assert results["results"] == expected