Include run_id (#14331)

in the test run outputs
2025-08-14 23:26:34 +00:00 · 2023-12-06 14:07:45 -08:00 · 2023-12-06 14:07:45 -08:00 · e5bd32ff6d
commit e5bd32ff6d
parent cc76f0e834
2 changed files with 6 additions and 0 deletions
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -135,6 +135,7 @@ class TestResult(dict):
                    **{f"feedback.{f.key}": f.score for f in feedback},
                    "error": result.get("Error"),
                    "execution_time": result["execution_time"],
+                    "run_id": result.get("run_id"),
                }
            )
            records.append(r)
@@ -1018,6 +1019,7 @@ def _collect_test_results(
    wait_for_all_evaluators()
    all_eval_results = {}
    all_execution_time = {}
+    all_run_ids = {}
    for c in configs:
        for callback in cast(list, c["callbacks"]):
            if isinstance(callback, EvaluatorCallbackHandler):
@@ -1028,12 +1030,14 @@ def _collect_test_results(
            elif isinstance(callback, LangChainTracer):
                run = callback.latest_run
                example_id = callback.example_id
+                run_id = str(run.id) if run else None
                execution_time = (
                    (run.end_time - run.start_time).total_seconds()
                    if run and run.end_time
                    else None
                )
                all_execution_time[str(example_id)] = execution_time
+                all_run_ids[str(example_id)] = run_id

    results: dict = {}
    for example, output in zip(examples, batch_results):
@@ -1042,6 +1046,7 @@ def _collect_test_results(
            "input": example.inputs,
            "feedback": feedback,
            "execution_time": all_execution_time.get(str(example.id)),
+            "run_id": all_run_ids.get(str(example.id)),
        }
        if isinstance(output, EvalError):
            results[str(example.id)]["Error"] = output.Error
--- a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
+++ b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
@@ -345,6 +345,7 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
                "feedback": [],
                # No run since we mock the call to the llm above
                "execution_time": None,
+                "run_id": None,
            }
            for example in examples
        }