Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-19 03:01:29 +00:00)
Evaluation Callback Multi Response (#12505)
1. Allow run evaluators to return `{"results": [list of evaluation results]}` in the evaluator callback.
2. Allow run evaluators to pick the target run ID to provide feedback to.

(1) means you could do something like a single function call that populates a full rubric in one go (not sure how reliable that is in general, though) rather than splitting it across separate LLM calls, which is cheaper and less code to write (see the first sketch below). (2) means you can provide feedback to runs from previous calls; the immediate use case is adding an evaluator to a chat bot and attaching feedback to previous conversation turns (see the sketch after the diff). There is a corresponding change in the SDK.
Parent: 9e0ae56287
Commit: 36204c2baf
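To make (1) concrete, here is a minimal sketch of an evaluator that grades a whole rubric in one call and returns every criterion at once using the `{"results": [...]}` shape the callback now accepts. This is illustrative rather than code from the PR: `RubricEvaluator` and `grade_rubric` are hypothetical names, while `RunEvaluator`, `EvaluationResult`, `Run`, and `Example` are the langsmith SDK classes the handler already works with.

```python
# Minimal sketch (not part of this PR): one evaluator call that fills in a
# whole rubric and returns every criterion at once via {"results": [...]}.
# `RubricEvaluator` and `grade_rubric` are hypothetical; the imports are the
# langsmith SDK classes used by EvaluatorCallbackHandler.
from typing import Optional

from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


def grade_rubric(outputs: Optional[dict], reference: Optional[dict]) -> dict:
    # Stand-in for a single function-calling LLM request that scores several
    # criteria in one go, returning {criterion_name: score}.
    return {"correctness": 1.0, "conciseness": 0.5}


class RubricEvaluator(RunEvaluator):
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> dict:
        scores = grade_rubric(run.outputs, example.outputs if example else None)
        # The callback's _select_eval_results unpacks this into a list of
        # EvaluationResult objects and logs feedback for each one.
        return {
            "results": [
                EvaluationResult(key=criterion, score=score)
                for criterion, score in scores.items()
            ]
        }
```

Because `_select_eval_results` also accepts a bare `EvaluationResult`, existing single-result evaluators keep working unchanged.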
The change to `EvaluatorCallbackHandler`, as a unified diff:

```diff
@@ -114,6 +114,7 @@ class EvaluatorCallbackHandler(BaseTracer):
         try:
             if self.project_name is None:
                 eval_result = self.client.evaluate_run(run, evaluator)
+                eval_results = [eval_result]
             with manager.tracing_v2_enabled(
                 project_name=self.project_name, tags=["eval"], client=self.client
             ) as cb:
@@ -126,17 +127,10 @@ class EvaluatorCallbackHandler(BaseTracer):
                     run,
                     example=reference_example,
                 )
-                run_id = cb.latest_run.id if cb.latest_run is not None else None
-                self.client.create_feedback(
-                    run.id,
-                    evaluation_result.key,
-                    score=evaluation_result.score,
-                    value=evaluation_result.value,
-                    comment=evaluation_result.comment,
-                    correction=evaluation_result.correction,
-                    source_info=evaluation_result.evaluator_info,
-                    source_run_id=evaluation_result.source_run_id or run_id,
-                    feedback_source_type=langsmith.schemas.FeedbackSourceType.MODEL,
+                eval_results = self._log_evaluation_feedback(
+                    evaluation_result,
+                    run,
+                    source_run_id=cb.latest_run.id if cb.latest_run else None,
                 )
         except Exception as e:
             logger.error(
@@ -147,9 +141,59 @@ class EvaluatorCallbackHandler(BaseTracer):
             raise e
         example_id = str(run.reference_example_id)
         with self.lock:
-            self.logged_eval_results.setdefault((str(run.id), example_id), []).append(
-                eval_result
+            for res in eval_results:
+                run_id = (
+                    str(getattr(res, "target_run_id"))
+                    if hasattr(res, "target_run_id")
+                    else str(run.id)
+                )
+                self.logged_eval_results.setdefault((run_id, example_id), []).append(
+                    res
+                )
+
+    def _select_eval_results(
+        self,
+        results: Union[EvaluationResult, dict],
+    ) -> List[EvaluationResult]:
+        if isinstance(results, EvaluationResult):
+            results_ = [results]
+        elif isinstance(results, dict) and "results" in results:
+            results_ = cast(List[EvaluationResult], results["results"])
+        else:
+            raise TypeError(
+                f"Invalid evaluation result type {type(results)}."
+                " Expected EvaluationResult or EvaluationResults."
             )
+        return results_
+
+    def _log_evaluation_feedback(
+        self,
+        evaluator_response: Union[EvaluationResult, dict],
+        run: Run,
+        source_run_id: Optional[UUID] = None,
+    ) -> List[EvaluationResult]:
+        results = self._select_eval_results(evaluator_response)
+        for res in results:
+            source_info_: Dict[str, Any] = {}
+            if res.evaluator_info:
+                source_info_ = {**res.evaluator_info, **source_info_}
+            run_id_ = (
+                getattr(res, "target_run_id")
+                if hasattr(res, "target_run_id") and res.target_run_id is not None
+                else run.id
+            )
+            self.client.create_feedback(
+                run_id_,
+                res.key,
+                score=res.score,
+                value=res.value,
+                comment=res.comment,
+                correction=res.correction,
+                source_info=source_info_,
+                source_run_id=res.source_run_id or source_run_id,
+                feedback_source_type=langsmith.schemas.FeedbackSourceType.MODEL,
+            )
+        return results
+
     def _persist_run(self, run: Run) -> None:
         """Run the evaluator on the run.
```
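And a sketch for (2): an evaluator that points its feedback at an earlier conversation turn instead of the run it was invoked on. `IssueAttributionEvaluator` and `find_turn_to_blame` are hypothetical, and the `target_run_id` keyword assumes a langsmith version whose `EvaluationResult` carries that field (the corresponding SDK change mentioned above); the handler reads it defensively via `hasattr`/`getattr` and falls back to the evaluated run's own id when it is missing or `None`.

```python
# Minimal sketch (not part of this PR): feedback attributed to a previous
# conversation turn. `IssueAttributionEvaluator` and `find_turn_to_blame` are
# hypothetical, and `target_run_id` on EvaluationResult assumes an SDK version
# that includes the corresponding field; results without it simply fall back
# to the evaluated run's id.
from typing import Optional
from uuid import UUID

from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


def find_turn_to_blame(run: Run) -> Optional[UUID]:
    # Stand-in for your own logic that inspects the conversation and returns
    # the id of the earlier run (turn) the feedback belongs to, if any.
    return None


class IssueAttributionEvaluator(RunEvaluator):
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        earlier_run_id = find_turn_to_blame(run)
        return EvaluationResult(
            key="introduced_issue",
            score=1 if earlier_run_id else 0,
            # When set, the handler creates feedback on this run instead of
            # `run`; when None, it falls back to `run.id`.
            target_run_id=earlier_run_id,
        )
```

Either way, `source_run_id` still records which trace produced the evaluation, since `_log_evaluation_feedback` passes `res.source_run_id or source_run_id` through to `create_feedback`.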