Add Agent Trajectory Interface (#7122)

2025-08-08 20:41:52 +00:00 · 2023-07-06 13:33:33 -07:00 · 2023-07-06 13:33:33 -07:00 · 1f4a51cb9c
commit 1f4a51cb9c
parent a6b39afe0e
3 changed files with 143 additions and 14 deletions
--- a/langchain/evaluation/__init__.py
+++ b/langchain/evaluation/__init__.py
@@ -21,7 +21,11 @@ from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChai
 from langchain.evaluation.comparison import PairwiseStringEvalChain
 from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
-from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
+from langchain.evaluation.schema import (
    AgentTrajectoryEvaluator,
    PairwiseStringEvaluator,
    StringEvaluator,
 )
 __all__ = [
    "PairwiseStringEvalChain",
@@ -32,4 +36,5 @@ __all__ = [
    "PairwiseStringEvaluator",
    "TrajectoryEvalChain",
    "CriteriaEvalChain",
    "AgentTrajectoryEvaluator",
 ]
--- a/langchain/evaluation/agents/trajectory_eval_chain.py
+++ b/langchain/evaluation/agents/trajectory_eval_chain.py
@@ -22,6 +22,7 @@ from langchain.evaluation.agents.trajectory_eval_prompt import (
    EVAL_CHAT_PROMPT,
    TOOL_FREE_EVAL_CHAT_PROMPT,
 )
 from langchain.evaluation.schema import AgentTrajectoryEvaluator
 from langchain.schema import AgentAction, BaseOutputParser, OutputParserException
 from langchain.tools.base import BaseTool
@@ -70,7 +71,7 @@ class TrajectoryOutputParser(BaseOutputParser):
        return TrajectoryEval(score=int(score_str), reasoning=reasoning)
-class TrajectoryEvalChain(Chain):
+class TrajectoryEvalChain(AgentTrajectoryEvaluator, Chain):
    """A chain for evaluating ReAct style agents.
    This chain is used to evaluate ReAct style agents by reasoning about
@@ -142,7 +143,9 @@ Description: {tool.description}"""
        )
    @staticmethod
-    def get_agent_trajectory(steps: Union[str, List[Tuple[AgentAction, str]]]) -> str:
+    def get_agent_trajectory(
        steps: Union[str, Sequence[Tuple[AgentAction, str]]]
    ) -> str:
        """Get the agent trajectory as a formatted string.
        Args:
@@ -308,12 +311,12 @@ The following is the expected answer. Use this to measure correctness:
        return {"score": parsed_output.score}
-    def evaluate_agent_trajectory(
+    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
-        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
@@ -321,11 +324,12 @@ The following is the expected answer. Use this to measure correctness:
        """Evaluate a trajectory.
        Args:
-            input (str): The input question.
+            prediction (str): The final predicted response.
-            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+            input (str): The input to the agent.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
-            prediction (str): The expected prediction.
            reference (Optional[str]): The reference answer.
+            callbacks (Callbacks): Callbacks to use for this chain run.
        Returns:
            dict: The evaluation result.
@@ -338,12 +342,12 @@ The following is the expected answer. Use this to measure correctness:
        }
        return self(inputs=inputs, callbacks=callbacks, **kwargs)
-    async def aevaluate_agent_trajectory(
+    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
-        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
@@ -351,11 +355,12 @@ The following is the expected answer. Use this to measure correctness:
        """Asynchronously evaluate a trajectory.
        Args:
-            input (str): The input question.
+            prediction (str): The final predicted response.
-            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+            input (str): The input to the agent.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
-            prediction (str): The expected prediction.
            reference (Optional[str]): The reference answer.
+            callbacks (Callbacks): Callbacks to use for this chain run.
        Returns:
            dict: The evaluation result.
--- a/langchain/evaluation/schema.py
+++ b/langchain/evaluation/schema.py
@@ -3,9 +3,11 @@ from __future__ import annotations
 import logging
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Optional, Sequence, Tuple
 from warnings import warn
 from langchain.schema.agent import AgentAction
 logger = logging.getLogger(__name__)
@@ -275,3 +277,120 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
            input=input,
            **kwargs,
        )
 class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
     """Abstract interface for evaluators that score an agent's trajectory.

     Subclasses must implement ``_evaluate_agent_trajectory``. The async hook
     ``_aevaluate_agent_trajectory`` raises ``NotImplementedError`` unless
     overridden. Callers use the public ``evaluate_agent_trajectory`` /
     ``aevaluate_agent_trajectory`` wrappers, which run
     ``_check_evaluation_args`` before delegating to the hooks.
     """

     @property
     def requires_input(self) -> bool:
         """Whether this evaluator requires an input; always ``True`` here."""
         return True

     @abstractmethod
     def _evaluate_agent_trajectory(
         self,
         *,
         prediction: str,
         agent_trajectory: Sequence[Tuple[AgentAction, str]],
         input: str,
         reference: Optional[str] = None,
         **kwargs: Any,
     ) -> dict:
         """Evaluate a trajectory (synchronous hook for subclasses).

         Args:
             prediction (str): The final predicted response.
             agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                 The intermediate steps forming the agent trajectory.
             input (str): The input to the agent.
             reference (Optional[str]): The reference answer.
             **kwargs: Extra keyword arguments forwarded by the public wrapper.

         Returns:
             dict: The evaluation result.
         """

     async def _aevaluate_agent_trajectory(
         self,
         *,
         prediction: str,
         agent_trajectory: Sequence[Tuple[AgentAction, str]],
         input: str,
         reference: Optional[str] = None,
         **kwargs: Any,
     ) -> dict:
         """Asynchronously evaluate a trajectory (async hook for subclasses).

         Args:
             prediction (str): The final predicted response.
             agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                 The intermediate steps forming the agent trajectory.
             input (str): The input to the agent.
             reference (Optional[str]): The reference answer.
             **kwargs: Extra keyword arguments forwarded by the public wrapper.

         Returns:
             dict: The evaluation result.

         Raises:
             NotImplementedError: Always, unless a subclass overrides this
                 method.
         """
         message = (
             f"{self.__class__.__name__} hasn't implemented an async "
             "aevaluate_agent_trajectory method."
         )
         raise NotImplementedError(message)

     def evaluate_agent_trajectory(
         self,
         *,
         prediction: str,
         agent_trajectory: Sequence[Tuple[AgentAction, str]],
         input: str,
         reference: Optional[str] = None,
         **kwargs: Any,
     ) -> dict:
         """Evaluate a trajectory.

         Runs ``_check_evaluation_args`` on ``reference`` and ``input``, then
         delegates to ``_evaluate_agent_trajectory``.

         Args:
             prediction (str): The final predicted response.
             agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                 The intermediate steps forming the agent trajectory.
             input (str): The input to the agent.
             reference (Optional[str]): The reference answer.
             **kwargs: Extra keyword arguments passed through to the hook.

         Returns:
             dict: The evaluation result.
         """
         self._check_evaluation_args(reference=reference, input=input)
         result = self._evaluate_agent_trajectory(
             prediction=prediction,
             agent_trajectory=agent_trajectory,
             input=input,
             reference=reference,
             **kwargs,
         )
         return result

     async def aevaluate_agent_trajectory(
         self,
         *,
         prediction: str,
         agent_trajectory: Sequence[Tuple[AgentAction, str]],
         input: str,
         reference: Optional[str] = None,
         **kwargs: Any,
     ) -> dict:
         """Asynchronously evaluate a trajectory.

         Runs ``_check_evaluation_args`` on ``reference`` and ``input``, then
         awaits ``_aevaluate_agent_trajectory``.

         Args:
             prediction (str): The final predicted response.
             agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                 The intermediate steps forming the agent trajectory.
             input (str): The input to the agent.
             reference (Optional[str]): The reference answer.
             **kwargs: Extra keyword arguments passed through to the hook.

         Returns:
             dict: The evaluation result.
         """
         self._check_evaluation_args(reference=reference, input=input)
         result = await self._aevaluate_agent_trajectory(
             prediction=prediction,
             agent_trajectory=agent_trajectory,
             input=input,
             reference=reference,
             **kwargs,
         )
         return result