Compare commits

...

2 Commits

Author | SHA1 | Message | Date
William Fu-Hinthorn | dcb861ecec | Add Agent Trajectory Run Evaluator | 2023-07-09 07:40:20 -07:00
William Fu-Hinthorn | 745e48b4e9 | Add trajectory eval api to run evaluator path | 2023-07-07 23:15:46 -07:00
10 changed files with 630 additions and 235 deletions

View File

@@ -131,6 +131,11 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
extra = Extra.ignore
@property
def evaluation_name(self) -> str:
"""Return the name for the resulting evaluation result."""
return "agent_trajectory"
@property
def _tools_description(self) -> str:
"""Get the description of the agent tools.

View File

@@ -1,7 +1,6 @@
"""Loading datasets and evaluators."""
from typing import Any, Dict, List, Optional, Sequence, Type, Union
from typing import Any, Dict, List, Optional, Sequence, Type, Union, cast
from langchain.chains.base import Chain
from langchain.chat_models.openai import ChatOpenAI
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
@@ -11,7 +10,13 @@ from langchain.evaluation.embedding_distance.base import (
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
from langchain.evaluation.schema import (
AgentTrajectoryEvaluator,
EvaluatorType,
LLMEvalChain,
PairwiseStringEvaluator,
StringEvaluator,
)
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistanceEvalChain,
@@ -53,7 +58,14 @@ def load_dataset(uri: str) -> List[Dict]:
return [d for d in dataset["train"]]
_EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
EVALUATOR_TYPE = Union[
StringEvaluator, AgentTrajectoryEvaluator, PairwiseStringEvaluator
]
_EVALUATOR_CLS_TYPE = Union[
Type[StringEvaluator], Type[AgentTrajectoryEvaluator], Type[PairwiseStringEvaluator]
]
_EVALUATOR_MAP: Dict[EvaluatorType, _EVALUATOR_CLS_TYPE] = {
EvaluatorType.QA: QAEvalChain,
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
@@ -72,7 +84,7 @@ def load_evaluator(
*,
llm: Optional[BaseLanguageModel] = None,
**kwargs: Any,
) -> Chain:
) -> EVALUATOR_TYPE:
"""Load the requested evaluation chain specified by a string.
Parameters
@@ -102,7 +114,7 @@ def load_evaluator(
)
evaluator_cls = _EVALUATOR_MAP[evaluator]
if issubclass(evaluator_cls, LLMEvalChain):
return evaluator_cls.from_llm(llm=llm, **kwargs)
return cast(EVALUATOR_TYPE, evaluator_cls.from_llm(llm=llm, **kwargs))
else:
return evaluator_cls(**kwargs)
@@ -113,7 +125,7 @@ def load_evaluators(
llm: Optional[BaseLanguageModel] = None,
config: Optional[dict] = None,
**kwargs: Any,
) -> List[Chain]:
) -> List[EVALUATOR_TYPE]:
"""Load evaluators specified by a list of evaluator types.
Parameters

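The hunk above widens `load_evaluator`'s return type from `Chain` to the evaluator interfaces. A minimal usage sketch follows, assuming an OpenAI key is configured and that `EvaluatorType.AGENT_TRAJECTORY` maps to the `TrajectoryEvalChain` imported above (both are assumptions beyond what this hunk shows):

```python
# Sketch only: the evaluator type and LLM choice are illustrative assumptions.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import AgentTrajectoryEvaluator, EvaluatorType

eval_llm = ChatOpenAI(model="gpt-4", temperature=0)

# With this change the return value is typed as the evaluator interfaces
# (StringEvaluator / AgentTrajectoryEvaluator / PairwiseStringEvaluator)
# rather than a bare Chain, so isinstance checks narrow it cleanly.
evaluator = load_evaluator(EvaluatorType.AGENT_TRAJECTORY, llm=eval_llm)
assert isinstance(evaluator, AgentTrajectoryEvaluator)
```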
View File

@@ -0,0 +1,294 @@
"""Run evaluator wrapper for string evaluators."""
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional, Tuple, TypedDict
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from pydantic import Field
from langchain.agents.agent import AgentExecutor
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.evaluation.run_evaluators.utilities.trajectory_utils import (
assemble_agent_trajectory,
)
from langchain.evaluation.schema import AgentTrajectoryEvaluator
from langchain.load.serializable import Serializable
from langchain.schema import RUN_KEY
from langchain.schema.agent import AgentAction
logger = logging.getLogger(__name__)
class AgentInput(TypedDict):
"""Values to pass to the trajectory evaluator."""
input: str
prediction: str
agent_trajectory: List[Tuple[AgentAction, str]]
reference: Optional[str]
class AgentTrajectoryRunMapper(Serializable):
"""Extract Agent Trajectory (and inputs and prediction) to be evaluated."""
input_key: str = "input"
"""The key from the model Run's inputs to use as the eval input."""
prediction_key: str = "output"
"""The key from the model Run's outputs to use as the eval prediction."""
@property
def output_keys(self) -> List[str]:
"""The keys to extract from the run."""
return ["prediction", "input", "agent_trajectory"]
@classmethod
def from_chain(
cls,
model: Chain,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
) -> AgentTrajectoryRunMapper:
"""Create a RunMapper from a chain."""
error_messages = []
if input_key is None:
if len(model.input_keys) > 1:
error_messages.append(
f"Chain '{model.__class__.__name__}' has multiple input"
" keys. Please specify input_key as one of"
f" {model.input_keys} when loading."
)
else:
input_key = model.input_keys[0]
elif input_key not in model.input_keys:
error_messages.append(
f"Chain '{model.__class__.__name__}' does not have specified"
f" input key: '{input_key}'."
f" Must be one of: {model.input_keys}"
)
if prediction_key is None:
if len(model.output_keys) > 1:
error_messages.append(
f"Chain '{model.__class__.__name__}' has multiple"
" output keys. Please specify prediction_key"
f" as one of {model.output_keys} when loading."
)
else:
prediction_key = model.output_keys[0]
elif prediction_key not in model.output_keys:
error_messages.append(
f"Chain '{model.__class__.__name__}' does not have specified"
f" prediction_key: '{prediction_key}'."
f" Must be one of: {model.output_keys}"
)
if error_messages:
raise ValueError("\n".join(error_messages))
if input_key is None or prediction_key is None:
# This should never happen, but mypy doesn't know that.
raise ValueError(
f"Chain {model.__class__.__name__} has no input or output keys."
)
return cls(input_key=input_key, prediction_key=prediction_key)
def map(self, run: Run) -> AgentInput:
"""Maps the Run to a dictionary."""
if not run.outputs:
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
if run.run_type != "chain":
raise ValueError(
f"{self.__class__.__name__} only supports agent (chain) runs."
)
if self.input_key not in run.inputs:
raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
elif self.prediction_key not in run.outputs:
raise ValueError(
f"Run {run.id} does not have prediction key {self.prediction_key}."
)
else:
agent_trajectory = assemble_agent_trajectory(run)
return AgentInput(
input=run.inputs[self.input_key],
prediction=run.outputs[self.prediction_key],
agent_trajectory=agent_trajectory,
reference=None,
)
def __call__(self, run: Run) -> AgentInput:
return self.map(run)
class AgentTrajectoryExampleMapper(Serializable):
"""Map an example, or row in the dataset, to the inputs of an evaluation."""
reference_key: Optional[str] = None
"""The key in the dataset example row to use as the reference answer."""
@property
def output_keys(self) -> List[str]:
"""The keys to extract from the run."""
return ["reference"]
def map(self, example: Example) -> Dict[str, str]:
"""Maps the Example, or dataset row to a dictionary."""
if not example.outputs:
raise ValueError(
f"Example {example.id} has no outputs to use as a reference."
)
if self.reference_key is None:
if len(example.outputs) > 1:
raise ValueError(
f"Example {example.id} has multiple outputs, so you must"
" specify a reference_key."
)
else:
return {"reference": list(example.outputs.values())[0]}
elif self.reference_key not in example.outputs:
raise ValueError(
f"Example {example.id} does not have reference key"
f" {self.reference_key}."
)
return {"reference": example.outputs[self.reference_key]}
def __call__(self, example: Example) -> Dict[str, str]:
"""Maps the Run and Example to a dictionary."""
return self.map(example)
class AgentTrajectoryRunEvaluatorChain(Chain, RunEvaluator):
"""Assumble the agent trajectory from a nested run and evaluate it."""
run_mapper: AgentTrajectoryRunMapper = Field(
default_factory=AgentTrajectoryRunMapper
)
"""Maps the Run to a dictionary with 'prediction', 'input', 'agent_trajectory',
and optionally 'reference' values."""
example_mapper: Optional[AgentTrajectoryExampleMapper] = None
"""Maps the Example (dataset row) to a dictionary
with a 'reference' string."""
name: str = Field(default="agent_trajectory")
"""The name of the evaluation metric."""
agent_trajectory_evaluator: AgentTrajectoryEvaluator
"""The evaluation chain."""
@property
def input_keys(self) -> List[str]:
return ["run", "example"]
@property
def output_keys(self) -> List[str]:
return ["feedback"]
def _prepare_input(self, inputs: Dict[str, Any]) -> AgentInput:
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
evaluation_input = self.run_mapper(run)
if example and self.example_mapper:
reference_info = self.example_mapper(example)
evaluation_input["reference"] = reference_info["reference"]
elif self.agent_trajectory_evaluator.requires_reference:
raise ValueError(
f"Evaluator {self.name} requires an reference"
" example from the dataset,"
f" but none was provided for run {run.id}."
)
return evaluation_input
def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult:
evaluation_result = EvaluationResult(key=self.name, **output)
if RUN_KEY in output:
# TODO: Not currently surfaced. Update
evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
return evaluation_result
def _call(
self,
inputs: Dict[str, str],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Call the evaluation chain."""
evaluation_input = self._prepare_input(inputs)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = self.agent_trajectory_evaluator.evaluate_agent_trajectory(
prediction=evaluation_input["prediction"],
agent_trajectory=evaluation_input["agent_trajectory"],
input=evaluation_input["input"],
reference=evaluation_input.get("reference"),
callbacks=callbacks,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
async def _acall(
self,
inputs: Dict[str, str],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Call the evaluation chain."""
evaluation_input = self._prepare_input(inputs)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = await self.agent_trajectory_evaluator.aevaluate_agent_trajectory(
**evaluation_input,
callbacks=callbacks,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
return self({"run": run, "example": example})["feedback"]
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
result = await self.acall({"run": run, "example": example})
return result["feedback"]
@classmethod
def from_model_and_evaluator(
cls,
model: Chain,
evaluator: AgentTrajectoryEvaluator,
input_key: str = "input",
prediction_key: str = "output",
reference_key: Optional[str] = None,
) -> AgentTrajectoryRunEvaluatorChain:
"""Create a AgentTrajectoryRunEvaluatorChain from a model and evaluator."""
if not isinstance(model, Chain):
raise NotImplementedError(
f"{cls.__name__} currently only supports"
" evaluating chains."
f"Expected AgentExecutor. Got: {type(model)}"
)
if not isinstance(model, AgentExecutor):
logger.warning("")
run_mapper = AgentTrajectoryRunMapper.from_chain(
model, input_key=input_key, prediction_key=prediction_key
)
if reference_key is not None:
example_mapper = AgentTrajectoryExampleMapper(reference_key=reference_key)
elif evaluator.requires_reference:
# We could potentially auto-infer if there is only one string in the
# example, but it's preferred to raise earlier.
raise ValueError(
f"Evaluator {evaluator.evaluation_name} requires a reference"
" example from the dataset. Please specify the reference key from"
" amongst the dataset outputs keys."
)
else:
example_mapper = None
return cls(
name=evaluator.evaluation_name,
run_mapper=run_mapper,
example_mapper=example_mapper,
agent_trajectory_evaluator=evaluator,
)

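A minimal end-to-end sketch of the new chain, assuming an existing `AgentExecutor` named `agent_executor` and the `trajectory_evaluator` loaded in the earlier sketch (both assumptions); `evaluate_run` returns an `EvaluationResult` keyed by the evaluator's `evaluation_name`:

```python
# Sketch only: `agent_executor` and `trajectory_evaluator` are assumed to
# exist (see the earlier sketch); the question is illustrative.
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.run_evaluators.agent_trajectory_run_evaluator import (
    AgentTrajectoryRunEvaluatorChain,
)

run_evaluator = AgentTrajectoryRunEvaluatorChain.from_model_and_evaluator(
    agent_executor, trajectory_evaluator
)

# Trace a run, then grade it; the run mapper pulls the input, prediction,
# and assembled (AgentAction, observation) steps out of the nested run.
collector = RunCollectorCallbackHandler()
agent_executor({"input": "What is 2 ** 10?"}, callbacks=[collector])
feedback = run_evaluator.evaluate_run(collector.traced_runs[0])
print(feedback.key, feedback.score)
```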
View File

@@ -1,7 +1,7 @@
from typing import Any, Dict, Mapping, Optional, Sequence, Union
from langchainplus_sdk.evaluation import EvaluationResult
from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
from langchainplus_sdk.schemas import Example, Run
from pydantic import BaseModel, Field
from langchain.chat_models.base import BaseChatModel
@@ -21,6 +21,9 @@ from langchain.evaluation.run_evaluators.base import (
RunEvaluatorInputMapper,
RunEvaluatorOutputParser,
)
from langchain.evaluation.run_evaluators.utilities.trajectory_utils import (
assemble_agent_trajectory,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
@@ -223,10 +226,8 @@ class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
reference_output_key: Optional[str] = None
"""The key to use for selecting the reference answer."""
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
"""Maps the Run and Optional[Example] to a dictionary"""
if run.child_runs is None:
raise ValueError("Run must have child runs to be evaluated.")
if run.outputs is None:
raise ValueError("Run must have outputs to be evaluated.")
reference = ""
@@ -241,26 +242,10 @@ class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
raise ValueError("Could not infer the reference answer from ")
question = run.inputs[self.agent_input_key]
tool_runs = [
run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
]
agent_steps = []
for i, run_ in enumerate(tool_runs, 1):
tool_output = (
f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
if run_.outputs
else (f"Tool error: {run_.error}" if run_.error else "No output")
)
agent_steps.append(
f"""Step {i}:
Tool used: {run_.name}
Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
Tool output: {tool_output}"""
)
agent_steps = assemble_agent_trajectory(run)
return {
"question": question,
"agent_trajectory": "\n\n".join(agent_steps),
"agent_trajectory": agent_steps,
"answer": run.outputs[self.agent_output_key],
"reference": reference,
}

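Note the shape change above: `agent_trajectory` is now the structured step list returned by `assemble_agent_trajectory` rather than a pre-formatted string. A small illustration of the mapped payload, with hypothetical values:

```python
# Illustration of the new mapper output shape; the values are made up.
from langchain.schema.agent import AgentAction

mapped = {
    "question": "What is 2 ** 10?",
    "agent_trajectory": [
        (AgentAction(tool="Calculator", tool_input="2 ** 10", log=""), "1024"),
    ],
    "answer": "1024",
    "reference": "",
}
```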
View File

@@ -6,10 +6,17 @@ from langchainplus_sdk import RunEvaluator
from langchain.base_language import BaseLanguageModel
from langchain.chains.base import Chain
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.run_evaluators.agent_trajectory_run_evaluator import (
AgentTrajectoryRunEvaluatorChain,
)
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.evaluation.schema import (
AgentTrajectoryEvaluator,
EvaluatorType,
StringEvaluator,
)
from langchain.tools.base import Tool
@@ -22,7 +29,7 @@ def load_run_evaluator_for_model(
reference_key: Optional[str] = None,
eval_llm: Optional[BaseLanguageModel] = None,
**kwargs: Any,
) -> List[RunEvaluator]:
) -> RunEvaluator:
"""Load evaluators specified by a list of evaluator types.
Parameters
@@ -57,6 +64,16 @@ def load_run_evaluator_for_model(
prediction_key=prediction_key,
reference_key=reference_key,
)
elif isinstance(evaluator_, AgentTrajectoryEvaluator):
if not isinstance(model, Chain):
raise ValueError(
"AgentTrajectoryRunEvaluator evaluates agent chains rather than LLMs"
" or other components directly. Expected model of type 'chain'."
f" Got '{type(model)}'."
)
run_evaluator = AgentTrajectoryRunEvaluatorChain.from_model_and_evaluator(
model, evaluator_
)
else:
raise NotImplementedError(f"Run evaluator for {evaluator} is not implemented")
return run_evaluator

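A sketch of loading the trajectory run evaluator through this dispatcher, assuming an `AgentExecutor` named `agent_executor`, its `tools`, and a GPT-4 grader (all assumptions); passing `agent_tools` mirrors the notebook example later in this diff:

```python
# Sketch only: `agent_executor` and `tools` are assumed to exist.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators.loading import load_run_evaluator_for_model
from langchain.evaluation.schema import EvaluatorType

eval_llm = ChatOpenAI(model="gpt-4", temperature=0)
trajectory_run_evaluator = load_run_evaluator_for_model(
    EvaluatorType.AGENT_TRAJECTORY,
    agent_executor,
    eval_llm=eval_llm,
    agent_tools=tools,  # extra kwargs configure the underlying evaluator
)
```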
View File

@@ -0,0 +1,26 @@
from typing import List, Tuple
from langchainplus_sdk.schemas import Run, RunTypeEnum
from langchain.schema.agent import AgentAction
def assemble_agent_trajectory(
run: Run, *, tool_input_key: str = "input", tool_output_key: str = "output"
) -> List[Tuple[AgentAction, str]]:
"""Extract the series of steps from a run."""
if run.child_runs is None:
raise ValueError("Run must have child runs to be evaluated.")
tool_runs = [run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool]
agent_steps = []
for run_ in tool_runs:
tool_output = run_.outputs[tool_output_key] if run_.outputs else run_.error
agent_steps.append(
(
AgentAction(
tool=run_.name, tool_input=run_.inputs[tool_input_key], log=""
),
tool_output,
)
)
return agent_steps

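A quick sketch of the helper applied to a freshly traced run, assuming an `AgentExecutor` named `agent_executor` and using the `RunCollectorCallbackHandler` that the tests in this diff rely on:

```python
# Sketch only: `agent_executor` is assumed to exist.
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.run_evaluators.utilities.trajectory_utils import (
    assemble_agent_trajectory,
)

collector = RunCollectorCallbackHandler()
agent_executor({"input": "What is 2 ** 10?"}, callbacks=[collector])

# Each step pairs a reconstructed AgentAction with the tool's output
# (or its error string when the tool call failed).
for action, observation in assemble_agent_trajectory(collector.traced_runs[0]):
    print(action.tool, action.tool_input, observation)
```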
View File

@@ -340,6 +340,10 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
"""Interface for evaluating agent trajectories."""
@property
def evaluation_name(self) -> str:
"""Return the name of the evaluation."""
raise NotImplementedError()
@property
def requires_input(self) -> bool:
return True

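For a custom implementation of this interface, a minimal sketch is below; it assumes, mirroring the other evaluator interfaces, that subclasses implement an underscore-prefixed `_evaluate_agent_trajectory` hook and override `evaluation_name` (the hook name is an assumption, not confirmed by this hunk):

```python
# Sketch only: `_evaluate_agent_trajectory` as the abstract hook is assumed.
from typing import Any, Optional, Sequence, Tuple

from langchain.evaluation.schema import AgentTrajectoryEvaluator
from langchain.schema.agent import AgentAction


class StepCountEvaluator(AgentTrajectoryEvaluator):
    """Score a trajectory inversely to the number of tool calls it took."""

    @property
    def evaluation_name(self) -> str:
        return "step_count"

    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        # Purely illustrative heuristic: fewer steps -> higher score.
        return {"score": 1.0 / (1 + len(agent_trajectory))}
```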
View File

@@ -9,13 +9,13 @@
"source": [
"# Debug, Evaluate, and Monitor LLMs with LangSmith\n",
"\n",
"LangChain makes it easy to get started with Agents and other LLM applications. Even so, delivering a high-quality agent to production can be deceptively difficult. To aid the development process, we've designed tracing and callbacks at the core of LangChain. In this notebook, you will get started prototyping, testing, and monitoring an LLM agent.\n",
"LangChain makes it easy to get started with Agents and other LLM applications. Even so, delivering a high-quality agent to production can be deceptively difficult. You will have to customize prompts, data, llms, and more to make it higher quality. To aid the development process, we've designed call tracing at the core of LangChain. In this notebook, you will get started prototyping, testing, and monitoring an LLM agent.\n",
"\n",
"When might you want to use tracing? Some situations we've found it useful include:\n",
"When might you want to use run tracing? Some situations we've found it useful include:\n",
"- Comparing prompts\n",
"- Comparing LLMs\n",
"- Quickly debugging a new chain, agent, or set of tools\n",
"- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n",
"- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar.\n",
"- Capturing production traces and using LangChain summarizers to analyze app usage"
"- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar."
]
},
{
@@ -26,7 +26,7 @@
"## Prerequisites\n",
"\n",
"**Either [create a hosted LangSmith account](https://www.langchain.plus/) and connect with an API key OR\n",
"run the server locally.**\n",
"[run the server locally](https://docs.smith.langchain.com/docs/additional-resources/local_installation).**\n",
"\n",
"\n",
"To run the local server, execute the following comand in your terminal:\n",
@@ -35,7 +35,7 @@
"langchain plus start\n",
"```\n",
"\n",
"Now, let's get started by creating a client to connect to LangChain+."
"Now, let's get started by configuring our environment."
]
},
{
@@ -54,12 +54,12 @@
"\n",
"**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n",
"\n",
"**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta."
"**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "904db9a5-f387-4a57-914c-c8af8d39e249",
"metadata": {
"tags": []
@@ -78,10 +78,10 @@
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"LangChainPlusClient (API URL: https://dev.api.smith.langchain.com)"
]
},
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -94,8 +94,8 @@
"unique_id = uuid4().hex[0:8]\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGCHAINPLUS-API-KEY>\" # Uncomment this line to use the hosted version.\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\" # Uncomment this line to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGSMITH-API-KEY>\" # Uncomment this line to use the hosted version.\n",
"\n",
"# Used by the agent below\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
@@ -116,7 +116,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "7c801853-8e96-404d-984c-51ace59cbbef",
"metadata": {
"tags": []
@@ -136,7 +136,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"id": "19537902-b95c-4390-80a4-f6c9a937081e",
"metadata": {
"tags": []
@@ -175,7 +175,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "0405ff30-21fe-413d-85cf-9fa3c649efec",
"metadata": {
"tags": []
@@ -198,7 +198,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "b7bc3934-bb1a-452c-a723-f9cdb0b416f9",
"metadata": {
"tags": []
@@ -210,10 +210,10 @@
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
"LangChainPlusClient (API URL: https://dev.api.smith.langchain.com)"
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -248,32 +248,20 @@
"\n",
"Below, use the client to create a dataset from the Agent runs you just logged while debugging above. You will use these later to measure performance.\n",
"\n",
"For more information on datasets, including how to create them from CSVs or other files or how to create them in the web app, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs)."
"For more information on datasets, including how to create them from CSVs or other files or how to create them in the web app, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/docs)."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d14a9881-2a01-404c-8c56-0b78565c3ff4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"dataset_name = \"calculator-example-dataset\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n",
" client.delete_dataset(dataset_name=dataset_name)\n",
"dataset_name = f\"calculator-example-dataset-{unique_id}\"\n",
"\n",
"dataset = client.create_dataset(\n",
" dataset_name, description=\"A calculator example dataset\"\n",
")\n",
@@ -287,78 +275,6 @@
" client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)"
]
},
{
"cell_type": "markdown",
"id": "92e8944f-e6fc-4bdf-9611-b2db39698cbe",
"metadata": {},
"source": [
"### 2. Select RunEvaluators\n",
"\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n",
"\n",
"Below, we will create some pre-implemented run evaluators that do the following:\n",
"- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
"- Evaluate the overall agent trajectory based on the tool usage and intermediate steps.\n",
"- Evaluating 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
"- Evaluating performance based on 'context' such as retrieved documents or tool results.\n",
"\n",
"For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n",
"custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n",
"\n",
"Below, create the run evaluators.\n",
"\n",
"**Note: the feedback API is currently experimental and subject to change.**"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "56298faa-9ff2-43a2-b35a-ee306e3bf64d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation.run_evaluators import (\n",
" get_qa_evaluator,\n",
" get_criteria_evaluator,\n",
" get_trajectory_evaluator,\n",
")\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"# You can use any model, but stronger llms tend to be more reliable\n",
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"\n",
"# Measures accuracy against ground truth\n",
"qa_evaluator = get_qa_evaluator(eval_llm)\n",
"\n",
"# Measures how effective and efficient the agent's actions are\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"trajectory_evaluator = get_trajectory_evaluator(eval_llm, agent_tools=tools)\n",
"\n",
"# Measure helpfulness. We have some pre-defined criteria you can select\n",
"helpfulness_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" \"helpfulness\",\n",
")\n",
"\n",
"# Custom criteria are specified as a dictionary\n",
"custom_criteria_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" {\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" },\n",
")\n",
"\n",
"evaluators = [\n",
" qa_evaluator,\n",
" trajectory_evaluator,\n",
" helpfulness_evaluator,\n",
" custom_criteria_evaluator,\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "8adfd29c-b258-49e5-94b4-74597a12ba16",
@@ -366,9 +282,9 @@
"tags": []
},
"source": [
"### 3. Define the Agent or LLM to Test\n",
"### 2. Define the Agent or LLM to Test\n",
"\n",
"You can evaluate any LLM or chain. Since chains can have memory, we need to pass an\n",
"You can evaluate any chain, agent, or LLM. Since chains can have memory, we need to pass an\n",
"initializer function that returns a new chain for each row.\n",
"\n",
"In this case, you will test an agent that uses OpenAI's function calling endpoints, but it can be any simple chain."
@@ -376,7 +292,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 7,
"id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75",
"metadata": {
"tags": []
@@ -384,8 +300,7 @@
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import initialize_agent, load_tools\n",
"from langchain.agents import AgentType\n",
"from langchain.agents import initialize_agent, load_tools, AgentType\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
@@ -403,6 +318,81 @@
"# chain_factory = lambda: agent"
]
},
{
"cell_type": "markdown",
"id": "92e8944f-e6fc-4bdf-9611-b2db39698cbe",
"metadata": {},
"source": [
"### 3. Select RunEvaluators\n",
"\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n",
"\n",
"Below, we will create some pre-implemented run evaluators that do the following:\n",
"- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
"- Evaluate the overall agent trajectory based on the tool usage and intermediate steps.\n",
"- Use criteria to evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
"\n",
"For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n",
"custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n",
"\n",
"Below, create the run evaluators.\n",
"\n",
"**Note: the feedback API is currently experimental and subject to change.**"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "56298faa-9ff2-43a2-b35a-ee306e3bf64d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import EvaluatorType\n",
"from langchain.evaluation.run_evaluators import (\n",
" load_run_evaluators_for_model,\n",
" load_run_evaluator_for_model,\n",
")\n",
"\n",
"model_to_evaluate = agent_factory()\n",
"\n",
"# The QA 'correctness' evaluator requires a reference label from the dataset examples.\n",
"qa_evaluator = load_run_evaluator_for_model(EvaluatorType.QA, model_to_evaluate, reference_key=\"output\")\n",
"\n",
"# Measure helpfulness. We have some pre-defined criteria you can select\n",
"helpfulness_evaluator = load_run_evaluator_for_model(EvaluatorType.CRITERIA, model_to_evaluate, criteria=\"helpfulness\")\n",
"\n",
"# Custom criteria are specified as a dictionary\n",
"custom_criteria_evaluator = load_run_evaluator_for_model(\n",
" EvaluatorType.CRITERIA,\n",
" model_to_evaluate,\n",
" criteria={\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" })\n",
"\n",
"# You can also use embedding distance or other non-LLM evaluators\n",
"embedding_evaluator = load_run_evaluator_for_model(\n",
" EvaluatorType.EMBEDDING_DISTANCE,\n",
" model_to_evaluate,\n",
" reference_key=\"output\",\n",
")\n",
"\n",
"# Measures how effective and efficient the agent's actions are\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"trajectory_evaluator = load_run_evaluator_for_model(EvaluatorType.AGENT_TRAJECTORY, model_to_evaluate, agent_tools=tools)\n",
"\n",
"\n",
"evaluators = [\n",
" qa_evaluator,\n",
" trajectory_evaluator,\n",
" embedding_evaluator,\n",
" helpfulness_evaluator,\n",
" custom_criteria_evaluator,\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "07885b10",
@@ -419,81 +409,78 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 9,
"id": "3733269b-8085-4644-9d5d-baedcff13a2f",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0marun_on_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdataset_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mllm_or_chain_factory\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MODEL_OR_CHAIN_FACTORY'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mconcurrency_level\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mnum_repetitions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mproject_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mrun_evaluators\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[Sequence[RunEvaluator]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Asynchronously run the Chain or language model on a dataset\n",
"and store traces to the specified project name.\n",
"\n",
"Args:\n",
" dataset_name: Name of the dataset to run the chain on.\n",
" llm_or_chain_factory: Language model or Chain constructor to run\n",
" over the dataset. The Chain constructor is used to permit\n",
" independent calls on each example without carrying over state.\n",
" concurrency_level: The number of async tasks to run concurrently.\n",
" num_repetitions: Number of times to run the model on each example.\n",
" This is useful when testing success rates or generating confidence\n",
" intervals.\n",
" project_name: Name of the project to store the traces in.\n",
" Defaults to {dataset_name}-{chain class name}-{datetime}.\n",
" verbose: Whether to print progress.\n",
" client: Client to use to read the dataset. If not provided, a new\n",
" client will be created using the credentials in the environment.\n",
" tags: Tags to add to each run in the project.\n",
" run_evaluators: Evaluators to run on the results of the chain.\n",
"\n",
"Returns:\n",
" A dictionary containing the run's project name and the resulting model outputs.\n",
"\u001b[0;31mFile:\u001b[0m ~/code/lc/lckg/langchain/client/runner_utils.py\n",
"\u001b[0;31mType:\u001b[0m function"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from langchain.client import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"\n",
"?arun_on_dataset"
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example bbcff65a-f9d6-4b4a-9e7b-f3e626387411. Error: LLMMathChain._evaluate(\"\n",
"age_of_Dua_Lipa_boyfriend ** 0.43\n",
"\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 6\r"
"Processed examples: 2\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/wfh/code/lc/lckg/langchain/callbacks/manager.py:167: UserWarning: The tracing v2 API is in development. This is not yet stable and may change in the future.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 5\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example 5ca0058d-c7ec-4059-9894-0e200474c9dc. Error: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 68}]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 9\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/wfh/code/lc/lckg/langchain/evaluation/schema.py:88: UserWarning: Ignoring input in EmbeddingDistanceEvalChain, as it is not expected.\n",
" warn(self._skip_input_warning)\n"
]
}
],
@@ -531,26 +518,12 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"id": "136db492-d6ca-4215-96f9-439c23538241",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# You can navigate to the UI by clicking on the link below\n",
"client"
@@ -565,7 +538,7 @@
"source": [
"For a real production application, you will want to add many more test cases and\n",
"incorporate larger datasets to run benchmark evaluations to measure aggregate performance\n",
"across. For more information on recommended ways to do this, see [LangSmith Documentation](https://docs.langchain.plus/docs/)"
"across. For more information on recommended ways to do this, see [LangSmith Documentation](https://docs.smith.langchain.com/docs/)"
]
},
{
@@ -584,7 +557,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "3718710f-f719-4861-a351-0bb9d639d9fd",
"metadata": {
"tags": []
@@ -592,7 +565,7 @@
"outputs": [],
"source": [
"deployment_name = f\"Search + Calculator Deployment - {unique_id}\"\n",
"project = client.create_project(deployment_name, mode=\"monitor\")"
"project = client.create_project(deployment_name)"
]
},
{
@@ -607,7 +580,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"id": "56dba20a-c07c-4b18-a4e7-834ab6dc87ef",
"metadata": {
"tags": []
@@ -619,22 +592,12 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"id": "569389d4-b613-47ce-99d3-e0031f308185",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LLMMathChain._evaluate(\"\n",
"US_GDP / average_lifespan\n",
"\") raised error: 'US_GDP'. Please try again with a valid numerical expression\n"
]
}
],
"outputs": [],
"source": [
"os.environ[\"LANGCHAIN_PROJECT\"] = deployment_name\n",
"\n",
@@ -659,11 +622,11 @@
"source": [
"## Conclusion\n",
"\n",
"Congratulations! You have succesfully created connected an agent to LangSmith to trace and debug, evaluated it for accuracy, helpfulness, and trajectory efficiency over a dataset, and instrumented a monitoring project for a simulated \"production\" application!\n",
"Congratulations! You have succesfully connected an agent to LangSmith for debugging, graded it using multiple evaluators across a dataset, and created a new project for monitoring your agent in a simulated \"production\" application!\n",
"\n",
"This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better products.\n",
"\n",
"For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.langchain.plus/docs/),\n",
"For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.smith.langchain.com/docs/),\n",
"\n",
"and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)."
]

View File

@@ -4,12 +4,18 @@ from unittest.mock import MagicMock
import pytest
from langchain.agents.initialize import initialize_agent
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.loading import load_evaluator, load_evaluators
from langchain.evaluation.run_evaluators.agent_trajectory_run_evaluator import (
AgentTrajectoryRunEvaluatorChain,
)
from langchain.evaluation.run_evaluators.loading import load_run_evaluator_for_model
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import StringEvaluator
from langchain.evaluation.schema import AgentTrajectoryEvaluator, StringEvaluator
from langchain.tools.base import tool
from tests.unit_tests.chains.test_base import FakeChain
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@@ -81,10 +87,9 @@ def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
the_input_keys=["an_input", "another_input"],
)
fake_llm = FakeChatModel()
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
evaluator = load_evaluator(evaluator_type, llm=fake_llm) # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
# No input key
with pytest.raises(ValueError, match="multiple input keys"):
StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
with pytest.raises(ValueError, match="does not have specified"):
@@ -112,3 +117,87 @@ def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"
@pytest.mark.parametrize("requires_reference", [False, True])
@pytest.mark.parametrize("evaluator_type", ["trajectory"])
def test_load_agent_trajectory_evaluator_with_chain(
evaluator_type: str, requires_reference: bool
) -> None:
model = FakeChain(
the_input_keys=["some_input", "another_input"],
)
fake_llm = FakeChatModel()
evaluator = load_evaluators(
[evaluator_type], # type: ignore
llm=fake_llm,
requires_reference=requires_reference,
)[0]
if not isinstance(evaluator, AgentTrajectoryEvaluator):
raise ValueError("Evaluator is not an agent trajectory evaluator")
with pytest.raises(ValueError, match="does not have specified prediction_key"):
AgentTrajectoryRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
with pytest.raises(ValueError, match="does not have specified input key: 'input'"):
AgentTrajectoryRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, prediction_key="bar"
)
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "label_column"
run_evaluator = AgentTrajectoryRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, input_key="some_input", prediction_key="bar", **kwargs
)
callback = RunCollectorCallbackHandler()
model(
{"some_input": "Foo input", "another_input": "Another fake response"},
callbacks=[callback],
)
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"label_column": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Foo input"
assert result["prediction"] == "baz"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"
@pytest.mark.parametrize("requires_reference", [False, True])
def test_load_agent_trajectory_evaluator_with_agent_executor(
requires_reference: bool,
) -> None:
fake_eval_llm = FakeChatModel()
@tool
def fake_tool(txt: str) -> str:
"""Wants to be real."""
return txt
fake_llm = FakeLLM(queries={"foo": "Final Answer: pi"}, sequential_responses=True)
agent_executor = initialize_agent(
tools=[fake_tool], llm=fake_llm # type: ignore[list-item]
)
run_evaluator = load_run_evaluator_for_model(
"trajectory", # type: ignore[arg-type]
agent_executor,
eval_llm=fake_eval_llm,
requires_reference=requires_reference,
)
assert isinstance(run_evaluator, AgentTrajectoryRunEvaluatorChain)
callback = RunCollectorCallbackHandler()
agent_executor(
{"input": "This is the input"},
callbacks=[callback],
)
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"label_column": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "This is the input"
assert result["prediction"] == "pi"
if run_evaluator.agent_trajectory_evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"