mirror of https://github.com/hwchase17/langchain.git
synced 2026-02-06 17:20:16 +00:00

Compare commits: ankush/mes ... vwp/embedd (1 commit, f3f89e0535)
@@ -1,41 +1,29 @@
"""Evaluation chains for grading LLM and Chain outputs.
"""Functionality relating to evaluation.

This module contains off-the-shelf evaluation chains for grading the output of
LangChain primitives such as language models and chains.

To load an evaluator, you can use the :func:`load_evaluators <langchain.evaluation.loading.load_evaluators>` function with the
name of the evaluator to load.

To load one of the LangChain HuggingFace datasets, you can use the :func:`load_dataset <langchain.evaluation.loading.load_dataset>` function with the
name of the dataset to load.
This module contains off-the-shelf evaluation chains for
grading the output of LangChain primitives such as LLMs and Chains.

Some common use cases for evaluation include:

- Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain <langchain.evaluation.qa.eval_chain.QAEvalChain>`
- Comparing the output of two models: :class:`PairwiseStringEvalChain <langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain>`
- Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain <langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain>`
- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain <langchain.evaluation.criteria.eval_chain.CriteriaEvalChain>`
- Grading accuracy of a response against ground truth answers: QAEvalChain
- Comparing the output of two models: PairwiseStringEvalChain
- Judging the efficacy of an agent's tool usage: TrajectoryEvalChain
- Checking whether an output complies with a set of criteria: CriteriaEvalChain

This module also contains low-level APIs for creating custom evaluators for
specific evaluation tasks. These include:
This module also contains low level APIs for making more evaluators for your
custom evaluation task. These include:
- StringEvaluator: Evaluates an output string against a reference and/or
with input context.
- PairwiseStringEvaluator: Evaluates two strings against each other.
"""

- :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`: Evaluates an output string against a reference and/or input context.
- :class:`PairwiseStringEvaluator <langchain.evaluation.schema.PairwiseStringEvaluator>`: Evaluates two strings against each other.

""" # noqa: E501
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.loading import load_dataset, load_evaluators
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import (
    EvaluatorType,
    PairwiseStringEvaluator,
    StringEvaluator,
)
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator

__all__ = [
    "EvaluatorType",
    "PairwiseStringEvalChain",
    "QAEvalChain",
    "CotQAEvalChain",
@@ -44,6 +32,4 @@ __all__ = [
    "PairwiseStringEvaluator",
    "TrajectoryEvalChain",
    "CriteriaEvalChain",
    "load_evaluators",
    "load_dataset",
]
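For orientation, here is a minimal sketch of how the evaluators described in this docstring are meant to be loaded, written against the side of the diff that still exports load_evaluators and EvaluatorType. It is illustrative only and assumes an OpenAI API key is configured, since the loader defaults to a GPT-4 ChatOpenAI model:

    # Illustrative sketch; mirrors the load_evaluators(...) signature shown in
    # langchain/evaluation/loading.py further down in this diff.
    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation import EvaluatorType, load_evaluators

    eval_llm = ChatOpenAI(model="gpt-4", temperature=0)
    qa_eval, criteria_eval = load_evaluators(
        [EvaluatorType.QA, EvaluatorType.CRITERIA],
        llm=eval_llm,
    )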
@@ -7,21 +7,20 @@ chain (LLMChain) to generate the reasoning and scores.
|
||||
|
||||
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
|
||||
|
||||
from pydantic import Extra, Field
|
||||
from pydantic import Field
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.callbacks.manager import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
Callbacks,
|
||||
)
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.chat_models.base import BaseChatModel
|
||||
from langchain.evaluation.agents.trajectory_eval_prompt import (
|
||||
EVAL_CHAT_PROMPT,
|
||||
TOOL_FREE_EVAL_CHAT_PROMPT,
|
||||
)
|
||||
from langchain.evaluation.schema import EvalChain
|
||||
from langchain.schema import AgentAction, BaseOutputParser, OutputParserException
|
||||
from langchain.tools.base import BaseTool
|
||||
|
||||
@@ -70,7 +69,7 @@ class TrajectoryOutputParser(BaseOutputParser):
|
||||
return TrajectoryEval(score=int(score_str), reasoning=reasoning)
|
||||
|
||||
|
||||
class TrajectoryEvalChain(EvalChain):
|
||||
class TrajectoryEvalChain(Chain):
|
||||
"""A chain for evaluating ReAct style agents.
|
||||
|
||||
This chain is used to evaluate ReAct style agents by reasoning about
|
||||
@@ -124,11 +123,6 @@ class TrajectoryEvalChain(EvalChain):
|
||||
return_reasoning: bool = False
|
||||
"""Whether to return the reasoning along with the score."""
|
||||
|
||||
class Config:
|
||||
"""Configuration for the QAEvalChain."""
|
||||
|
||||
extra = Extra.ignore
|
||||
|
||||
@property
|
||||
def _tools_description(self) -> str:
|
||||
"""Get the description of the agent tools.
|
||||
@@ -192,11 +186,10 @@ The following is the expected answer. Use this to measure correctness:
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
llm: BaseLanguageModel,
|
||||
llm: BaseChatModel,
|
||||
agent_tools: Optional[Sequence[BaseTool]] = None,
|
||||
output_parser: Optional[TrajectoryOutputParser] = None,
|
||||
return_reasoning: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> "TrajectoryEvalChain":
|
||||
"""Create a TrajectoryEvalChain object from a language model chain.
|
||||
|
||||
@@ -212,10 +205,6 @@ The following is the expected answer. Use this to measure correctness:
|
||||
Returns:
|
||||
TrajectoryEvalChain: The TrajectoryEvalChain object.
|
||||
"""
|
||||
if not isinstance(llm, BaseChatModel):
|
||||
raise NotImplementedError(
|
||||
"Only chat models supported by the current trajectory eval"
|
||||
)
|
||||
if agent_tools:
|
||||
prompt = EVAL_CHAT_PROMPT
|
||||
else:
|
||||
@@ -226,7 +215,6 @@ The following is the expected answer. Use this to measure correctness:
|
||||
return_reasoning=return_reasoning,
|
||||
eval_chain=eval_chain,
|
||||
output_parser=output_parser or TrajectoryOutputParser(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
|
||||
langchain/evaluation/comparison/embedding.py (new file, 188 lines)
@@ -0,0 +1,188 @@
|
||||
"""A chain for comparing the output of two models using embeddings."""
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
from pydantic import Field, root_validator
|
||||
|
||||
from langchain.callbacks.manager import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
Callbacks,
|
||||
)
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain.evaluation.schema import PairwiseStringEvaluator
|
||||
from langchain.math_utils import cosine_similarity
|
||||
|
||||
|
||||
class EmbeddingDistance(str, Enum):
|
||||
COSINE = "cosine"
|
||||
EUCLIDEAN = "euclidean"
|
||||
MANHATTAN = "manhattan"
|
||||
CHEBYSHEV = "chebyshev"
|
||||
HAMMING = "hamming"
|
||||
|
||||
|
||||
class PairwiseEmbeddingStringEvalChain(Chain, PairwiseStringEvaluator):
|
||||
"""A chain for comparing the output of two models using embeddings."""
|
||||
|
||||
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
|
||||
"""The embedding objects to vectorize the outputs."""
|
||||
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
|
||||
"""The distance metric to use for comparing the embeddings."""
|
||||
|
||||
@root_validator
|
||||
def _validate_distance_metric(cls, values: dict) -> dict:
|
||||
"""Validate the distance metric.
|
||||
|
||||
Args:
|
||||
values (dict): The values to validate.
|
||||
|
||||
Returns:
|
||||
dict: The validated values.
|
||||
"""
|
||||
values["distance_metric"] = values["distance_metric"].lower()
|
||||
return values
|
||||
|
||||
def _get_metric(self, metric: EmbeddingDistance) -> Any:
|
||||
"""Get the metric function for the given metric name.
|
||||
|
||||
Args:
|
||||
metric (str): The metric name.
|
||||
|
||||
Returns:
|
||||
Any: The metric function.
|
||||
"""
|
||||
metrics = {
|
||||
EmbeddingDistance.COSINE: self._cosine_distance,
|
||||
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
|
||||
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
|
||||
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
|
||||
EmbeddingDistance.HAMMING: self._hamming_distance,
|
||||
}
|
||||
if metric in metrics:
|
||||
return metrics[metric]
|
||||
else:
|
||||
raise ValueError(f"Invalid metric: {metric}")
|
||||
|
||||
@staticmethod
|
||||
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
|
||||
return 1.0 - cosine_similarity(a, b)
|
||||
|
||||
@staticmethod
|
||||
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
|
||||
return np.linalg.norm(a - b)
|
||||
|
||||
@staticmethod
|
||||
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
|
||||
return np.sum(np.abs(a - b))
|
||||
|
||||
@staticmethod
|
||||
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
|
||||
return np.max(np.abs(a - b))
|
||||
|
||||
@staticmethod
|
||||
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
|
||||
return np.mean(a != b)
|
||||
|
||||
def _compute_score(self, vectors: np.ndarray) -> float:
|
||||
metric = self._get_metric(self.distance_metric)
|
||||
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
|
||||
return score
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
return ["prediction", "prediction_b"]
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
return ["score"]
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: CallbackManagerForChainRun | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
vectors = np.array(
|
||||
self.embeddings.embed_documents(
|
||||
[inputs["prediction"], inputs["prediction_b"]]
|
||||
)
|
||||
)
|
||||
score = self._compute_score(vectors)
|
||||
return {"score": score}
|
||||
|
||||
async def _acall(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: AsyncCallbackManagerForChainRun | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
embedded = await self.embeddings.aembed_documents(
|
||||
[inputs["prediction"], inputs["prediction_b"]]
|
||||
)
|
||||
vectors = np.array(embedded)
|
||||
score = self._compute_score(vectors)
|
||||
return {"score": score}
|
||||
|
||||
def evaluate_string_pairs(
|
||||
self,
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: Optional[str] = None,
|
||||
reference: Optional[str] = None,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
"""Evaluate the embedding distance between two predictions.
|
||||
|
||||
Args:
|
||||
prediction (str): The output string from the first model.
|
||||
prediction_b (str): The output string from the second model.
|
||||
input (str): The input or task string.
|
||||
callbacks (Callbacks, optional): The callbacks to use.
|
||||
reference (str, optional): The reference string, if any.
|
||||
**kwargs (Any): Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing:
|
||||
- score: The embedding distance between the two
|
||||
predictions.
|
||||
"""
|
||||
return self(
|
||||
inputs={"prediction": prediction, "prediction_b": prediction_b},
|
||||
callbacks=callbacks,
|
||||
)
|
||||
|
||||
async def aevaluate_string_pairs(
|
||||
self,
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: Optional[str] = None,
|
||||
reference: Optional[str] = None,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
"""Asynchronously evaluate the embedding distance
|
||||
|
||||
between two predictions.
|
||||
|
||||
Args:
|
||||
prediction (str): The output string from the first model.
|
||||
prediction_b (str): The output string from the second model.
|
||||
input (str): The input or task string.
|
||||
callbacks (Callbacks, optional): The callbacks to use.
|
||||
reference (str, optional): The reference string, if any.
|
||||
**kwargs (Any): Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing:
|
||||
- score: The embedding distance between the two
|
||||
predictions.
|
||||
"""
|
||||
return await self.acall(
|
||||
inputs={"prediction": prediction, "prediction_b": prediction_b},
|
||||
callbacks=callbacks,
|
||||
)
|
||||
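A short usage sketch for the new PairwiseEmbeddingStringEvalChain defined above. This is illustrative only: it assumes OpenAI credentials are available for the default OpenAIEmbeddings, and the two predictions are made-up strings. The returned score is a distance, so values near 0.0 mean the two outputs are close in embedding space:

    from langchain.evaluation.comparison.embedding import (
        EmbeddingDistance,
        PairwiseEmbeddingStringEvalChain,
    )

    # Illustrative usage only; requires OpenAI credentials for the default embeddings.
    chain = PairwiseEmbeddingStringEvalChain(distance_metric=EmbeddingDistance.COSINE)
    result = chain.evaluate_string_pairs(
        prediction="The cat sat on the mat.",
        prediction_b="A cat was sitting on the rug.",
    )
    print(result["score"])  # cosine distance between the two embeddings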
@@ -3,13 +3,12 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from pydantic import Extra, Field
|
||||
from pydantic import Field
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.callbacks.manager import Callbacks
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
|
||||
from langchain.evaluation.schema import EvalChain, PairwiseStringEvaluator
|
||||
from langchain.prompts.prompt import PromptTemplate
|
||||
from langchain.schema import BaseOutputParser
|
||||
|
||||
@@ -51,7 +50,7 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
|
||||
}
|
||||
|
||||
|
||||
class PairwiseStringEvalChain(PairwiseStringEvaluator, EvalChain, LLMChain):
|
||||
class PairwiseStringEvalChain(LLMChain):
|
||||
"""A chain for comparing the output of two models.
|
||||
|
||||
Example:
|
||||
@@ -81,16 +80,11 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, EvalChain, LLMChain):
|
||||
default_factory=PairwiseStringResultOutputParser
|
||||
)
|
||||
|
||||
class Config:
|
||||
"""Configuration for the QAEvalChain."""
|
||||
|
||||
extra = Extra.ignore
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
llm: BaseLanguageModel,
|
||||
*,
|
||||
llm: BaseLanguageModel,
|
||||
prompt: Optional[PromptTemplate] = None,
|
||||
require_reference: bool = False,
|
||||
**kwargs: Any,
|
||||
@@ -127,23 +121,14 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, EvalChain, LLMChain):
|
||||
return cls(llm=llm, prompt=prompt_, **kwargs)
|
||||
|
||||
def _prepare_input(
|
||||
self,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: Optional[str],
|
||||
reference: Optional[str],
|
||||
self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
|
||||
) -> dict:
|
||||
input_ = {
|
||||
"prediction": prediction,
|
||||
"prediction_b": prediction_b,
|
||||
"input": input,
|
||||
}
|
||||
if "input" in self.prompt.input_variables:
|
||||
if not input:
|
||||
raise ValueError("Input is require for this comparison evaluator")
|
||||
input_["input"] = input
|
||||
if "reference" in self.prompt.input_variables:
|
||||
if reference is None:
|
||||
raise ValueError("Reference is required for this comparison evaluator")
|
||||
if reference is not None and "reference" in self.prompt.input_variables:
|
||||
input_["reference"] = reference
|
||||
return input_
|
||||
|
||||
@@ -152,7 +137,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, EvalChain, LLMChain):
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: Optional[str] = None,
|
||||
input: str,
|
||||
reference: Optional[str] = None,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
@@ -188,8 +173,8 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, EvalChain, LLMChain):
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: str,
|
||||
reference: Optional[str] = None,
|
||||
input: Optional[str] = None,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
|
||||
langchain/evaluation/comparison/fuzzy_match.py (new file, 129 lines)
@@ -0,0 +1,129 @@
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
from langchain.callbacks.manager import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
Callbacks,
|
||||
)
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.evaluation.schema import PairwiseStringEvaluator
|
||||
|
||||
|
||||
def _load_rapidfuzz() -> Any:
|
||||
try:
|
||||
import rapidfuzz
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
|
||||
)
|
||||
return rapidfuzz.distance
|
||||
|
||||
|
||||
class StringDistance(str, Enum):
|
||||
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
|
||||
LEVENSHTEIN = "levenshtein"
|
||||
JARO = "jaro"
|
||||
JARO_WINKLER = "jaro_winkler"
|
||||
|
||||
|
||||
class FuzzyMatchStringEvaluator(Chain, PairwiseStringEvaluator):
|
||||
def __init__(
|
||||
self,
|
||||
distance: str = StringDistance.DAMERAU_LEVENSHTEIN,
|
||||
) -> None:
|
||||
self.metric = self._get_metric(distance)
|
||||
|
||||
@staticmethod
|
||||
def _get_metric(distance: str) -> Callable:
|
||||
rf_distance = _load_rapidfuzz()
|
||||
if distance == StringDistance.DAMERAU_LEVENSHTEIN:
|
||||
return rf_distance.DamerauLevenshtein.distance
|
||||
elif distance == StringDistance.LEVENSHTEIN:
|
||||
return rf_distance.Levenshtein.distance
|
||||
elif distance == StringDistance.JARO:
|
||||
return rf_distance.Jaro.distance
|
||||
elif distance == StringDistance.JARO_WINKLER:
|
||||
return rf_distance.JaroWinkler.distance
|
||||
else:
|
||||
raise ValueError(f"Invalid distance metric: {distance}")
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: CallbackManagerForChainRun | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
|
||||
|
||||
async def _acall(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: AsyncCallbackManagerForChainRun | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
|
||||
|
||||
def evaluate_string_pairs(
|
||||
self,
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: Optional[str] = None,
|
||||
reference: Optional[str] = None,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
"""Evaluate the string distance between two predictions.
|
||||
|
||||
Args:
|
||||
prediction (str): The output string from the first model.
|
||||
prediction_b (str): The output string from the second model.
|
||||
input (str): The input or task string.
|
||||
callbacks (Callbacks, optional): The callbacks to use.
|
||||
reference (str, optional): The reference string, if any.
|
||||
**kwargs (Any): Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing:
|
||||
- reasoning: The reasoning for the preference.
|
||||
- value: The preference value, which is either 'A', 'B', or None
|
||||
for no preference.
|
||||
- score: The preference score, which is 1 for 'A', 0 for 'B',
|
||||
and 0.5 for None.
|
||||
"""
|
||||
return self(
|
||||
inputs={"prediction": prediction, "prediction_b": prediction_b},
|
||||
callbacks=callbacks,
|
||||
)
|
||||
|
||||
async def aevaluate_string_pairs(
|
||||
self,
|
||||
*,
|
||||
prediction: str,
|
||||
prediction_b: str,
|
||||
input: Optional[str] = None,
|
||||
reference: Optional[str] = None,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
"""Asynchronously evaluate the string distance between two predictions.
|
||||
|
||||
Args:
|
||||
prediction (str): The output string from the first model.
|
||||
prediction_b (str): The output string from the second model.
|
||||
input (str): The input or task string.
|
||||
callbacks (Callbacks, optional): The callbacks to use.
|
||||
reference (str, optional): The reference string, if any.
|
||||
**kwargs (Any): Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing:
|
||||
- reasoning: The reasoning for the preference.
|
||||
- value: The preference value, which is either 'A', 'B', or None
|
||||
for no preference.
|
||||
- score: The preference score, which is 1 for 'A', 0 for 'B',
|
||||
and 0.5 for None.
|
||||
"""
|
||||
return await self.acall(
|
||||
inputs={"prediction": prediction, "prediction_b": prediction_b},
|
||||
callbacks=callbacks,
|
||||
)
|
||||
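The FuzzyMatchStringEvaluator above delegates scoring to rapidfuzz. As a point of reference, this sketch shows roughly what the wrapped metric functions return on their own (it assumes rapidfuzz is installed; for the raw edit distances, lower means more similar):

    from rapidfuzz.distance import DamerauLevenshtein, JaroWinkler, Levenshtein

    a, b = "the quick brown fox", "the quikc brown fox"
    print(Levenshtein.distance(a, b))         # 2: a transposition counts as two edits
    print(DamerauLevenshtein.distance(a, b))  # 1: transpositions are a single edit
    print(JaroWinkler.distance(a, b))         # float in [0, 1]; 0.0 means identical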
@@ -2,13 +2,12 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
|
||||
|
||||
from pydantic import Extra, Field
|
||||
from pydantic import Field
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
|
||||
from langchain.evaluation.schema import EvalChain, StringEvaluator
|
||||
from langchain.schema import BaseOutputParser, BasePromptTemplate
|
||||
|
||||
_SUPPORTED_CRITERIA = {
|
||||
@@ -60,7 +59,7 @@ CRITERIA_TYPE = Union[
|
||||
]
|
||||
|
||||
|
||||
class CriteriaEvalChain(StringEvaluator, EvalChain, LLMChain):
|
||||
class CriteriaEvalChain(LLMChain):
|
||||
"""LLM Chain for evaluating runs against criteria.
|
||||
|
||||
Parameters
|
||||
@@ -97,30 +96,10 @@ class CriteriaEvalChain(StringEvaluator, EvalChain, LLMChain):
|
||||
>>> chain = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
|
||||
"""
|
||||
|
||||
requires_reference: bool = False
|
||||
"""Whether the evaluation template expects a reference text."""
|
||||
output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
|
||||
"""The parser to use to map the output to a structured result."""
|
||||
criteria_names: List[str] = Field(default_factory=list)
|
||||
|
||||
class Config:
|
||||
"""Configuration for the QAEvalChain."""
|
||||
|
||||
extra = Extra.ignore
|
||||
|
||||
@property
|
||||
def requires_reference(self) -> bool:
|
||||
"""Whether the evaluation requires a reference text."""
|
||||
return "reference" in self.prompt.input_variables
|
||||
|
||||
@property
|
||||
def evaluation_name(self) -> str:
|
||||
"""Get the name of the evaluation.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
The name of the evaluation.
|
||||
"""
|
||||
return " ".join(self.criteria_names)
|
||||
|
||||
@staticmethod
|
||||
def get_supported_default_criteria() -> List[str]:
|
||||
@@ -143,7 +122,7 @@ class CriteriaEvalChain(StringEvaluator, EvalChain, LLMChain):
|
||||
@classmethod
|
||||
def resolve_criteria(
|
||||
cls,
|
||||
criteria: Optional[CRITERIA_TYPE],
|
||||
criteria: CRITERIA_TYPE,
|
||||
) -> Dict[str, str]:
|
||||
"""Resolve the criteria to evaluate.
|
||||
|
||||
@@ -169,10 +148,6 @@ class CriteriaEvalChain(StringEvaluator, EvalChain, LLMChain):
|
||||
{'relevance': 'Is the submission referring to a real quote from the text?',
|
||||
'coherence': 'Is the submission coherent, well-structured, and organized?'}
|
||||
""" # noqa: E501
|
||||
if criteria is None:
|
||||
return {
|
||||
"helpfulness": _SUPPORTED_CRITERIA["helpfulness"],
|
||||
}
|
||||
if isinstance(criteria, str):
|
||||
criteria_ = {criteria: _SUPPORTED_CRITERIA[criteria]}
|
||||
elif isinstance(criteria, ConstitutionalPrinciple):
|
||||
@@ -197,7 +172,7 @@ class CriteriaEvalChain(StringEvaluator, EvalChain, LLMChain):
|
||||
def from_llm(
|
||||
cls,
|
||||
llm: BaseLanguageModel,
|
||||
criteria: Optional[CRITERIA_TYPE] = None,
|
||||
criteria: CRITERIA_TYPE,
|
||||
*,
|
||||
prompt: Optional[BasePromptTemplate] = None,
|
||||
requires_reference: bool = False,
|
||||
@@ -209,7 +184,7 @@ class CriteriaEvalChain(StringEvaluator, EvalChain, LLMChain):
|
||||
----------
|
||||
llm : BaseLanguageModel
|
||||
The language model to use for evaluation.
|
||||
criteria : CRITERIA_TYPE - default=None for "helpfulness"
|
||||
criteria : CRITERIA_TYPE
|
||||
The criteria to evaluate the runs against. It can be:
|
||||
- a mapping of criterion names to descriptions
|
||||
- a sequence of criterion names
|
||||
@@ -256,14 +231,10 @@ class CriteriaEvalChain(StringEvaluator, EvalChain, LLMChain):
|
||||
else:
|
||||
prompt = PROMPT
|
||||
criteria_ = cls.resolve_criteria(criteria)
|
||||
criteria_names = list(criteria_.keys())
|
||||
criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
|
||||
prompt_ = prompt.partial(criteria=criteria_str)
|
||||
return cls(
|
||||
llm=llm,
|
||||
prompt=prompt_,
|
||||
criteria_names=criteria_names,
|
||||
**kwargs,
|
||||
llm=llm, prompt=prompt_, requires_reference=requires_reference, **kwargs
|
||||
)
|
||||
|
||||
def _get_eval_input(
|
||||
|
||||
@@ -1,107 +1,8 @@
|
||||
"""Loading datasets and evaluators."""
|
||||
from typing import Any, Dict, List, Optional, Sequence, Type
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chat_models.openai import ChatOpenAI
|
||||
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
|
||||
from langchain.evaluation.comparison import PairwiseStringEvalChain
|
||||
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
|
||||
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
|
||||
from langchain.evaluation.schema import EvalChain, EvaluatorType
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def load_dataset(uri: str) -> List[Dict]:
|
||||
"""Load a dataset from the LangChainDatasets HuggingFace org."""
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset(f"LangChainDatasets/{uri}")
|
||||
return [d for d in dataset["train"]]
|
||||
|
||||
|
||||
_EVALUATOR_MAP: Dict[EvaluatorType, Type[EvalChain]] = {
|
||||
EvaluatorType.QA: QAEvalChain,
|
||||
EvaluatorType.COT_QA: CotQAEvalChain,
|
||||
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
|
||||
EvaluatorType.PAIRWISE_STRING: PairwiseStringEvalChain,
|
||||
EvaluatorType.AGENT_TRAJECTORY: TrajectoryEvalChain,
|
||||
EvaluatorType.CRITERIA: CriteriaEvalChain,
|
||||
}
|
||||
|
||||
|
||||
def _load_evaluator(
|
||||
evaluator: EvaluatorType,
|
||||
*,
|
||||
llm: Optional[BaseLanguageModel] = None,
|
||||
**kwargs: Any,
|
||||
) -> Chain:
|
||||
"""Load the requested evaluation chain specified by a string.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
evaluator : EvaluatorType
|
||||
The type of evaluator to load.
|
||||
llm : BaseLanguageModel, optional
|
||||
The language model to use for evaluation, by default None
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to the evaluator.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Chain
|
||||
The loaded evaluation chain.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
|
||||
>>> evaluator = _load_evaluator("qa", llm=llm)
|
||||
"""
|
||||
llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
|
||||
if evaluator not in _EVALUATOR_MAP:
|
||||
raise ValueError(
|
||||
f"Unknown evaluator type: {evaluator}"
|
||||
f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
|
||||
)
|
||||
return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs)
|
||||
|
||||
|
||||
def load_evaluators(
|
||||
evaluators: Sequence[EvaluatorType],
|
||||
*,
|
||||
llm: Optional[BaseLanguageModel] = None,
|
||||
config: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Chain]:
|
||||
"""Load evaluators specified by a list of evaluator types.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
evaluators : Sequence[EvaluatorType]
|
||||
The list of evaluator types to load.
|
||||
llm : BaseLanguageModel, optional
|
||||
The language model to use for evaluation. If none is provided, a default
|
||||
ChatOpenAI gpt-4 model will be used.
|
||||
config : dict, optional
|
||||
A dictionary mapping evaluator types to additional keyword arguments,
|
||||
by default None
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to all evaluators.
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[Chain]
|
||||
The loaded evaluators.
|
||||
|
||||
Examples
|
||||
--------
|
||||
.. code-block:: python
|
||||
from langchain.evaluation import load_evaluators, EvaluatorType
|
||||
evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
|
||||
loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
|
||||
"""
|
||||
llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
|
||||
loaded = []
|
||||
for evaluator in evaluators:
|
||||
_kwargs = config.get(evaluator, {}) if config else {}
|
||||
loaded.append(_load_evaluator(evaluator, llm=llm, **{**kwargs, **_kwargs}))
|
||||
return loaded
|
||||
|
||||
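The config argument described in the docstring above is easiest to see with an example. This sketch uses illustrative values only and relies on the load_evaluators implementation shown in this hunk: a shared llm is passed to every evaluator, while the criteria evaluator gets its own keyword arguments through config:

    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.loading import load_evaluators
    from langchain.evaluation.schema import EvaluatorType

    loaded = load_evaluators(
        [EvaluatorType.QA, EvaluatorType.CRITERIA],
        llm=ChatOpenAI(model="gpt-4", temperature=0),
        # per-evaluator kwargs; "conciseness" is just an example criterion
        config={EvaluatorType.CRITERIA: {"criteria": "conciseness"}},
    )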
@@ -3,14 +3,11 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any, List, Optional, Sequence
|
||||
|
||||
from pydantic import Extra
|
||||
|
||||
from langchain import PromptTemplate
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.callbacks.manager import Callbacks
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
|
||||
from langchain.evaluation.schema import EvalChain, StringEvaluator
|
||||
|
||||
|
||||
def _parse_string_eval_output(text: str) -> dict:
|
||||
@@ -41,22 +38,9 @@ def _parse_string_eval_output(text: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
class QAEvalChain(LLMChain, StringEvaluator, EvalChain):
|
||||
class QAEvalChain(LLMChain):
|
||||
"""LLM Chain specifically for evaluating question answering."""
|
||||
|
||||
class Config:
|
||||
"""Configuration for the QAEvalChain."""
|
||||
|
||||
extra = Extra.ignore
|
||||
|
||||
@property
|
||||
def evaluation_name(self) -> str:
|
||||
return "correctness"
|
||||
|
||||
@property
|
||||
def requires_reference(self) -> bool:
|
||||
return True
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls, llm: BaseLanguageModel, prompt: PromptTemplate = PROMPT, **kwargs: Any
|
||||
@@ -150,7 +134,7 @@ class QAEvalChain(LLMChain, StringEvaluator, EvalChain):
|
||||
return _parse_string_eval_output(result["text"])
|
||||
|
||||
|
||||
class ContextQAEvalChain(LLMChain, StringEvaluator, EvalChain):
|
||||
class ContextQAEvalChain(LLMChain):
|
||||
"""LLM Chain specifically for evaluating QA w/o GT based on context"""
|
||||
|
||||
@classmethod
|
||||
@@ -162,10 +146,6 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, EvalChain):
|
||||
f"but got {prompt.input_variables}"
|
||||
)
|
||||
|
||||
@property
|
||||
def evaluation_name(self) -> str:
|
||||
return "Contextual Accuracy"
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
@@ -246,10 +226,6 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, EvalChain):
|
||||
class CotQAEvalChain(ContextQAEvalChain):
|
||||
"""LLM Chain specifically for evaluating QA using chain of thought reasoning."""
|
||||
|
||||
@property
|
||||
def evaluation_name(self) -> str:
|
||||
return "COT Contextual Accuracy"
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls, llm: BaseLanguageModel, prompt: PromptTemplate = COT_PROMPT, **kwargs: Any
|
||||
|
||||
@@ -11,9 +11,6 @@ from langchain.evaluation.run_evaluators.implementations import (
|
||||
get_qa_evaluator,
|
||||
get_trajectory_evaluator,
|
||||
)
|
||||
from langchain.evaluation.run_evaluators.string_run_evaluator import (
|
||||
StringRunEvaluatorChain,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"RunEvaluatorChain",
|
||||
@@ -24,5 +21,4 @@ __all__ = [
|
||||
"get_trajectory_evaluator",
|
||||
"StringRunEvaluatorInputMapper",
|
||||
"ChoicesOutputParser",
|
||||
"StringRunEvaluatorChain",
|
||||
]
|
||||
|
||||
@@ -21,10 +21,6 @@ class RunEvaluatorInputMapper:
|
||||
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
|
||||
"""Maps the Run and Optional[Example] to a dictionary"""
|
||||
|
||||
def __call__(self, run: Run, example: Optional[Example] = None) -> Any:
|
||||
"""Maps the Run and Optional[Example] to a dictionary"""
|
||||
return self.map(run, example)
|
||||
|
||||
|
||||
class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
|
||||
"""Parse the output of a run."""
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
""""Loading helpers for run evaluators."""
|
||||
|
||||
|
||||
from typing import Any, List, Optional, Sequence, Union
|
||||
|
||||
from langchainplus_sdk import RunEvaluator
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.evaluation.loading import load_evaluators
|
||||
from langchain.evaluation.run_evaluators.string_run_evaluator import (
|
||||
StringRunEvaluatorChain,
|
||||
)
|
||||
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
|
||||
from langchain.tools.base import Tool
|
||||
|
||||
|
||||
def load_run_evaluators_for_model(
|
||||
evaluators: Sequence[EvaluatorType],
|
||||
model: Union[Chain, BaseLanguageModel, Tool],
|
||||
*,
|
||||
input_key: Optional[str] = None,
|
||||
prediction_key: Optional[str] = None,
|
||||
reference_key: Optional[str] = None,
|
||||
eval_llm: Optional[BaseLanguageModel] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[RunEvaluator]:
|
||||
"""Load evaluators specified by a list of evaluator types.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
evaluators : Sequence[EvaluatorType]
|
||||
The list of evaluator types to load.
|
||||
model : Union[Chain, BaseLanguageModel, Tool]
|
||||
The model to evaluate. Used to infer how to parse the run.
|
||||
input_key : Optional[str], a chain run's input key to map
|
||||
to the evaluator's input
|
||||
prediction_key : Optional[str], the key in the run's outputs to
|
||||
represent the Chain prediction
|
||||
reference_key : Optional[str], the key in the dataset example (row)
|
||||
outputs to represent the reference, or ground-truth label
|
||||
eval_llm : BaseLanguageModel, optional
|
||||
The language model to use for evaluation. If none is provided, a default
|
||||
ChatOpenAI gpt-4 model will be used.
|
||||
**kwargs : Any
|
||||
Additional keyword arguments to pass to all evaluators.
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[RunEvaluator]
|
||||
The loaded Run evaluators.
|
||||
"""
|
||||
evaluators_ = load_evaluators(evaluators, llm=eval_llm, **kwargs)
|
||||
run_evaluators = []
|
||||
for evaluator in evaluators_:
|
||||
if isinstance(evaluator, StringEvaluator):
|
||||
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
|
||||
model,
|
||||
evaluator,
|
||||
input_key=input_key,
|
||||
prediction_key=prediction_key,
|
||||
reference_key=reference_key,
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"Run evaluator for {evaluator} is not implemented"
|
||||
)
|
||||
run_evaluators.append(run_evaluator)
|
||||
return run_evaluators
|
||||
@@ -1,138 +0,0 @@
|
||||
"""Run evaluator mapper for message evaluators."""
|
||||
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List, Optional, Union, TypedDict
|
||||
from langchain.schema import BaseMessage
|
||||
|
||||
from langchainplus_sdk.schemas import Example, Run
|
||||
from langchain.load.serializable import Serializable
|
||||
from langchain.schema import messages_from_dict
|
||||
|
||||
|
||||
class RunMapping(TypedDict):
|
||||
prediction: BaseMessage
|
||||
input: List[BaseMessage]
|
||||
|
||||
|
||||
class ExampleMapping(TypedDict):
|
||||
reference: BaseMessage
|
||||
|
||||
|
||||
class MessageRunMapper(Serializable):
|
||||
"""Extract items to evaluate from run object"""
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""The keys to extract from the run."""
|
||||
return ["prediction", "input"]
|
||||
|
||||
@abstractmethod
|
||||
def map(self, run: Run) -> RunMapping:
|
||||
"""Maps the Run to a dictionary."""
|
||||
|
||||
def __call__(self, run: Run) -> RunMapping:
|
||||
"""Maps the Run to a dictionary."""
|
||||
if not run.outputs:
|
||||
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
|
||||
return self.map(run)
|
||||
|
||||
|
||||
class MessageExampleMapper(Serializable):
|
||||
"""Map an example, or row in the dataset, to the inputs of an evaluation."""
|
||||
|
||||
reference_key: Optional[str] = None
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""The keys to extract from the run."""
|
||||
return ["reference"]
|
||||
|
||||
def map(self, example: Example) -> ExampleMapping:
|
||||
"""Maps the Example, or dataset row to a dictionary."""
|
||||
if not example.outputs:
|
||||
raise ValueError(
|
||||
f"Example {example.id} has no outputs to use as a reference."
|
||||
)
|
||||
if self.reference_key is None:
|
||||
if len(example.outputs) > 1:
|
||||
raise ValueError(
|
||||
f"Example {example.id} has multiple outputs, so you must"
|
||||
" specify a reference_key."
|
||||
)
|
||||
else:
|
||||
output = list(example.outputs.values())[0]
|
||||
return {
|
||||
"reference": output if isinstance(output, BaseMessage) else messages_from_dict([output])[0]
|
||||
}
|
||||
elif self.reference_key not in example.outputs:
|
||||
raise ValueError(
|
||||
f"Example {example.id} does not have reference key"
|
||||
f" {self.reference_key}."
|
||||
)
|
||||
output = example.outputs[self.reference_key]
|
||||
return {"reference": output if isinstance(output, BaseMessage) else messages_from_dict([output])[0]}
|
||||
|
||||
def __call__(self, example: Example) -> ExampleMapping:
|
||||
"""Maps the Run and Example to a dictionary."""
|
||||
if not example.outputs:
|
||||
raise ValueError(
|
||||
f"Example {example.id} has no outputs to use as areference label."
|
||||
)
|
||||
return self.map(example)
|
||||
|
||||
|
||||
class ChatModelMessageRunMapper(MessageRunMapper):
|
||||
"""Extract items to evaluate from run object."""
|
||||
|
||||
@staticmethod
|
||||
def extract_inputs(inputs: Dict) -> List[BaseMessage]:
|
||||
if not inputs.get("messages"):
|
||||
raise ValueError("Run must have messages as inputs.")
|
||||
if "messages" in inputs:
|
||||
if isinstance(inputs["messages"], list) and inputs["messages"]:
|
||||
if isinstance(inputs["messages"][0], BaseMessage):
|
||||
return messages_from_dict(inputs["messages"])
|
||||
elif isinstance(inputs["messages"][0], list):
|
||||
# Runs from Tracer have messages as a list of lists of dicts
|
||||
return messages_from_dict(inputs["messages"][0])
|
||||
raise ValueError(f"Could not extract messages from inputs: {inputs}")
|
||||
|
||||
@staticmethod
|
||||
def extract_outputs(outputs: Dict) -> BaseMessage:
|
||||
if not outputs.get("generations"):
|
||||
raise ValueError("LLM Run must have generations as outputs.")
|
||||
first_generation: Dict = outputs["generations"][0]
|
||||
if isinstance(first_generation, list):
|
||||
# Runs from Tracer have generations as a list of lists of dicts
|
||||
# Whereas Runs from the API have a list of dicts
|
||||
first_generation = first_generation[0]
|
||||
if "message" in first_generation:
|
||||
return messages_from_dict([first_generation["message"]])[0]
|
||||
|
||||
def map(self, run: Run) -> RunMapping:
|
||||
"""Maps the Run to a dictionary."""
|
||||
if run.run_type != "llm":
|
||||
raise ValueError("ChatModel RunMapper only supports LangSmith runs of type llm.")
|
||||
elif not run.outputs:
|
||||
if run.error:
|
||||
raise ValueError(
|
||||
f"Cannot evaluate errored LLM run {run.id}: {run.error}"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Run {run.id} has no outputs. Cannot evaluate this run."
|
||||
)
|
||||
else:
|
||||
try:
|
||||
inputs = self.extract_inputs(run.inputs)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Could not parse LM input from run inputs {run.inputs}"
|
||||
) from e
|
||||
try:
|
||||
output_ = self.extract_outputs(run.outputs)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Could not parse LM prediction from run outputs {run.outputs}"
|
||||
) from e
|
||||
return {"input": inputs, "prediction": output_}
|
||||
@@ -1,392 +0,0 @@
|
||||
"""Run evaluator wrapper for string evaluators."""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Dict, List, Optional, Union, Protocol
|
||||
|
||||
from langchainplus_sdk import EvaluationResult, RunEvaluator
|
||||
from langchainplus_sdk.schemas import Example, Run
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.callbacks.manager import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
)
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.evaluation.schema import StringEvaluator, MessageEvaluator
|
||||
from langchain.load.serializable import Serializable
|
||||
from langchain.schema import RUN_KEY, get_buffer_string, messages_from_dict
|
||||
from langchain.tools.base import Tool
|
||||
|
||||
|
||||
class StringRunMapper(Serializable):
|
||||
"""Extract items to evaluate from the run object."""
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""The keys to extract from the run."""
|
||||
return ["prediction", "input"]
|
||||
|
||||
@abstractmethod
|
||||
def map(self, run: Run) -> Dict[str, str]:
|
||||
"""Maps the Run to a dictionary."""
|
||||
|
||||
def __call__(self, run: Run) -> Dict[str, str]:
|
||||
"""Maps the Run to a dictionary."""
|
||||
if not run.outputs:
|
||||
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
|
||||
return self.map(run)
|
||||
|
||||
|
||||
class LLMStringRunMapper(StringRunMapper):
|
||||
"""Extract items to evaluate from the run object."""
|
||||
|
||||
def serialize_chat_messages(self, messages: List[Dict]) -> str:
|
||||
"""Extract the input messages from the run."""
|
||||
chat_messages = messages_from_dict(messages)
|
||||
return get_buffer_string(chat_messages)
|
||||
|
||||
def serialize_inputs(self, inputs: Dict) -> str:
|
||||
if "prompts" in inputs: # Should we even accept this?
|
||||
input_ = "\n\n".join(inputs["prompts"])
|
||||
elif "prompt" in inputs:
|
||||
input_ = inputs["prompt"]
|
||||
elif "messages" in inputs:
|
||||
input_ = self.serialize_chat_messages(inputs["messages"])
|
||||
else:
|
||||
raise ValueError("LLM Run must have either messages or prompts as inputs.")
|
||||
return input_
|
||||
|
||||
def serialize_outputs(self, outputs: Dict) -> str:
|
||||
if not outputs.get("generations"):
|
||||
raise ValueError("LLM Run must have generations as outputs.")
|
||||
first_generation: Dict = outputs["generations"][0]
|
||||
if isinstance(first_generation, list):
|
||||
# Runs from Tracer have generations as a list of lists of dicts
|
||||
# Whereas Runs from the API have a list of dicts
|
||||
first_generation = first_generation[0]
|
||||
if "message" in first_generation:
|
||||
output_ = self.serialize_chat_messages([first_generation["message"]])
|
||||
else:
|
||||
output_ = first_generation["text"]
|
||||
return output_
|
||||
|
||||
def map(self, run: Run) -> Dict[str, str]:
|
||||
"""Maps the Run to a dictionary."""
|
||||
if run.run_type != "llm":
|
||||
raise ValueError("LLM RunMapper only supports LLM runs.")
|
||||
elif not run.outputs:
|
||||
if run.error:
|
||||
raise ValueError(
|
||||
f"Cannot evaluate errored LLM run {run.id}: {run.error}"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Run {run.id} has no outputs. Cannot evaluate this run."
|
||||
)
|
||||
else:
|
||||
try:
|
||||
inputs = self.serialize_inputs(run.inputs)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Could not parse LM input from run inputs {run.inputs}"
|
||||
) from e
|
||||
try:
|
||||
output_ = self.serialize_outputs(run.outputs)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Could not parse LM prediction from run outputs {run.outputs}"
|
||||
) from e
|
||||
return {"input": inputs, "prediction": output_}
|
||||
|
||||
|
||||
class ChainStringRunMapper(StringRunMapper):
|
||||
"""Extract items to evaluate from the run object from a chain."""
|
||||
|
||||
input_key: str
|
||||
"""The key from the chain Run's inputs to use as the eval input."""
|
||||
prediction_key: str
|
||||
"""The key from the chain Run's outputs to use as the eval prediction."""
|
||||
|
||||
@classmethod
|
||||
def from_chain(
|
||||
cls,
|
||||
model: Chain,
|
||||
input_key: Optional[str] = None,
|
||||
prediction_key: Optional[str] = None,
|
||||
) -> ChainStringRunMapper:
|
||||
"""Create a RunMapper from a chain."""
|
||||
error_messages = []
|
||||
if input_key is None:
|
||||
if len(model.input_keys) > 1:
|
||||
error_messages.append(
|
||||
f"Chain {model.lc_namespace} has multiple input"
|
||||
" keys. Please specify 'input_key' when loading."
|
||||
)
|
||||
else:
|
||||
input_key = model.input_keys[0]
|
||||
elif input_key not in model.input_keys:
|
||||
error_messages.append(
|
||||
f"Chain {model.lc_namespace} does not have specified"
|
||||
f" input key {input_key}."
|
||||
)
|
||||
if prediction_key is None:
|
||||
if len(model.output_keys) > 1:
|
||||
error_messages.append(
|
||||
f"Chain {model.lc_namespace} has multiple"
|
||||
" output keys. Please specify 'prediction_key' when loading."
|
||||
)
|
||||
else:
|
||||
prediction_key = model.output_keys[0]
|
||||
elif prediction_key not in model.output_keys:
|
||||
error_messages.append(
|
||||
f"Chain {model.lc_namespace} does not have specified"
|
||||
f" prediction_key {prediction_key}."
|
||||
)
|
||||
if error_messages:
|
||||
raise ValueError("\n".join(error_messages))
|
||||
if input_key is None or prediction_key is None:
|
||||
# This should never happen, but mypy doesn't know that.
|
||||
raise ValueError(f"Chain {model.lc_namespace} has no input or output keys.")
|
||||
return cls(input_key=input_key, prediction_key=prediction_key)
|
||||
|
||||
def map(self, run: Run) -> Dict[str, str]:
|
||||
"""Maps the Run to a dictionary."""
|
||||
if not run.outputs:
|
||||
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
|
||||
if run.run_type != "chain":
|
||||
raise ValueError("Chain RunMapper only supports Chain runs.")
|
||||
if self.input_key not in run.inputs:
|
||||
raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
|
||||
elif self.prediction_key not in run.outputs:
|
||||
raise ValueError(
|
||||
f"Run {run.id} does not have prediction key {self.prediction_key}."
|
||||
)
|
||||
else:
|
||||
return {
|
||||
"input": run.inputs[self.input_key],
|
||||
"prediction": run.outputs[self.prediction_key],
|
||||
}
|
||||
|
||||
|
||||
class ToolStringRunMapper(StringRunMapper):
|
||||
"""Map an input to the tool."""
|
||||
|
||||
def map(self, run: Run) -> Dict[str, str]:
|
||||
if not run.outputs:
|
||||
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
|
||||
return {"input": run.inputs["input"], "prediction": run.outputs["output"]}
|
||||
|
||||
|
||||
class StringExampleMapper(Serializable):
|
||||
"""Map an example, or row in the dataset, to the inputs of an evaluation."""
|
||||
|
||||
reference_key: Optional[str] = None
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""The keys to extract from the run."""
|
||||
return ["reference"]
|
||||
|
||||
def serialize_chat_messages(self, messages: List[Dict]) -> str:
|
||||
"""Extract the input messages from the run."""
|
||||
chat_messages = messages_from_dict(messages)
|
||||
return get_buffer_string(chat_messages)
|
||||
|
||||
def map(self, example: Example) -> Dict[str, str]:
|
||||
"""Maps the Example, or dataset row to a dictionary."""
|
||||
if not example.outputs:
|
||||
raise ValueError(
|
||||
f"Example {example.id} has no outputs to use as a reference."
|
||||
)
|
||||
if self.reference_key is None:
|
||||
if len(example.outputs) > 1:
|
||||
raise ValueError(
|
||||
f"Example {example.id} has multiple outputs, so you must"
|
||||
" specify a reference_key."
|
||||
)
|
||||
else:
|
||||
output = list(example.outputs.values())[0]
|
||||
return {
|
||||
"reference": output
|
||||
if type(output) == str
|
||||
else self.serialize_chat_messages([output])
|
||||
}
|
||||
elif self.reference_key not in example.outputs:
|
||||
raise ValueError(
|
||||
f"Example {example.id} does not have reference key"
|
||||
f" {self.reference_key}."
|
||||
)
|
||||
return {"reference": example.outputs[self.reference_key]}
|
||||
|
||||
def __call__(self, example: Example) -> Dict[str, Any]:
|
||||
"""Maps the Run and Example to a dictionary."""
|
||||
if not example.outputs:
|
||||
raise ValueError(
|
||||
f"Example {example.id} has no outputs to use as areference label."
|
||||
)
|
||||
return self.map(example)
|
||||
|
||||
|
||||
# TODO(agola11) can make these abstract classes
|
||||
class BaseRunMapper(Protocol):
|
||||
def map(self, run: Run) -> Dict[str, Any]: ...
|
||||
|
||||
|
||||
class BaseExampleMapper(Protocol):
|
||||
def map(self, example: Example) -> Dict[str, Any]: ...
|
||||
|
||||
|
||||
class SimpleRunEvaluatorChain(Chain, RunEvaluator):
|
||||
"""Evaluate Run and optional examples."""
|
||||
|
||||
run_mapper: BaseRunMapper
|
||||
"""Maps the Run to a dictionary with 'input' and 'prediction' strings."""
|
||||
example_mapper: Optional[BaseExampleMapper] = None
|
||||
"""Maps the Example (dataset row) to a dictionary
|
||||
with a 'reference' string."""
|
||||
name: str
|
||||
"""The name of the evaluation metric."""
|
||||
evaluator: Union[StringEvaluator, MessageEvaluator]
|
||||
"""The evaluation chain."""
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
return ["run", "example"]
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
return ["feedback"]
|
||||
|
||||
def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
|
||||
run: Run = inputs["run"]
|
||||
example: Optional[Example] = inputs.get("example")
|
||||
evaluate_inputs = self.run_mapper.map(run)
|
||||
if self.example_mapper:
|
||||
if not example:
|
||||
raise ValueError(
|
||||
f"Evaluator {self.name} requires an reference"
|
||||
" example from the dataset,"
|
||||
f" but none was provided for run {run.id}."
|
||||
)
|
||||
evaluate_inputs.update(self.example_mapper.map(example))
|
||||
return evaluate_inputs
|
||||
|
||||
def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult:
|
||||
evaluation_result = EvaluationResult(key=self.name, **output)
|
||||
if RUN_KEY in output:
|
||||
# TODO: Not currently surfaced. Update
|
||||
evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
|
||||
return evaluation_result
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, Any]:
|
||||
evaluate_inputs = self._prepare_input(inputs)
|
||||
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
|
||||
callbacks = _run_manager.get_child()
|
||||
|
||||
if isinstance(self.evaluator, StringEvaluator):
|
||||
chain_output = self.evaluator.evaluate_strings(
|
||||
**evaluate_inputs,
|
||||
callbacks=callbacks,
|
||||
)
|
||||
elif isinstance(self.evaluator, MessageEvaluator):
|
||||
chain_output = self.evaluator.evaluate_messages(
|
||||
**evaluate_inputs,
|
||||
callbacks=callbacks,
|
||||
)
|
||||
else:
|
||||
raise ValueError("Unsupported evaluator type")
|
||||
|
||||
evaluation_result = self._prepare_output(chain_output)
|
||||
return {"feedback": evaluation_result}
|
||||
|
||||
async def _acall(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
run_manager: AsyncCallbackManagerForChainRun | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
evaluate_inputs = self._prepare_input(inputs)
|
||||
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
|
||||
callbacks = _run_manager.get_child()
|
||||
|
||||
if isinstance(self.evaluator, StringEvaluator):
|
||||
chain_output = await self.evaluator.aevaluate_strings(
|
||||
**evaluate_inputs,
|
||||
callbacks=callbacks,
|
||||
)
|
||||
elif isinstance(self.evaluator, MessageEvaluator):
|
||||
chain_output = await self.evaluator.aevaluate_messages(
|
||||
**evaluate_inputs,
|
||||
callbacks=callbacks,
|
||||
)
|
||||
else:
|
||||
raise ValueError("Unsupported evaluator type")
|
||||
|
||||
evaluation_result = self._prepare_output(chain_output)
|
||||
return {"feedback": evaluation_result}
|
||||
|
||||
def evaluate_run(
|
||||
self, run: Run, example: Optional[Example] = None
|
||||
) -> EvaluationResult:
|
||||
"""Evaluate an example."""
|
||||
return self({"run": run, "example": example})["feedback"]
|
||||
|
||||
async def aevaluate_run(
|
||||
self, run: Run, example: Optional[Example] = None
|
||||
) -> EvaluationResult:
|
||||
"""Evaluate an example."""
|
||||
result = await self.acall({"run": run, "example": example})
|
||||
return result["feedback"]
|
||||
|
||||
# TODO: Add ability to load message evaluators
|
||||
@classmethod
|
||||
def from_model_and_evaluator(
|
||||
cls,
|
||||
model: Union[Chain, BaseLanguageModel, Tool],
|
||||
evaluator: Union[StringEvaluator, MessageEvaluator],
|
||||
input_key: Optional[str] = None,
|
||||
prediction_key: Optional[str] = None,
|
||||
reference_key: Optional[str] = None,
|
||||
) -> SimpleRunEvaluatorChain:
|
||||
"""Create a StringRunEvaluatorChain from a model and evaluator."""
|
||||
|
||||
if isinstance(evaluator, StringEvaluator):
|
||||
if isinstance(model, BaseLanguageModel):
|
||||
run_mapper: StringRunMapper = LLMStringRunMapper()
|
||||
elif isinstance(model, Chain):
|
||||
run_mapper = ChainStringRunMapper.from_chain(
|
||||
model, input_key=input_key, prediction_key=prediction_key
|
||||
)
|
||||
elif isinstance(model, Tool):
|
||||
run_mapper = ToolStringRunMapper()
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"{cls.__name__}.from_model_and_evaluator({type(model)})"
|
||||
" not yet implemented."
|
||||
"Expected one of [BaseLanguageModel, Chain, Tool]."
|
||||
)
|
||||
if reference_key is not None or isinstance(model, BaseLanguageModel):
|
||||
example_mapper = StringExampleMapper(reference_key=reference_key)
|
||||
elif evaluator.requires_reference:
|
||||
raise ValueError(
|
||||
f"Evaluator {evaluator.evaluation_name} requires a reference"
|
||||
" example from the dataset. Please specify the reference key from"
|
||||
" amongst the dataset outputs keys."
|
||||
)
|
||||
else:
|
||||
example_mapper = None
|
||||
elif isinstance(evaluator, MessageEvaluator):
|
||||
raise NotImplementedError()
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
return cls(
|
||||
name=evaluator.evaluation_name,
|
||||
run_mapper=run_mapper,
|
||||
example_mapper=example_mapper,
|
||||
evaluator=evaluator,
|
||||
)
|
||||
@@ -1,56 +1,12 @@
|
||||
"""Interfaces to be implemented by general evaluators."""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import Any, Optional, List
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.schema import BaseMessage, get_buffer_string
|
||||
from abc import abstractmethod
|
||||
from typing import Any, Optional, Protocol, runtime_checkable
|
||||
|
||||
|
||||
class EvaluatorType(str, Enum):
|
||||
"""The types of the evaluators."""
|
||||
|
||||
QA = "qa"
|
||||
"""Question answering evaluator, which grades answers to questions
|
||||
directly using an LLM."""
|
||||
COT_QA = "cot_qa"
|
||||
"""Chain of thought question answering evaluator, which grades
|
||||
answers to questions using
|
||||
chain of thought 'reasoning'."""
|
||||
CONTEXT_QA = "context_qa"
|
||||
"""Question answering evaluator that incorporates 'context' in the response."""
|
||||
PAIRWISE_STRING = "pairwise_string"
|
||||
"""The pairwise string evaluator, which compares the output of two models."""
|
||||
AGENT_TRAJECTORY = "trajectory"
|
||||
"""The agent trajectory evaluator, which grades the agent's intermediate steps."""
|
||||
CRITERIA = "criteria"
|
||||
"""The criteria evaluator, which evaluates a model based on a
|
||||
custom set of criteria."""
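
# --- Editor's note: illustrative usage sketch, not part of this diff. ---
# The enum members above are consumed by load_evaluators. The sketch assumes
# the import paths shown elsewhere in this change (EvaluatorType lives in the
# schema/loading modules depending on which side of the diff you are on) and
# uses ChatOpenAI purely as an example LLM.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.schema import EvaluatorType

llm = ChatOpenAI(temperature=0)
# Evaluators can be requested by enum member or by the plain string value ("qa").
qa_evaluator, criteria_evaluator = load_evaluators(
    [EvaluatorType.QA, EvaluatorType.CRITERIA], llm=llm
)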


class EvalChain(Chain):
    """A base class for evaluators that use an LLM."""

    @classmethod
    @abstractmethod
    def from_llm(cls, llm: BaseLanguageModel, **kwargs: Any) -> EvalChain:
        """Create a new evaluator from an LLM."""


class StringEvaluator(ABC):
@runtime_checkable
class StringEvaluator(Protocol):
    """Protocol for evaluating strings."""

    @property
    def evaluation_name(self) -> str:
        raise NotImplementedError()

    @property
    def requires_reference(self) -> bool:
        return False

    @abstractmethod
    def evaluate_strings(
        self,
@@ -70,10 +26,6 @@ class StringEvaluator(ABC):
            **kwargs: additional keyword arguments, including callbacks, tags, etc.
        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                    - score: the score of the evaluation, if applicable.
                    - value: the string value of the evaluation, if applicable.
                    - reasoning: the reasoning for the evaluation, if applicable.
        """

    async def aevaluate_strings(
@@ -95,10 +47,6 @@ class StringEvaluator(ABC):
            **kwargs: additional keyword arguments, including callbacks, tags, etc.
        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                    - score: the score of the evaluation, if applicable.
                    - value: the string value of the evaluation, if applicable.
                    - reasoning: the reasoning for the evaluation, if applicable.
        """
        raise NotImplementedError(
            f"{self.__class__.__name__} hasn't implemented an "
@@ -106,114 +54,8 @@ class StringEvaluator(ABC):
        )
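
# --- Editor's note: illustrative sketch, not part of this diff. ---
# Because StringEvaluator becomes a runtime-checkable Protocol in this change,
# any object exposing evaluation_name, requires_reference, and evaluate_strings
# satisfies isinstance(obj, StringEvaluator). The exact-match evaluator below
# is hypothetical; its keyword-only signature is inferred from the docstrings
# above, and it shows the recommended result keys (score/value/reasoning).
from typing import Any, Optional


class ExactMatchStringEvaluator:
    @property
    def evaluation_name(self) -> str:
        return "exact_match"

    @property
    def requires_reference(self) -> bool:
        return True

    def evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        # Score 1.0 when the prediction matches the reference exactly.
        score = 1.0 if prediction == reference else 0.0
        return {
            "score": score,
            "value": prediction,
            "reasoning": "exact string comparison against the reference",
        }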


class MessageEvaluator(ABC):
    """Protocol for evaluating messages."""

    @property
    def evaluation_name(self) -> str:
        raise NotImplementedError()

    @property
    def requires_reference(self) -> bool:
        return False

    @abstractmethod
    def evaluate_messages(
        self,
        *,
        prediction: BaseMessage,
        reference: Optional[BaseMessage] = None,
        input: Optional[List[BaseMessage]] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (BaseMessage): the prediction to evaluate.
            reference (Optional[BaseMessage], optional): the reference label
                to evaluate against.
            input (Optional[List[BaseMessage]], optional): the input to consider during
                evaluation
            **kwargs: additional keyword arguments, including callbacks, tags, etc.
        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                    - score: the score of the evaluation, if applicable.
                    - value: the string value of the evaluation, if applicable.
                    - reasoning: the reasoning for the evaluation, if applicable.
        """

    async def aevaluate_messages(
        self,
        *,
        prediction: BaseMessage,
        reference: Optional[BaseMessage] = None,
        input: Optional[List[BaseMessage]] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output, based on optional
        input and label.

        Args:
            prediction (BaseMessage): the prediction to evaluate.
            reference (Optional[BaseMessage], optional): the reference label
                to evaluate against.
            input (Optional[List[BaseMessage]], optional): the input to consider during
                evaluation
            **kwargs: additional keyword arguments, including callbacks, tags, etc.
        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                    - score: the score of the evaluation, if applicable.
                    - value: the string value of the evaluation, if applicable.
                    - reasoning: the reasoning for the evaluation, if applicable.
        """
        raise NotImplementedError(
            f"{self.__class__.__name__} hasn't implemented an "
            "async aevaluate_messages method."
        )


# TODO(agola11): move this out of schema
class SimpleMessageEvaluator(MessageEvaluator):
    """Simple implementation of MessageEvaluator that delegates to a StringEvaluator."""

    def __init__(self, string_evaluator: StringEvaluator):
        self.string_evaluator = string_evaluator

    def evaluate_messages(
        self,
        *,
        prediction: BaseMessage,
        reference: Optional[BaseMessage] = None,
        input: Optional[List[BaseMessage]] = None,
        **kwargs: Any,
    ) -> dict:
        return self.string_evaluator.evaluate_strings(
            prediction=get_buffer_string([prediction]),
            reference=get_buffer_string([reference]) if reference else None,
            input=get_buffer_string(input) if input else None,
            **kwargs,
        )

    async def aevaluate_messages(
        self,
        *,
        prediction: BaseMessage,
        reference: Optional[BaseMessage] = None,
        input: Optional[List[BaseMessage]] = None,
        **kwargs: Any,
    ) -> dict:
        return await self.string_evaluator.aevaluate_strings(
            prediction=get_buffer_string([prediction]),
            reference=get_buffer_string([reference]) if reference else None,
            input=get_buffer_string(input) if input else None,
            **kwargs,
        )
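
# --- Editor's note: illustrative usage sketch, not part of this diff. ---
# SimpleMessageEvaluator adapts any StringEvaluator to chat messages by
# rendering them with get_buffer_string (e.g. "Human: ..." / "AI: ...").
# ExactMatchStringEvaluator is the hypothetical evaluator sketched earlier.
from langchain.schema import AIMessage, HumanMessage

message_evaluator = SimpleMessageEvaluator(ExactMatchStringEvaluator())
result = message_evaluator.evaluate_messages(
    prediction=AIMessage(content="Paris"),
    reference=AIMessage(content="Paris"),
    input=[HumanMessage(content="What is the capital of France?")],
)
# Both messages render as "AI: Paris", so the exact-match score is 1.0.
assert result["score"] == 1.0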


class PairwiseStringEvaluator(ABC):
@runtime_checkable
class PairwiseStringEvaluator(Protocol):
    """A protocol for comparing the output of two models."""

    @abstractmethod
@@ -244,7 +86,6 @@ class PairwiseStringEvaluator(ABC):

    async def aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,

@@ -168,7 +168,7 @@ def _message_from_dict(message: dict) -> BaseMessage:
    elif _type == "chat":
        return ChatMessage(**message["data"])
    else:
        raise ValueError(f"Got unexpected message type: {_type}")
        raise ValueError(f"Got unexpected type: {_type}")


def messages_from_dict(messages: List[dict]) -> List[BaseMessage]:

@@ -0,0 +1,98 @@
from typing import Tuple

import numpy as np
import pytest

from langchain.evaluation.comparison.embedding import (
    EmbeddingDistance,
    PairwiseEmbeddingStringEvalChain,
)


@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
    """Create two random vectors."""
    np.random.seed(0)
    vector_a = np.random.rand(10)
    vector_b = np.random.rand(10)
    return vector_a, vector_b


@pytest.fixture
def chain() -> PairwiseEmbeddingStringEvalChain:
    """Create a PairwiseEmbeddingStringEvalChain."""
    return PairwiseEmbeddingStringEvalChain()


@pytest.mark.requires("scipy")
def test_cosine_similarity(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the cosine similarity."""
    chain.distance_metric = EmbeddingDistance.COSINE
    result = chain._compute_score(np.array(vectors))
    expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
        np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
    )
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_euclidean_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the euclidean distance."""
    from scipy.spatial.distance import euclidean

    chain.distance_metric = EmbeddingDistance.EUCLIDEAN
    result = chain._compute_score(np.array(vectors))
    expected = euclidean(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_manhattan_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the manhattan distance."""
    from scipy.spatial.distance import cityblock

    chain.distance_metric = EmbeddingDistance.MANHATTAN
    result = chain._compute_score(np.array(vectors))
    expected = cityblock(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_chebyshev_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the chebyshev distance."""
    from scipy.spatial.distance import chebyshev

    chain.distance_metric = EmbeddingDistance.CHEBYSHEV
    result = chain._compute_score(np.array(vectors))
    expected = chebyshev(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_hamming_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the hamming distance."""
    from scipy.spatial.distance import hamming

    chain.distance_metric = EmbeddingDistance.HAMMING
    result = chain._compute_score(np.array(vectors))
    expected = hamming(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingStringEvalChain) -> None:
    """Test the embedding distance."""
    result = chain.evaluate_string_pairs(
        prediction="A single cat", prediction_b="A single cat"
    )
    assert np.isclose(result["score"], 0.0)
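
# --- Editor's note: rough sketch of what the tests above exercise, not a
# definitive API. PairwiseEmbeddingStringEvalChain (introduced by this change)
# embeds both strings and reports a distance, so identical strings score ~0.0
# and unrelated strings score higher. The embedding backend appears to need
# the openai/tiktoken extras per the marker above; the default distance_metric
# is not shown in this diff, so it is set explicitly here.
from langchain.evaluation.comparison.embedding import (
    EmbeddingDistance,
    PairwiseEmbeddingStringEvalChain,
)

chain = PairwiseEmbeddingStringEvalChain()
chain.distance_metric = EmbeddingDistance.COSINE
result = chain.evaluate_string_pairs(
    prediction="A single cat",
    prediction_b="Two dogs chasing a ball",
)
print(result["score"])  # larger score = embeddings are further apart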

@@ -1,15 +1,13 @@
"""Test agent trajectory evaluation chain."""

from typing import Any, Dict, List, Optional, Tuple
from typing import List, Tuple

import pytest
from pydantic import Field

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
from langchain.schema import AgentAction, BaseMessage
from langchain.schema import AgentAction
from langchain.tools.base import tool
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.fixture
@@ -32,31 +30,10 @@ def foo(bar: str) -> str:
    return bar


class _FakeTrajectoryChatModel(FakeChatModel):
    queries: Dict = Field(default_factory=dict)
    sequential_responses: Optional[bool] = False
    response_index: int = 0

    def _call(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        if self.sequential_responses:
            response = self.queries[list(self.queries.keys())[self.response_index]]
            self.response_index = self.response_index + 1
            return response
        else:
            prompt = messages[0].content
            return self.queries[prompt]


def test_trajectory_eval_chain(
    intermediate_steps: List[Tuple[AgentAction, str]]
) -> None:
    llm = _FakeTrajectoryChatModel(
    llm = FakeLLM(
        queries={
            "a": "Trajectory good\nScore: 5",
            "b": "Trajectory not good\nScore: 1",
@@ -84,7 +61,7 @@ def test_trajectory_eval_chain(
def test_trajectory_eval_chain_no_tools(
    intermediate_steps: List[Tuple[AgentAction, str]]
) -> None:
    llm = _FakeTrajectoryChatModel(
    llm = FakeLLM(
        queries={
            "a": "Trajectory good\nScore: 5",
            "b": "Trajectory not good\nScore: 1",
@@ -108,7 +85,7 @@ def test_trajectory_eval_chain_no_tools(
def test_old_api_works(intermediate_steps: List[Tuple[AgentAction, str]]) -> None:
    llm = _FakeTrajectoryChatModel(
    llm = FakeLLM(
        queries={
            "a": "Trajectory good\nScore: 5",
            "b": "Trajectory not good\nScore: 1",

@@ -32,4 +32,4 @@ def test_criteria_eval_chain() -> None:


def test_implements_string_protocol() -> None:
    assert issubclass(CriteriaEvalChain, StringEvaluator)
    assert isinstance(CriteriaEvalChain, StringEvaluator)

@@ -52,7 +52,7 @@ def test_context_eval_chain(chain_cls: Type[ContextQAEvalChain]) -> None:
def test_implements_string_evaluator_protocol(
    chain_cls: Type[LLMChain],
) -> None:
    assert issubclass(chain_cls, StringEvaluator)
    assert isinstance(chain_cls, StringEvaluator)


@pytest.mark.parametrize("chain_cls", [QAEvalChain, ContextQAEvalChain, CotQAEvalChain])

@@ -1,114 +0,0 @@
"""Test the loading function for evaluators."""

from unittest.mock import MagicMock

import pytest

from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.run_evaluators.string_run_evaluator import (
    StringRunEvaluatorChain,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.chains.test_base import FakeChain
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
    """Test loading evaluators."""
    fake_llm = FakeLLM(
        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
    )
    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
    kwargs = {}
    if evaluator.requires_reference:
        kwargs["reference_key"] = "generations"
    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
        model, evaluator, **kwargs
    )
    callback = RunCollectorCallbackHandler()
    model.predict("Foo input", callbacks=[callback])
    run = callback.traced_runs[0]
    example = MagicMock()
    example.inputs = {}
    example.outputs = {"generations": "Foo output"}
    result = run_evaluator._prepare_input({"run": run, "example": example})
    assert result["input"] == "Foo input"
    assert result["prediction"] == "Foo output"
    if evaluator.requires_reference:
        assert "reference" in result
        assert result["reference"] == "Foo output"


@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
    """Test loading evaluators."""
    fake_llm = FakeLLM(
        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
    )
    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    model = FakeChatModel()
    kwargs = {}
    if evaluator.requires_reference:
        kwargs["reference_key"] = "generations"
    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
        model, evaluator, **kwargs
    )
    callback = RunCollectorCallbackHandler()
    model.predict("Foo input", callbacks=[callback])
    run = callback.traced_runs[0]
    example = MagicMock()
    example.inputs = {}
    example.outputs = {"generations": "Another fake response"}
    result = run_evaluator._prepare_input({"run": run, "example": example})
    assert result["input"] == "Human: Foo input"
    assert result["prediction"] == "fake response"
    if evaluator.requires_reference:
        assert "reference" in result
        assert result["reference"] == "Another fake response"


@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
    model = FakeChain(
        the_input_keys=["an_input", "another_input"],
    )
    fake_llm = FakeChatModel()
    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    # No input key
    with pytest.raises(ValueError, match="multiple input keys"):
        StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
    with pytest.raises(ValueError, match="does not have specified"):
        StringRunEvaluatorChain.from_model_and_evaluator(
            model, evaluator, input_key="some_input"
        )
    kwargs = {}
    if evaluator.requires_reference:
        kwargs["reference_key"] = "label_column"
    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
        model, evaluator, input_key="an_input", **kwargs
    )
    callback = RunCollectorCallbackHandler()
    model(
        {"an_input": "Foo input", "another_input": "Another fake response"},
        callbacks=[callback],
    )
    run = callback.traced_runs[0]
    example = MagicMock()
    example.inputs = {}
    example.outputs = {"label_column": "Another fake response"}
    result = run_evaluator._prepare_input({"run": run, "example": example})
    assert result["input"] == "Foo input"
    assert result["prediction"] == "baz"
    if evaluator.requires_reference:
        assert "reference" in result
        assert result["reference"] == "Another fake response"
@@ -1,31 +0,0 @@
"""Test the loading function for evaluators."""

import pytest

from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.mark.parametrize("evaluator_type", EvaluatorType)
def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
    """Test loading evaluators."""
    fake_llm = FakeChatModel()
    load_evaluators([evaluator_type], llm=fake_llm)

    # Test as string
    load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore


def test_criteria_eval_chain_requires_reference() -> None:
    """Test loading evaluators."""
    fake_llm = FakeLLM(
        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
    )
    evaluator = load_evaluators(
        [EvaluatorType.CRITERIA], llm=fake_llm, requires_reference=True
    )[0]
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    assert evaluator.requires_reference