Compare commits

1 Commit

Author: William Fu-Hinthorn
SHA1: 75fd543b33
Message: Warn if reference passed but evaluator doesn't require it
Date: 2023-07-03 16:44:22 -07:00
5 changed files with 38 additions and 420 deletions
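For orientation, here is a minimal sketch of the behavior this change introduces. Only `requires_reference`, `evaluate_string_pairs`, the ValueError, and the one-time warning come from the diff below; the import paths and the ChatOpenAI model are assumptions for illustration.

# Minimal sketch (illustrative only): import paths and model choice are assumed.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.comparison import PairwiseStringEvalChain

llm = ChatOpenAI(temperature=0)

# Built without requires_reference=True: a reference passed at call time is
# ignored, and warn_once() logs a single warning instead of failing silently.
chain = PairwiseStringEvalChain.from_llm(llm=llm)
chain.evaluate_string_pairs(
    prediction="4",
    prediction_b="four",
    input="What is 2 + 2?",
    reference="4",  # triggers the new one-time warning
)

# Built with requires_reference=True: omitting the reference now raises a
# ValueError because the prompt declares "reference" as an input variable.
ref_chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)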

View File

@@ -1,188 +0,0 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator
from langchain.math_utils import cosine_similarity
class EmbeddingDistance(str, Enum):
"""Supported embedding distance metrics."""
COSINE = "cosine"
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"
CHEBYSHEV = "chebyshev"
HAMMING = "hamming"
class PairwiseEmbeddingStringEvalChain(Chain, PairwiseStringEvaluator):
"""A chain for comparing the output of two models using embeddings."""
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
"""The embedding objects to vectorize the outputs."""
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
"""The distance metric to use for comparing the embeddings."""
@root_validator
def _validate_distance_metric(cls, values: dict) -> dict:
"""Validate the distance metric.
Args:
values (dict): The values to validate.
Returns:
dict: The validated values.
"""
values["distance_metric"] = values["distance_metric"].lower()
return values
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
Args:
metric (str): The metric name.
Returns:
Any: The metric function.
"""
metrics = {
EmbeddingDistance.COSINE: self._cosine_distance,
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
EmbeddingDistance.HAMMING: self._hamming_distance,
}
if metric in metrics:
return metrics[metric]
else:
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.linalg.norm(a - b)
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.sum(np.abs(a - b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.max(np.abs(a - b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
metric = self._get_metric(self.distance_metric)
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
return score
@property
def input_keys(self) -> List[str]:
return ["prediction", "prediction_b"]
@property
def output_keys(self) -> List[str]:
return ["score"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
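The static distance helpers in the deleted chain above are plain NumPy; the following standalone sketch reproduces the same computations on two small example vectors (values chosen for illustration):

import numpy as np

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])

# Mirrors the chain's static helpers, one value per metric.
cosine = 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))  # 0.5
euclidean = np.linalg.norm(a - b)   # sqrt(2) ~= 1.414
manhattan = np.sum(np.abs(a - b))   # 2.0
chebyshev = np.max(np.abs(a - b))   # 1.0
hamming = np.mean(a != b)           # 2/3, fraction of differing positions

print(cosine, euclidean, manhattan, chebyshev, hamming)

Note that the chain itself operates on 2-D (1, n) arrays and uses langchain.math_utils.cosine_similarity for the cosine case; the 1-D formulation here matches the expected values used in the unit tests further down.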

View File

@@ -1,6 +1,8 @@
"""Base classes for comparing the output of two models."""
from __future__ import annotations
import logging
from functools import lru_cache
from typing import Any, Optional
from pydantic import Field
@@ -12,6 +14,13 @@ from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BaseOutputParser
logger = logging.getLogger(__name__)
@lru_cache(maxsize=1)
def warn_once(message: str) -> None:
"""Log a warning, suppressing immediate repeats of the same message."""
logger.warning(message)
class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the PairwiseStringEvalChain."""
@@ -86,7 +95,7 @@ class PairwiseStringEvalChain(LLMChain):
*,
llm: BaseLanguageModel,
prompt: Optional[PromptTemplate] = None,
require_reference: bool = False,
requires_reference: bool = False,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
@@ -94,7 +103,7 @@ class PairwiseStringEvalChain(LLMChain):
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
require_reference (bool, optional): Whether to require a reference
requires_reference (bool, optional): Whether to require a reference
string. Defaults to False.
**kwargs (Any): Additional keyword arguments.
@@ -103,13 +112,13 @@ class PairwiseStringEvalChain(LLMChain):
"""
expected_input_vars = {"prediction", "prediction_b", "input"}
if prompt is None:
if require_reference:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = PROMPT_WITH_REFERENCE
else:
prompt_ = PROMPT
else:
if require_reference:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = prompt
@@ -128,8 +137,18 @@ class PairwiseStringEvalChain(LLMChain):
"prediction_b": prediction_b,
"input": input,
}
if reference is not None and "reference" in self.prompt.input_variables:
if "reference" in self.prompt.input_variables:
if reference is None:
raise ValueError(
"Prompt requires a reference string, but none was provided."
)
input_["reference"] = reference
elif reference is not None:
warn_once(
"Ignoring reference string in PairwiseStringEvalChain."
" To use references, initialize with argument `requires_reference=True`"
' or use a prompt that includes "reference" as an input variable.'
)
return input_
def evaluate_string_pairs(

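The warn_once helper added above is simply functools.lru_cache wrapped around logger.warning; a self-contained sketch of the pattern (logger name and configuration are illustrative):

import logging
from functools import lru_cache

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("warn_once_demo")

@lru_cache(maxsize=1)
def warn_once(message: str) -> None:
    # A repeated call with the same message hits the cache and is not re-logged.
    logger.warning(message)

warn_once("Ignoring reference string.")  # logged
warn_once("Ignoring reference string.")  # suppressed by the cache

With maxsize=1 only the most recent message is remembered, so alternating distinct messages would each be logged again; that is sufficient here because each chain emits a single fixed warning message.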
View File

@@ -1,129 +0,0 @@
from enum import Enum
from typing import Any, Callable, Dict, Optional
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator
def _load_rapidfuzz() -> Any:
try:
import rapidfuzz
except ImportError:
raise ImportError(
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
)
return rapidfuzz.distance
class StringDistance(str, Enum):
"""Supported string distance metrics (backed by rapidfuzz)."""
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
JARO = "jaro"
JARO_WINKLER = "jaro_winkler"
class FuzzyMatchStringEvaluator(Chain, PairwiseStringEvaluator):
"""Compute a string edit distance between two model outputs using rapidfuzz."""
def __init__(
self,
distance: str = StringDistance.DAMERAU_LEVENSHTEIN,
) -> None:
self.metric = self._get_metric(distance)
@staticmethod
def _get_metric(distance: str) -> Callable:
rf_distance = _load_rapidfuzz()
if distance == StringDistance.DAMERAU_LEVENSHTEIN:
return rf_distance.DamerauLevenshtein.distance
elif distance == StringDistance.LEVENSHTEIN:
return rf_distance.Levenshtein.distance
elif distance == StringDistance.JARO:
return rf_distance.Jaro.distance
elif distance == StringDistance.JARO_WINKLER:
return rf_distance.JaroWinkler.distance
else:
raise ValueError(f"Invalid distance metric: {distance}")
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The string distance between the two predictions,
as computed by the selected rapidfuzz metric.
"""
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The string distance between the two predictions,
as computed by the selected rapidfuzz metric.
"""
return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
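The evaluator above dispatches to rapidfuzz's distance module; a direct sketch of the underlying calls (example strings are illustrative, and rapidfuzz must be installed):

from rapidfuzz.distance import DamerauLevenshtein, JaroWinkler, Levenshtein

print(Levenshtein.distance("kitten", "sitting"))          # 3 edits
print(DamerauLevenshtein.distance("acb", "abc"))          # 1, counts the transposition
print(JaroWinkler.distance("prediction", "predictions"))  # small float in [0, 1]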

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
import logging
from functools import lru_cache
from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
from pydantic import Field
@@ -10,6 +12,8 @@ from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
from langchain.schema import BaseOutputParser, BasePromptTemplate
logger = logging.getLogger(__name__)
_SUPPORTED_CRITERIA = {
"conciseness": "Is the submission concise and to the point?",
"relevance": "Is the submission referring to a real quote from the text?",
@@ -25,6 +29,11 @@ _SUPPORTED_CRITERIA = {
}
@lru_cache(maxsize=1)
def warn_once(message: str) -> None:
"""Log a warning, suppressing immediate repeats of the same message."""
logger.warning(message)
class CriteriaResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the CriteriaEvalChain."""
@@ -250,6 +259,11 @@ class CriteriaEvalChain(LLMChain):
}
if self.requires_reference:
input_["reference"] = reference
elif reference is not None:
warn_once(
"The reference text will be ignored because this Criteria evaluator"
" does not require a reference."
)
return input_
def evaluate_strings(

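The criteria evaluator receives the same guard; a hedged usage sketch follows (the import path and ChatOpenAI model are assumptions, the "conciseness" criterion comes from _SUPPORTED_CRITERIA above, and the call signature follows the evaluate_strings method shown in this diff):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.criteria import CriteriaEvalChain

chain = CriteriaEvalChain.from_llm(llm=ChatOpenAI(temperature=0), criteria="conciseness")

# requires_reference defaults to False, so the reference below is ignored and
# warn_once() logs a single warning rather than silently dropping it.
chain.evaluate_strings(
    prediction="The answer is 4.",
    input="What is 2 + 2?",
    reference="4",
)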
View File

@@ -1,98 +0,0 @@
from typing import Tuple
import numpy as np
import pytest
from langchain.evaluation.comparison.embedding import (
EmbeddingDistance,
PairwiseEmbeddingStringEvalChain,
)
@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
"""Create two random vectors."""
np.random.seed(0)
vector_a = np.random.rand(10)
vector_b = np.random.rand(10)
return vector_a, vector_b
@pytest.fixture
def chain() -> PairwiseEmbeddingStringEvalChain:
"""Create a PairwiseEmbeddingStringEvalChain."""
return PairwiseEmbeddingStringEvalChain()
@pytest.mark.requires("scipy")
def test_cosine_similarity(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the cosine similarity."""
chain.distance_metric = EmbeddingDistance.COSINE
result = chain._compute_score(np.array(vectors))
expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_euclidean_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the euclidean distance."""
from scipy.spatial.distance import euclidean
chain.distance_metric = EmbeddingDistance.EUCLIDEAN
result = chain._compute_score(np.array(vectors))
expected = euclidean(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_manhattan_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the manhattan distance."""
from scipy.spatial.distance import cityblock
chain.distance_metric = EmbeddingDistance.MANHATTAN
result = chain._compute_score(np.array(vectors))
expected = cityblock(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_chebyshev_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the chebyshev distance."""
from scipy.spatial.distance import chebyshev
chain.distance_metric = EmbeddingDistance.CHEBYSHEV
result = chain._compute_score(np.array(vectors))
expected = chebyshev(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_hamming_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the hamming distance."""
from scipy.spatial.distance import hamming
chain.distance_metric = EmbeddingDistance.HAMMING
result = chain._compute_score(np.array(vectors))
expected = hamming(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingStringEvalChain) -> None:
"""Test the embedding distance."""
result = chain.evaluate_string_pairs(
prediction="A single cat", prediction_b="A single cat"
)
assert np.isclose(result["score"], 0.0)