Compare commits

...

1 Commit

Author: William Fu-Hinthorn
SHA1: f3f89e0535
Message: Add fuzzy and embedding metrics
Date: 2023-07-03 23:18:07 -07:00
3 changed files with 415 additions and 0 deletions


@@ -0,0 +1,188 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator
from langchain.math_utils import cosine_similarity
class EmbeddingDistance(str, Enum):
COSINE = "cosine"
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"
CHEBYSHEV = "chebyshev"
HAMMING = "hamming"
class PairwiseEmbeddingStringEvalChain(Chain, PairwiseStringEvaluator):
"""A chain for comparing the output of two models using embeddings."""
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
"""The embedding objects to vectorize the outputs."""
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
"""The distance metric to use for comparing the embeddings."""
    @root_validator(pre=True)
    def _validate_distance_metric(cls, values: dict) -> dict:
        """Normalize the distance metric before field validation.

        Args:
            values (dict): The raw values to validate.

        Returns:
            dict: The values with the metric name lowercased so pydantic can
                coerce it to an ``EmbeddingDistance`` member.
        """
        metric = values.get("distance_metric")
        if isinstance(metric, str):
            values["distance_metric"] = metric.lower()
        return values
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
Args:
metric (str): The metric name.
Returns:
Any: The metric function.
"""
metrics = {
EmbeddingDistance.COSINE: self._cosine_distance,
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
EmbeddingDistance.HAMMING: self._hamming_distance,
}
if metric in metrics:
return metrics[metric]
else:
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.linalg.norm(a - b)
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.sum(np.abs(a - b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.max(np.abs(a - b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
metric = self._get_metric(self.distance_metric)
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
return score
@property
def input_keys(self) -> List[str]:
return ["prediction", "prediction_b"]
@property
def output_keys(self) -> List[str]:
return ["score"]
def _call(
self,
inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
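
For reference, here is a minimal usage sketch (not part of this commit) of the new chain. The FakeEmbeddings class below is a hypothetical stand-in for any Embeddings implementation so the example does not need an OpenAI API key; in practice you could simply keep the default OpenAIEmbeddings.

from typing import List

from langchain.embeddings.base import Embeddings


class FakeEmbeddings(Embeddings):
    """Hypothetical embeddings: a string maps to counts of ten fixed characters."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [[float(text.count(ch)) for ch in "abcdefghij"] for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]


chain = PairwiseEmbeddingStringEvalChain(
    embeddings=FakeEmbeddings(),
    distance_metric=EmbeddingDistance.EUCLIDEAN,
)
result = chain.evaluate_string_pairs(prediction="a cat", prediction_b="a dog")
print(result["score"])  # non-negative distance; 0.0 would mean identical embeddings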


@@ -0,0 +1,129 @@
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator
def _load_rapidfuzz() -> Any:
try:
import rapidfuzz
except ImportError:
raise ImportError(
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
)
return rapidfuzz.distance
class StringDistance(str, Enum):
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
JARO = "jaro"
JARO_WINKLER = "jaro_winkler"
class FuzzyMatchStringEvaluator(Chain, PairwiseStringEvaluator):
    """A chain for comparing two predictions using fuzzy string distances."""

    metric: Callable
    """The rapidfuzz distance function used to compare the two strings."""

    def __init__(
        self,
        distance: str = StringDistance.DAMERAU_LEVENSHTEIN,
        **kwargs: Any,
    ) -> None:
        super().__init__(metric=self._get_metric(distance), **kwargs)

    @property
    def input_keys(self) -> List[str]:
        return ["prediction", "prediction_b"]

    @property
    def output_keys(self) -> List[str]:
        return ["score"]

    @staticmethod
    def _get_metric(distance: str) -> Callable:
        rf_distance = _load_rapidfuzz()
        if distance == StringDistance.DAMERAU_LEVENSHTEIN:
            return rf_distance.DamerauLevenshtein.distance
        elif distance == StringDistance.LEVENSHTEIN:
            return rf_distance.Levenshtein.distance
        elif distance == StringDistance.JARO:
            return rf_distance.Jaro.distance
        elif distance == StringDistance.JARO_WINKLER:
            return rf_distance.JaroWinkler.distance
        else:
            raise ValueError(f"Invalid distance metric: {distance}")
def _call(
self,
inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
async def _acall(
self,
inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
        Returns:
            dict: A dictionary containing:
                - score: The string distance between the two predictions.
        """
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
        Returns:
            dict: A dictionary containing:
                - score: The string distance between the two predictions.
        """
        return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
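
Similarly, a minimal usage sketch (not part of this commit) for the fuzzy evaluator, assuming rapidfuzz is installed:

evaluator = FuzzyMatchStringEvaluator()  # defaults to Damerau-Levenshtein distance
result = evaluator.evaluate_string_pairs(
    prediction="The cat sat on the mat",
    prediction_b="The cat sat on a mat",
)
print(result["score"])  # raw edit distance; 0 means the strings are identical

# Any member of StringDistance can be selected at construction time.
jw = FuzzyMatchStringEvaluator(distance=StringDistance.JARO_WINKLER)
print(jw.evaluate_string_pairs(prediction="kitten", prediction_b="sitting")["score"])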


@@ -0,0 +1,98 @@
from typing import Tuple
import numpy as np
import pytest
from langchain.evaluation.comparison.embedding import (
EmbeddingDistance,
PairwiseEmbeddingStringEvalChain,
)
@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
"""Create two random vectors."""
np.random.seed(0)
vector_a = np.random.rand(10)
vector_b = np.random.rand(10)
return vector_a, vector_b
@pytest.fixture
def chain() -> PairwiseEmbeddingStringEvalChain:
"""Create a PairwiseEmbeddingStringEvalChain."""
return PairwiseEmbeddingStringEvalChain()
@pytest.mark.requires("scipy")
def test_cosine_similarity(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the cosine similarity."""
chain.distance_metric = EmbeddingDistance.COSINE
result = chain._compute_score(np.array(vectors))
expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_euclidean_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the euclidean distance."""
from scipy.spatial.distance import euclidean
chain.distance_metric = EmbeddingDistance.EUCLIDEAN
result = chain._compute_score(np.array(vectors))
expected = euclidean(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_manhattan_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the manhattan distance."""
from scipy.spatial.distance import cityblock
chain.distance_metric = EmbeddingDistance.MANHATTAN
result = chain._compute_score(np.array(vectors))
expected = cityblock(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_chebyshev_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the chebyshev distance."""
from scipy.spatial.distance import chebyshev
chain.distance_metric = EmbeddingDistance.CHEBYSHEV
result = chain._compute_score(np.array(vectors))
expected = chebyshev(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_hamming_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the hamming distance."""
from scipy.spatial.distance import hamming
chain.distance_metric = EmbeddingDistance.HAMMING
result = chain._compute_score(np.array(vectors))
expected = hamming(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingStringEvalChain) -> None:
"""Test the embedding distance."""
result = chain.evaluate_string_pairs(
prediction="A single cat", prediction_b="A single cat"
)
assert np.isclose(result["score"], 0.0)