Compare commits

...

1 Commit

Author: William Fu-Hinthorn
SHA1: f3f89e0535
Message: Add fuzzy and embedding metrics
Date: 2023-07-03 23:18:07 -07:00
3 changed files with 415 additions and 0 deletions


@@ -0,0 +1,188 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator
from langchain.math_utils import cosine_similarity
class EmbeddingDistance(str, Enum):
COSINE = "cosine"
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"
CHEBYSHEV = "chebyshev"
HAMMING = "hamming"
class PairwiseEmbeddingStringEvalChain(Chain, PairwiseStringEvaluator):
"""A chain for comparing the output of two models using embeddings."""
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
"""The embedding objects to vectorize the outputs."""
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
"""The distance metric to use for comparing the embeddings."""
    @root_validator(pre=True)
    def _validate_distance_metric(cls, values: dict) -> dict:
        """Normalize the distance metric before field validation.

        Args:
            values (dict): The raw values to validate.

        Returns:
            dict: The values with the metric name lowercased so pydantic can
                coerce it to an ``EmbeddingDistance`` member.
        """
        metric = values.get("distance_metric")
        if isinstance(metric, str):
            values["distance_metric"] = metric.lower()
        return values
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
Args:
metric (str): The metric name.
Returns:
Any: The metric function.
"""
metrics = {
EmbeddingDistance.COSINE: self._cosine_distance,
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
EmbeddingDistance.HAMMING: self._hamming_distance,
}
if metric in metrics:
return metrics[metric]
else:
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.linalg.norm(a - b)
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.sum(np.abs(a - b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.max(np.abs(a - b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
metric = self._get_metric(self.distance_metric)
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
return score
@property
def input_keys(self) -> List[str]:
return ["prediction", "prediction_b"]
@property
def output_keys(self) -> List[str]:
return ["score"]
def _call(
self,
inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
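
For reference, here is a minimal usage sketch (not part of this commit) of the new chain. The FakeEmbeddings class below is a hypothetical stand-in for any Embeddings implementation so the example does not need an OpenAI API key; in practice you could simply keep the default OpenAIEmbeddings.

from typing import List

from langchain.embeddings.base import Embeddings


class FakeEmbeddings(Embeddings):
    """Hypothetical embeddings: a string maps to counts of ten fixed characters."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [[float(text.count(ch)) for ch in "abcdefghij"] for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]


chain = PairwiseEmbeddingStringEvalChain(
    embeddings=FakeEmbeddings(),
    distance_metric=EmbeddingDistance.EUCLIDEAN,
)
result = chain.evaluate_string_pairs(prediction="a cat", prediction_b="a dog")
print(result["score"])  # non-negative distance; 0.0 would mean identical embeddings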


@@ -0,0 +1,129 @@
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator
def _load_rapidfuzz() -> Any:
try:
import rapidfuzz
except ImportError:
raise ImportError(
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
)
return rapidfuzz.distance
class StringDistance(str, Enum):
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
JARO = "jaro"
JARO_WINKLER = "jaro_winkler"
class FuzzyMatchStringEvaluator(Chain, PairwiseStringEvaluator):
    """A chain for comparing two predictions using fuzzy string distances."""

    metric: Callable
    """The rapidfuzz distance function used to compare the two strings."""

    def __init__(
        self,
        distance: str = StringDistance.DAMERAU_LEVENSHTEIN,
        **kwargs: Any,
    ) -> None:
        super().__init__(metric=self._get_metric(distance), **kwargs)

    @property
    def input_keys(self) -> List[str]:
        return ["prediction", "prediction_b"]

    @property
    def output_keys(self) -> List[str]:
        return ["score"]

    @staticmethod
    def _get_metric(distance: str) -> Callable:
        rf_distance = _load_rapidfuzz()
        if distance == StringDistance.DAMERAU_LEVENSHTEIN:
            return rf_distance.DamerauLevenshtein.distance
        elif distance == StringDistance.LEVENSHTEIN:
            return rf_distance.Levenshtein.distance
        elif distance == StringDistance.JARO:
            return rf_distance.Jaro.distance
        elif distance == StringDistance.JARO_WINKLER:
            return rf_distance.JaroWinkler.distance
        else:
            raise ValueError(f"Invalid distance metric: {distance}")
def _call(
self,
inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
async def _acall(
self,
inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
        Returns:
            dict: A dictionary containing:
                - score: The string distance between the two predictions.
        """
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
        Returns:
            dict: A dictionary containing:
                - score: The string distance between the two predictions.
        """
        return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
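
Similarly, a minimal usage sketch (not part of this commit) for the fuzzy evaluator, assuming rapidfuzz is installed:

evaluator = FuzzyMatchStringEvaluator()  # defaults to Damerau-Levenshtein distance
result = evaluator.evaluate_string_pairs(
    prediction="The cat sat on the mat",
    prediction_b="The cat sat on a mat",
)
print(result["score"])  # raw edit distance; 0 means the strings are identical

# Any member of StringDistance can be selected at construction time.
jw = FuzzyMatchStringEvaluator(distance=StringDistance.JARO_WINKLER)
print(jw.evaluate_string_pairs(prediction="kitten", prediction_b="sitting")["score"])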


@@ -0,0 +1,98 @@
from typing import Tuple
import numpy as np
import pytest
from langchain.evaluation.comparison.embedding import (
EmbeddingDistance,
PairwiseEmbeddingStringEvalChain,
)
@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
"""Create two random vectors."""
np.random.seed(0)
vector_a = np.random.rand(10)
vector_b = np.random.rand(10)
return vector_a, vector_b
@pytest.fixture
def chain() -> PairwiseEmbeddingStringEvalChain:
"""Create a PairwiseEmbeddingStringEvalChain."""
return PairwiseEmbeddingStringEvalChain()
@pytest.mark.requires("scipy")
def test_cosine_similarity(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the cosine similarity."""
chain.distance_metric = EmbeddingDistance.COSINE
result = chain._compute_score(np.array(vectors))
expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_euclidean_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the euclidean distance."""
from scipy.spatial.distance import euclidean
chain.distance_metric = EmbeddingDistance.EUCLIDEAN
result = chain._compute_score(np.array(vectors))
expected = euclidean(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_manhattan_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the manhattan distance."""
from scipy.spatial.distance import cityblock
chain.distance_metric = EmbeddingDistance.MANHATTAN
result = chain._compute_score(np.array(vectors))
expected = cityblock(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_chebyshev_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the chebyshev distance."""
from scipy.spatial.distance import chebyshev
chain.distance_metric = EmbeddingDistance.CHEBYSHEV
result = chain._compute_score(np.array(vectors))
expected = chebyshev(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_hamming_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the hamming distance."""
from scipy.spatial.distance import hamming
chain.distance_metric = EmbeddingDistance.HAMMING
result = chain._compute_score(np.array(vectors))
expected = hamming(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingStringEvalChain) -> None:
"""Test the embedding distance."""
result = chain.evaluate_string_pairs(
prediction="A single cat", prediction_b="A single cat"
)
assert np.isclose(result["score"], 0.0)