Compare commits

1 Commit

Author: William Fu-Hinthorn
SHA1: 75fd543b33
Message: Warn if reference passed but evaluator doesn't require it
Date: 2023-07-03 16:44:22 -07:00
5 changed files with 38 additions and 420 deletions
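For orientation, here is a minimal sketch of the behavior this change introduces. Only `requires_reference`, `evaluate_string_pairs`, the ValueError, and the one-time warning come from the diff below; the import paths and the ChatOpenAI model are assumptions for illustration.

# Minimal sketch (illustrative only): import paths and model choice are assumed.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.comparison import PairwiseStringEvalChain

llm = ChatOpenAI(temperature=0)

# Built without requires_reference=True: a reference passed at call time is
# ignored, and warn_once() logs a single warning instead of failing silently.
chain = PairwiseStringEvalChain.from_llm(llm=llm)
chain.evaluate_string_pairs(
    prediction="4",
    prediction_b="four",
    input="What is 2 + 2?",
    reference="4",  # triggers the new one-time warning
)

# Built with requires_reference=True: omitting the reference now raises a
# ValueError because the prompt declares "reference" as an input variable.
ref_chain = PairwiseStringEvalChain.from_llm(llm=llm, requires_reference=True)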

View File

@@ -1,188 +0,0 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional
import numpy as np
from pydantic import Field, root_validator
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator
from langchain.math_utils import cosine_similarity
class EmbeddingDistance(str, Enum):
"""Supported embedding distance metrics."""
COSINE = "cosine"
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"
CHEBYSHEV = "chebyshev"
HAMMING = "hamming"
class PairwiseEmbeddingStringEvalChain(Chain, PairwiseStringEvaluator):
"""A chain for comparing the output of two models using embeddings."""
embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
"""The embedding objects to vectorize the outputs."""
distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
"""The distance metric to use for comparing the embeddings."""
@root_validator
def _validate_distance_metric(cls, values: dict) -> dict:
"""Validate the distance metric.
Args:
values (dict): The values to validate.
Returns:
dict: The validated values.
"""
values["distance_metric"] = values["distance_metric"].lower()
return values
def _get_metric(self, metric: EmbeddingDistance) -> Any:
"""Get the metric function for the given metric name.
Args:
metric (str): The metric name.
Returns:
Any: The metric function.
"""
metrics = {
EmbeddingDistance.COSINE: self._cosine_distance,
EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
EmbeddingDistance.MANHATTAN: self._manhattan_distance,
EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
EmbeddingDistance.HAMMING: self._hamming_distance,
}
if metric in metrics:
return metrics[metric]
else:
raise ValueError(f"Invalid metric: {metric}")
@staticmethod
def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
return 1.0 - cosine_similarity(a, b)
@staticmethod
def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.linalg.norm(a - b)
@staticmethod
def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.sum(np.abs(a - b))
@staticmethod
def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.max(np.abs(a - b))
@staticmethod
def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
return np.mean(a != b)
def _compute_score(self, vectors: np.ndarray) -> float:
metric = self._get_metric(self.distance_metric)
score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
return score
@property
def input_keys(self) -> List[str]:
return ["prediction", "prediction_b"]
@property
def output_keys(self) -> List[str]:
return ["score"]
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
vectors = np.array(
self.embeddings.embed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
)
score = self._compute_score(vectors)
return {"score": score}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
embedded = await self.embeddings.aembed_documents(
[inputs["prediction"], inputs["prediction_b"]]
)
vectors = np.array(embedded)
score = self._compute_score(vectors)
return {"score": score}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the embedding distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the embedding distance
between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The embedding distance between the two
predictions.
"""
return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
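The static distance helpers in the deleted chain above are plain NumPy; the following standalone sketch reproduces the same computations on two small example vectors (values chosen for illustration):

import numpy as np

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])

# Mirrors the chain's static helpers, one value per metric.
cosine = 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))  # 0.5
euclidean = np.linalg.norm(a - b)   # sqrt(2) ~= 1.414
manhattan = np.sum(np.abs(a - b))   # 2.0
chebyshev = np.max(np.abs(a - b))   # 1.0
hamming = np.mean(a != b)           # 2/3, fraction of differing positions

print(cosine, euclidean, manhattan, chebyshev, hamming)

Note that the chain itself operates on 2-D (1, n) arrays and uses langchain.math_utils.cosine_similarity for the cosine case; the 1-D formulation here matches the expected values used in the unit tests further down.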

View File

@@ -1,6 +1,8 @@
"""Base classes for comparing the output of two models."""
from __future__ import annotations
import logging
from functools import lru_cache
from typing import Any, Optional
from pydantic import Field
@@ -12,6 +14,13 @@ from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BaseOutputParser
logger = logging.getLogger(__name__)
@lru_cache(maxsize=1)
def warn_once(message: str) -> None:
"""Log a warning, suppressing immediate repeats of the same message."""
logger.warning(message)
class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the PairwiseStringEvalChain."""
@@ -86,7 +95,7 @@ class PairwiseStringEvalChain(LLMChain):
*,
llm: BaseLanguageModel,
prompt: Optional[PromptTemplate] = None,
require_reference: bool = False,
requires_reference: bool = False,
**kwargs: Any,
) -> PairwiseStringEvalChain:
"""Initialize the PairwiseStringEvalChain from an LLM.
@@ -94,7 +103,7 @@ class PairwiseStringEvalChain(LLMChain):
Args:
llm (BaseLanguageModel): The LLM to use.
prompt (PromptTemplate, optional): The prompt to use.
require_reference (bool, optional): Whether to require a reference
requires_reference (bool, optional): Whether to require a reference
string. Defaults to False.
**kwargs (Any): Additional keyword arguments.
@@ -103,13 +112,13 @@ class PairwiseStringEvalChain(LLMChain):
"""
expected_input_vars = {"prediction", "prediction_b", "input"}
if prompt is None:
if require_reference:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = PROMPT_WITH_REFERENCE
else:
prompt_ = PROMPT
else:
if require_reference:
if requires_reference:
expected_input_vars.add("reference")
prompt_ = prompt
@@ -128,8 +137,18 @@ class PairwiseStringEvalChain(LLMChain):
"prediction_b": prediction_b,
"input": input,
}
if reference is not None and "reference" in self.prompt.input_variables:
if "reference" in self.prompt.input_variables:
if reference is None:
raise ValueError(
"Prompt requires a reference string, but none was provided."
)
input_["reference"] = reference
elif reference is not None:
warn_once(
"Ignoring reference string in PairwiseStringEvalChain."
" To use references, initialize with argument `requires_reference=True`"
' or use a prompt that includes "reference" as an input variable.'
)
return input_
def evaluate_string_pairs(

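The warn_once helper added above is simply functools.lru_cache wrapped around logger.warning; a self-contained sketch of the pattern (logger name and configuration are illustrative):

import logging
from functools import lru_cache

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("warn_once_demo")

@lru_cache(maxsize=1)
def warn_once(message: str) -> None:
    # A repeated call with the same message hits the cache and is not re-logged.
    logger.warning(message)

warn_once("Ignoring reference string.")  # logged
warn_once("Ignoring reference string.")  # suppressed by the cache

With maxsize=1 only the most recent message is remembered, so alternating distinct messages would each be logged again; that is sufficient here because each chain emits a single fixed warning message.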
View File

@@ -1,129 +0,0 @@
from enum import Enum
from typing import Any, Callable, Dict, Optional
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator
def _load_rapidfuzz() -> Any:
try:
import rapidfuzz
except ImportError:
raise ImportError(
"Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
)
return rapidfuzz.distance
class StringDistance(str, Enum):
"""Supported string distance metrics (backed by rapidfuzz)."""
DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
LEVENSHTEIN = "levenshtein"
JARO = "jaro"
JARO_WINKLER = "jaro_winkler"
class FuzzyMatchStringEvaluator(Chain, PairwiseStringEvaluator):
"""Compute a string edit distance between two model outputs using rapidfuzz."""
def __init__(
self,
distance: str = StringDistance.DAMERAU_LEVENSHTEIN,
) -> None:
self.metric = self._get_metric(distance)
@staticmethod
def _get_metric(distance: str) -> Callable:
rf_distance = _load_rapidfuzz()
if distance == StringDistance.DAMERAU_LEVENSHTEIN:
return rf_distance.DamerauLevenshtein.distance
elif distance == StringDistance.LEVENSHTEIN:
return rf_distance.Levenshtein.distance
elif distance == StringDistance.JARO:
return rf_distance.Jaro.distance
elif distance == StringDistance.JARO_WINKLER:
return rf_distance.JaroWinkler.distance
else:
raise ValueError(f"Invalid distance metric: {distance}")
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
async def _acall(
self,
inputs: Dict[str, Any],
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}
def evaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The string distance between the two predictions,
as computed by the selected rapidfuzz metric.
"""
return self(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
async def aevaluate_string_pairs(
self,
*,
prediction: str,
prediction_b: str,
input: Optional[str] = None,
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate the string distance between two predictions.
Args:
prediction (str): The output string from the first model.
prediction_b (str): The output string from the second model.
input (str): The input or task string.
callbacks (Callbacks, optional): The callbacks to use.
reference (str, optional): The reference string, if any.
**kwargs (Any): Additional keyword arguments.
Returns:
dict: A dictionary containing:
- score: The string distance between the two predictions,
as computed by the selected rapidfuzz metric.
"""
return await self.acall(
inputs={"prediction": prediction, "prediction_b": prediction_b},
callbacks=callbacks,
)
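The evaluator above dispatches to rapidfuzz's distance module; a direct sketch of the underlying calls (example strings are illustrative, and rapidfuzz must be installed):

from rapidfuzz.distance import DamerauLevenshtein, JaroWinkler, Levenshtein

print(Levenshtein.distance("kitten", "sitting"))          # 3 edits
print(DamerauLevenshtein.distance("acb", "abc"))          # 1, counts the transposition
print(JaroWinkler.distance("prediction", "predictions"))  # small float in [0, 1]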

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
import logging
from functools import lru_cache
from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
from pydantic import Field
@@ -10,6 +12,8 @@ from langchain.chains.llm import LLMChain
from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
from langchain.schema import BaseOutputParser, BasePromptTemplate
logger = logging.getLogger(__name__)
_SUPPORTED_CRITERIA = {
"conciseness": "Is the submission concise and to the point?",
"relevance": "Is the submission referring to a real quote from the text?",
@@ -25,6 +29,11 @@ _SUPPORTED_CRITERIA = {
}
@lru_cache(maxsize=1)
def warn_once(message: str) -> None:
"""Log a warning, suppressing immediate repeats of the same message."""
logger.warning(message)
class CriteriaResultOutputParser(BaseOutputParser[dict]):
"""A parser for the output of the CriteriaEvalChain."""
@@ -250,6 +259,11 @@ class CriteriaEvalChain(LLMChain):
}
if self.requires_reference:
input_["reference"] = reference
elif reference is not None:
warn_once(
"The reference text will be ignored because this Criteria evaluator"
" does not require a reference."
)
return input_
def evaluate_strings(

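The criteria evaluator receives the same guard; a hedged usage sketch follows (the import path and ChatOpenAI model are assumptions, the "conciseness" criterion comes from _SUPPORTED_CRITERIA above, and the call signature follows the evaluate_strings method shown in this diff):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.criteria import CriteriaEvalChain

chain = CriteriaEvalChain.from_llm(llm=ChatOpenAI(temperature=0), criteria="conciseness")

# requires_reference defaults to False, so the reference below is ignored and
# warn_once() logs a single warning rather than silently dropping it.
chain.evaluate_strings(
    prediction="The answer is 4.",
    input="What is 2 + 2?",
    reference="4",
)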
View File

@@ -1,98 +0,0 @@
from typing import Tuple
import numpy as np
import pytest
from langchain.evaluation.comparison.embedding import (
EmbeddingDistance,
PairwiseEmbeddingStringEvalChain,
)
@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
"""Create two random vectors."""
np.random.seed(0)
vector_a = np.random.rand(10)
vector_b = np.random.rand(10)
return vector_a, vector_b
@pytest.fixture
def chain() -> PairwiseEmbeddingStringEvalChain:
"""Create a PairwiseEmbeddingStringEvalChain."""
return PairwiseEmbeddingStringEvalChain()
@pytest.mark.requires("scipy")
def test_cosine_similarity(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the cosine similarity."""
chain.distance_metric = EmbeddingDistance.COSINE
result = chain._compute_score(np.array(vectors))
expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_euclidean_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the euclidean distance."""
from scipy.spatial.distance import euclidean
chain.distance_metric = EmbeddingDistance.EUCLIDEAN
result = chain._compute_score(np.array(vectors))
expected = euclidean(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_manhattan_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the manhattan distance."""
from scipy.spatial.distance import cityblock
chain.distance_metric = EmbeddingDistance.MANHATTAN
result = chain._compute_score(np.array(vectors))
expected = cityblock(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_chebyshev_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the chebyshev distance."""
from scipy.spatial.distance import chebyshev
chain.distance_metric = EmbeddingDistance.CHEBYSHEV
result = chain._compute_score(np.array(vectors))
expected = chebyshev(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("scipy")
def test_hamming_distance(
chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
"""Test the hamming distance."""
from scipy.spatial.distance import hamming
chain.distance_metric = EmbeddingDistance.HAMMING
result = chain._compute_score(np.array(vectors))
expected = hamming(*vectors)
assert np.isclose(result, expected)
@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingStringEvalChain) -> None:
"""Test the embedding distance."""
result = chain.evaluate_string_pairs(
prediction="A single cat", prediction_b="A single cat"
)
assert np.isclose(result["score"], 0.0)