mirror of https://github.com/hwchase17/langchain.git (synced 2026-02-08 02:00:06 +00:00)

Compare commits: vwp/embedd...vwp/compar (1 commit, SHA 75fd543b33)
@@ -1,188 +0,0 @@
"""A chain for comparing the output of two models using embeddings."""
from enum import Enum
from typing import Any, Dict, List, Optional

import numpy as np
from pydantic import Field, root_validator

from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
    Callbacks,
)
from langchain.chains.base import Chain
from langchain.embeddings.base import Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.evaluation.schema import PairwiseStringEvaluator
from langchain.math_utils import cosine_similarity


class EmbeddingDistance(str, Enum):
    COSINE = "cosine"
    EUCLIDEAN = "euclidean"
    MANHATTAN = "manhattan"
    CHEBYSHEV = "chebyshev"
    HAMMING = "hamming"


class PairwiseEmbeddingStringEvalChain(Chain, PairwiseStringEvaluator):
    """A chain for comparing the output of two models using embeddings."""

    embeddings: Embeddings = Field(default_factory=OpenAIEmbeddings)
    """The embedding objects to vectorize the outputs."""
    distance_metric: EmbeddingDistance = Field(default=EmbeddingDistance.COSINE)
    """The distance metric to use for comparing the embeddings."""

    @root_validator
    def _validate_distance_metric(cls, values: dict) -> dict:
        """Validate the distance metric.

        Args:
            values (dict): The values to validate.

        Returns:
            dict: The validated values.
        """
        values["distance_metric"] = values["distance_metric"].lower()
        return values

    def _get_metric(self, metric: EmbeddingDistance) -> Any:
        """Get the metric function for the given metric name.

        Args:
            metric (str): The metric name.

        Returns:
            Any: The metric function.
        """
        metrics = {
            EmbeddingDistance.COSINE: self._cosine_distance,
            EmbeddingDistance.EUCLIDEAN: self._euclidean_distance,
            EmbeddingDistance.MANHATTAN: self._manhattan_distance,
            EmbeddingDistance.CHEBYSHEV: self._chebyshev_distance,
            EmbeddingDistance.HAMMING: self._hamming_distance,
        }
        if metric in metrics:
            return metrics[metric]
        else:
            raise ValueError(f"Invalid metric: {metric}")

    @staticmethod
    def _cosine_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
        return 1.0 - cosine_similarity(a, b)

    @staticmethod
    def _euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        return np.linalg.norm(a - b)

    @staticmethod
    def _manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        return np.sum(np.abs(a - b))

    @staticmethod
    def _chebyshev_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        return np.max(np.abs(a - b))

    @staticmethod
    def _hamming_distance(a: np.ndarray, b: np.ndarray) -> np.floating:
        return np.mean(a != b)

    def _compute_score(self, vectors: np.ndarray) -> float:
        metric = self._get_metric(self.distance_metric)
        score = metric(vectors[0].reshape(1, -1), vectors[1].reshape(1, -1)).item()
        return score

    @property
    def input_keys(self) -> List[str]:
        return ["prediction", "prediction_b"]

    @property
    def output_keys(self) -> List[str]:
        return ["score"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        vectors = np.array(
            self.embeddings.embed_documents(
                [inputs["prediction"], inputs["prediction_b"]]
            )
        )
        score = self._compute_score(vectors)
        return {"score": score}

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        embedded = await self.embeddings.aembed_documents(
            [inputs["prediction"], inputs["prediction_b"]]
        )
        vectors = np.array(embedded)
        score = self._compute_score(vectors)
        return {"score": score}

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the embedding distance between two predictions.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            input (str): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - score: The embedding distance between the two predictions.
        """
        return self(
            inputs={"prediction": prediction, "prediction_b": prediction_b},
            callbacks=callbacks,
        )

    async def aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the embedding distance between two predictions.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            input (str): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - score: The embedding distance between the two predictions.
        """
        return await self.acall(
            inputs={"prediction": prediction, "prediction_b": prediction_b},
            callbacks=callbacks,
        )
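For context, here is a minimal usage sketch of the deleted chain (not part of the diff). It assumes the module path `langchain.evaluation.comparison.embedding` used by the test file further down, and substitutes a hypothetical `FakeEmbeddings` stand-in so the snippet runs without OpenAI credentials; by default the chain would use `OpenAIEmbeddings`.

```python
from typing import List

from langchain.embeddings.base import Embeddings
from langchain.evaluation.comparison.embedding import (
    EmbeddingDistance,
    PairwiseEmbeddingStringEvalChain,
)


class FakeEmbeddings(Embeddings):
    """Hypothetical stand-in that maps strings to character-count vectors."""

    def embed_query(self, text: str) -> List[float]:
        # Count a fixed alphabet so identical strings get identical vectors.
        return [float(text.count(c)) for c in "abcdefghij"]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.embed_query(t) for t in texts]


chain = PairwiseEmbeddingStringEvalChain(
    embeddings=FakeEmbeddings(),
    distance_metric=EmbeddingDistance.COSINE,  # the default
)
result = chain.evaluate_string_pairs(
    prediction="a single cat", prediction_b="a single cat"
)
print(result["score"])  # 0.0: identical strings have zero cosine distance
```

Lower scores mean the two outputs sit closer in embedding space; swapping in `EmbeddingDistance.EUCLIDEAN` or any other member changes only the distance function.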
@@ -1,6 +1,8 @@
 """Base classes for comparing the output of two models."""
 from __future__ import annotations
 
+import logging
+from functools import lru_cache
 from typing import Any, Optional
 
 from pydantic import Field
@@ -12,6 +14,13 @@ from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
 from langchain.prompts.prompt import PromptTemplate
 from langchain.schema import BaseOutputParser
 
+logger = logging.getLogger(__name__)
+
+
+@lru_cache(maxsize=1)
+def warn_once(message: str) -> None:
+    logger.warning(message)
+
 
 class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
     """A parser for the output of the PairwiseStringEvalChain."""
@@ -86,7 +95,7 @@ class PairwiseStringEvalChain(LLMChain):
         *,
         llm: BaseLanguageModel,
         prompt: Optional[PromptTemplate] = None,
-        require_reference: bool = False,
+        requires_reference: bool = False,
         **kwargs: Any,
     ) -> PairwiseStringEvalChain:
         """Initialize the PairwiseStringEvalChain from an LLM.
@@ -94,7 +103,7 @@ class PairwiseStringEvalChain(LLMChain):
         Args:
             llm (BaseLanguageModel): The LLM to use.
             prompt (PromptTemplate, optional): The prompt to use.
-            require_reference (bool, optional): Whether to require a reference
+            requires_reference (bool, optional): Whether to require a reference
                 string. Defaults to False.
             **kwargs (Any): Additional keyword arguments.
 
@@ -103,13 +112,13 @@ class PairwiseStringEvalChain(LLMChain):
         """
         expected_input_vars = {"prediction", "prediction_b", "input"}
         if prompt is None:
-            if require_reference:
+            if requires_reference:
                 expected_input_vars.add("reference")
                 prompt_ = PROMPT_WITH_REFERENCE
             else:
                 prompt_ = PROMPT
         else:
-            if require_reference:
+            if requires_reference:
                 expected_input_vars.add("reference")
             prompt_ = prompt
 
@@ -128,8 +137,18 @@ class PairwiseStringEvalChain(LLMChain):
             "prediction_b": prediction_b,
             "input": input,
         }
-        if reference is not None and "reference" in self.prompt.input_variables:
+        if "reference" in self.prompt.input_variables:
+            if reference is None:
+                raise ValueError(
+                    "Prompt requires a reference string, but none was provided."
+                )
             input_["reference"] = reference
+        elif reference is not None:
+            warn_once(
+                "Ignoring reference string in PairwiseStringEvalChain."
+                " To use references, initialize with argument `requires_reference=True`"
+                ' or use a prompt that includes "reference" as an input variable.'
+            )
         return input_
 
     def evaluate_string_pairs(
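A short sketch of how the renamed flag behaves after this change (not part of the diff). It assumes `PairwiseStringEvalChain` is importable from `langchain.evaluation.comparison`, and uses `FakeListLLM` purely as a stand-in so no model call is made.

```python
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.llms.fake import FakeListLLM

chain = PairwiseStringEvalChain.from_llm(
    llm=FakeListLLM(responses=["[[A]]"]),
    requires_reference=True,  # previously spelled require_reference
)

# With requires_reference=True the chain is built on PROMPT_WITH_REFERENCE,
# so "reference" becomes a required prompt variable.
assert "reference" in chain.prompt.input_variables
```

Per the last hunk above, evaluating such a chain without supplying a reference now raises `ValueError("Prompt requires a reference string, but none was provided.")`, while passing a reference to a chain whose prompt does not accept one triggers a single `warn_once` log instead of being silently dropped.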
@@ -1,129 +0,0 @@
from enum import Enum
from typing import Any, Callable, Dict, Optional

from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
    Callbacks,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import PairwiseStringEvaluator


def _load_rapidfuzz() -> Any:
    try:
        import rapidfuzz
    except ImportError:
        raise ImportError(
            "Please install the rapidfuzz library to use the FuzzyMatchStringEvaluator."
        )
    return rapidfuzz.distance


class StringDistance(str, Enum):
    DAMERAU_LEVENSHTEIN = "damerau_levenshtein"
    LEVENSHTEIN = "levenshtein"
    JARO = "jaro"
    JARO_WINKLER = "jaro_winkler"


class FuzzyMatchStringEvaluator(Chain, PairwiseStringEvaluator):
    def __init__(
        self,
        distance: str = StringDistance.DAMERAU_LEVENSHTEIN,
    ) -> None:
        self.metric = self._get_metric(distance)

    @staticmethod
    def _get_metric(distance: str) -> Callable:
        rf_distance = _load_rapidfuzz()
        if distance == StringDistance.DAMERAU_LEVENSHTEIN:
            return rf_distance.DamerauLevenshtein.distance
        elif distance == StringDistance.LEVENSHTEIN:
            return rf_distance.Levenshtein.distance
        elif distance == StringDistance.JARO:
            return rf_distance.Jaro.distance
        elif distance == StringDistance.JARO_WINKLER:
            return rf_distance.JaroWinkler.distance
        else:
            raise ValueError(f"Invalid distance metric: {distance}")

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        return {"score": self.metric(inputs["prediction"], inputs["prediction_b"])}

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the string distance between two predictions.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            input (str): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - score: The string distance between the two predictions.
        """
        return self(
            inputs={"prediction": prediction, "prediction_b": prediction_b},
            callbacks=callbacks,
        )

    async def aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the string distance between two predictions.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            input (str): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - score: The string distance between the two predictions.
        """
        return await self.acall(
            inputs={"prediction": prediction, "prediction_b": prediction_b},
            callbacks=callbacks,
        )
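For reference, here are the rapidfuzz calls that `_get_metric` dispatches to, used directly (a sketch, not part of the diff); `rapidfuzz` is an optional dependency that must be installed separately. Each printed value is exactly the `score` the evaluator would return for the corresponding `StringDistance` setting.

```python
from rapidfuzz.distance import DamerauLevenshtein, Jaro, JaroWinkler, Levenshtein

prediction = "the quick brown fox"
prediction_b = "the quick brown fax"

print(DamerauLevenshtein.distance(prediction, prediction_b))  # 1 (one substitution)
print(Levenshtein.distance(prediction, prediction_b))         # 1
print(Jaro.distance(prediction, prediction_b))                # 1 - Jaro similarity
print(JaroWinkler.distance(prediction, prediction_b))         # 1 - Jaro-Winkler similarity
```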
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import logging
+from functools import lru_cache
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
 
 from pydantic import Field
@@ -10,6 +12,8 @@ from langchain.chains.llm import LLMChain
 from langchain.evaluation.criteria.prompt import PROMPT, PROMPT_WITH_REFERENCES
 from langchain.schema import BaseOutputParser, BasePromptTemplate
 
+logger = logging.getLogger(__name__)
+
 _SUPPORTED_CRITERIA = {
     "conciseness": "Is the submission concise and to the point?",
     "relevance": "Is the submission referring to a real quote from the text?",
@@ -25,6 +29,11 @@ _SUPPORTED_CRITERIA = {
 }
 
 
+@lru_cache(maxsize=1)
+def warn_once(message: str) -> None:
+    logger.warning(message)
+
+
 class CriteriaResultOutputParser(BaseOutputParser[dict]):
     """A parser for the output of the CriteriaEvalChain."""
 
@@ -250,6 +259,11 @@ class CriteriaEvalChain(LLMChain):
         }
         if self.requires_reference:
             input_["reference"] = reference
+        elif reference is not None:
+            warn_once(
+                "The reference text will be ignored because this Criteria evaluator"
+                " does not require a reference."
+            )
         return input_
 
     def evaluate_strings(
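Both modified chains add the same `warn_once` helper. A small self-contained sketch of the pattern (not part of the diff): `lru_cache` memoizes the call, so repeating an identical message logs it only once; with `maxsize=1`, only the most recently seen message is deduplicated.

```python
import logging
from functools import lru_cache

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


@lru_cache(maxsize=1)
def warn_once(message: str) -> None:
    logger.warning(message)


for _ in range(3):
    warn_once("The reference text will be ignored.")
# The warning above is emitted a single time, not three times.
```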
@@ -1,98 +0,0 @@
from typing import Tuple

import numpy as np
import pytest

from langchain.evaluation.comparison.embedding import (
    EmbeddingDistance,
    PairwiseEmbeddingStringEvalChain,
)


@pytest.fixture
def vectors() -> Tuple[np.ndarray, np.ndarray]:
    """Create two random vectors."""
    np.random.seed(0)
    vector_a = np.random.rand(10)
    vector_b = np.random.rand(10)
    return vector_a, vector_b


@pytest.fixture
def chain() -> PairwiseEmbeddingStringEvalChain:
    """Create a PairwiseEmbeddingStringEvalChain."""
    return PairwiseEmbeddingStringEvalChain()


@pytest.mark.requires("scipy")
def test_cosine_similarity(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the cosine distance."""
    chain.distance_metric = EmbeddingDistance.COSINE
    result = chain._compute_score(np.array(vectors))
    expected = 1.0 - np.dot(vectors[0], vectors[1]) / (
        np.linalg.norm(vectors[0]) * np.linalg.norm(vectors[1])
    )
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_euclidean_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the euclidean distance."""
    from scipy.spatial.distance import euclidean

    chain.distance_metric = EmbeddingDistance.EUCLIDEAN
    result = chain._compute_score(np.array(vectors))
    expected = euclidean(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_manhattan_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the manhattan distance."""
    from scipy.spatial.distance import cityblock

    chain.distance_metric = EmbeddingDistance.MANHATTAN
    result = chain._compute_score(np.array(vectors))
    expected = cityblock(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_chebyshev_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the chebyshev distance."""
    from scipy.spatial.distance import chebyshev

    chain.distance_metric = EmbeddingDistance.CHEBYSHEV
    result = chain._compute_score(np.array(vectors))
    expected = chebyshev(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("scipy")
def test_hamming_distance(
    chain: PairwiseEmbeddingStringEvalChain, vectors: Tuple[np.ndarray, np.ndarray]
) -> None:
    """Test the hamming distance."""
    from scipy.spatial.distance import hamming

    chain.distance_metric = EmbeddingDistance.HAMMING
    result = chain._compute_score(np.array(vectors))
    expected = hamming(*vectors)
    assert np.isclose(result, expected)


@pytest.mark.requires("openai", "tiktoken")
def test_embedding_distance(chain: PairwiseEmbeddingStringEvalChain) -> None:
    """Test the embedding distance."""
    result = chain.evaluate_string_pairs(
        prediction="A single cat", prediction_b="A single cat"
    )
    assert np.isclose(result["score"], 0.0)