mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 08:56:27 +00:00
fix[core]: added error message if the query vector or embedding contains NaN values (#31822)
**Description:** Added an explicit validation step in `langchain_core.vectorstores.utils._cosine_similarity` that raises a `ValueError` if the input query or any embedding contains `NaN` values. This prevents silent failures or unstable behavior during similarity calculations, especially when using `maximal_marginal_relevance`. **Issue:** Fixes #31806 **Dependencies:** None --------- Co-authored-by: Azhagammal S C <azhagammal@kofluence.com> Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
parent
a8998a1f57
commit
4d9c0b0883
@ -64,6 +64,9 @@ def _cosine_similarity(x: Matrix, y: Matrix) -> np.ndarray:
|
|||||||
# Ignore divide by zero errors run time warnings as those are handled below.
|
# Ignore divide by zero errors run time warnings as those are handled below.
|
||||||
with np.errstate(divide="ignore", invalid="ignore"):
|
with np.errstate(divide="ignore", invalid="ignore"):
|
||||||
similarity = np.dot(x, y.T) / np.outer(x_norm, y_norm)
|
similarity = np.dot(x, y.T) / np.outer(x_norm, y_norm)
|
||||||
|
if np.isnan(similarity).all():
|
||||||
|
msg = "NaN values found, please remove the NaN values and try again"
|
||||||
|
raise ValueError(msg) from None
|
||||||
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
|
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
|
||||||
return similarity
|
return similarity
|
||||||
|
|
||||||
|
@ -42,9 +42,8 @@ class TestCosineSimilarity:
|
|||||||
"""Test cosine similarity with zero vector."""
|
"""Test cosine similarity with zero vector."""
|
||||||
x: list[list[float]] = [[0, 0, 0]]
|
x: list[list[float]] = [[0, 0, 0]]
|
||||||
y: list[list[float]] = [[1, 2, 3]]
|
y: list[list[float]] = [[1, 2, 3]]
|
||||||
result = _cosine_similarity(x, y)
|
with pytest.raises(ValueError, match="NaN values found"):
|
||||||
expected = np.array([[0.0]])
|
_cosine_similarity(x, y)
|
||||||
np.testing.assert_array_almost_equal(result, expected)
|
|
||||||
|
|
||||||
def test_multiple_vectors(self) -> None:
|
def test_multiple_vectors(self) -> None:
|
||||||
"""Test cosine similarity with multiple vectors."""
|
"""Test cosine similarity with multiple vectors."""
|
||||||
@ -115,13 +114,8 @@ class TestCosineSimilarity:
|
|||||||
# Create vectors that would result in NaN/inf in similarity calculation
|
# Create vectors that would result in NaN/inf in similarity calculation
|
||||||
x: list[list[float]] = [[0, 0]] # Zero vector
|
x: list[list[float]] = [[0, 0]] # Zero vector
|
||||||
y: list[list[float]] = [[0, 0]] # Zero vector
|
y: list[list[float]] = [[0, 0]] # Zero vector
|
||||||
result = _cosine_similarity(x, y)
|
with pytest.raises(ValueError, match="NaN values found"):
|
||||||
|
_cosine_similarity(x, y)
|
||||||
# Should return 0.0 instead of NaN
|
|
||||||
expected = np.array([[0.0]])
|
|
||||||
np.testing.assert_array_equal(result, expected)
|
|
||||||
assert not np.isnan(result).any()
|
|
||||||
assert not np.isinf(result).any()
|
|
||||||
|
|
||||||
def test_large_values(self) -> None:
|
def test_large_values(self) -> None:
|
||||||
"""Test with large values to check numerical stability."""
|
"""Test with large values to check numerical stability."""
|
||||||
|
Loading…
Reference in New Issue
Block a user