mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 00:47:27 +00:00
fix[core]: added error message if the query vector or embedding contains NaN values (#31822)
**Description:** Added an explicit validation step in `langchain_core.vectorstores.utils._cosine_similarity` to raise a `ValueError` if the input query or any embedding contains `NaN` values. This prevents silent failures or unstable behavior during similarity calculations, especially when using `maximal_marginal_relevance`.

**Issue:** Fixes #31806

**Dependencies:** None

---------

Co-authored-by: Azhagammal S C <azhagammal@kofluence.com>
Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
parent
a8998a1f57
commit
4d9c0b0883
@ -64,6 +64,9 @@ def _cosine_similarity(x: Matrix, y: Matrix) -> np.ndarray:
|
||||
# Ignore divide-by-zero runtime warnings, as those are handled below.
|
||||
with np.errstate(divide="ignore", invalid="ignore"):
|
||||
similarity = np.dot(x, y.T) / np.outer(x_norm, y_norm)
|
||||
if np.isnan(similarity).all():
|
||||
msg = "NaN values found, please remove the NaN values and try again"
|
||||
raise ValueError(msg) from None
|
||||
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
|
||||
return similarity
|
||||
|
||||
|
@ -42,9 +42,8 @@ class TestCosineSimilarity:
|
||||
"""Test cosine similarity with zero vector."""
|
||||
x: list[list[float]] = [[0, 0, 0]]
|
||||
y: list[list[float]] = [[1, 2, 3]]
|
||||
result = _cosine_similarity(x, y)
|
||||
expected = np.array([[0.0]])
|
||||
np.testing.assert_array_almost_equal(result, expected)
|
||||
with pytest.raises(ValueError, match="NaN values found"):
|
||||
_cosine_similarity(x, y)
|
||||
|
||||
def test_multiple_vectors(self) -> None:
|
||||
"""Test cosine similarity with multiple vectors."""
|
||||
@ -115,13 +114,8 @@ class TestCosineSimilarity:
|
||||
# Create vectors that would result in NaN/inf in similarity calculation
|
||||
x: list[list[float]] = [[0, 0]] # Zero vector
|
||||
y: list[list[float]] = [[0, 0]] # Zero vector
|
||||
result = _cosine_similarity(x, y)
|
||||
|
||||
# Should return 0.0 instead of NaN
|
||||
expected = np.array([[0.0]])
|
||||
np.testing.assert_array_equal(result, expected)
|
||||
assert not np.isnan(result).any()
|
||||
assert not np.isinf(result).any()
|
||||
with pytest.raises(ValueError, match="NaN values found"):
|
||||
_cosine_similarity(x, y)
|
||||
|
||||
def test_large_values(self) -> None:
|
||||
"""Test with large values to check numerical stability."""
|
||||
|
Loading…
Reference in New Issue
Block a user