fix[core]: added error message if the query vector or embedding contains NaN values (#31822)

**Description:**  
Added an explicit validation step in
`langchain_core.vectorstores.utils._cosine_similarity` that raises a
`ValueError` when every entry of the computed similarity matrix is `NaN`,
for example when the input query contains `NaN` values or is a zero
vector (0/0). Previously such results were silently replaced with `0.0`.
This prevents silent failures or unstable behavior during similarity
calculations, especially when using `maximal_marginal_relevance`.

**Issue**:
Fixes #31806 

**Dependencies:**  
None

---------

Co-authored-by: Azhagammal S C <azhagammal@kofluence.com>
Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
Azhagammal 2025-07-10 04:00:26 +05:30 committed by GitHub
parent a8998a1f57
commit 4d9c0b0883
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 7 additions and 10 deletions

View File

@@ -64,6 +64,9 @@ def _cosine_similarity(x: Matrix, y: Matrix) -> np.ndarray:
# Ignore divide by zero errors run time warnings as those are handled below.
with np.errstate(divide="ignore", invalid="ignore"):
similarity = np.dot(x, y.T) / np.outer(x_norm, y_norm)
if np.isnan(similarity).all():
msg = "NaN values found, please remove the NaN values and try again"
raise ValueError(msg) from None
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
return similarity

View File

@@ -42,9 +42,8 @@ class TestCosineSimilarity:
"""Test cosine similarity with zero vector."""
x: list[list[float]] = [[0, 0, 0]]
y: list[list[float]] = [[1, 2, 3]]
result = _cosine_similarity(x, y)
expected = np.array([[0.0]])
np.testing.assert_array_almost_equal(result, expected)
with pytest.raises(ValueError, match="NaN values found"):
_cosine_similarity(x, y)
def test_multiple_vectors(self) -> None:
"""Test cosine similarity with multiple vectors."""
@@ -115,13 +114,8 @@ class TestCosineSimilarity:
# Create vectors that would result in NaN/inf in similarity calculation
x: list[list[float]] = [[0, 0]] # Zero vector
y: list[list[float]] = [[0, 0]] # Zero vector
result = _cosine_similarity(x, y)
# Should return 0.0 instead of NaN
expected = np.array([[0.0]])
np.testing.assert_array_equal(result, expected)
assert not np.isnan(result).any()
assert not np.isinf(result).any()
with pytest.raises(ValueError, match="NaN values found"):
_cosine_similarity(x, y)
def test_large_values(self) -> None:
"""Test with large values to check numerical stability."""