From d01bad5169ae39aebee572e675a3792eb614a8f7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 25 Mar 2024 16:36:26 -0700 Subject: [PATCH] core[patch]: Convert SimSIMD back to NumPy (#19473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes issue #18022 by converting SimSIMD's internal zero-copy outputs to NumPy arrays. I've also noticed that a `dtype=np.float32` conversion is often applied before the data is passed to SimSIMD. Which numeric types do LangChain users generally care about? We support `float64`, `float32`, `float16`, and `int8` for cosine distances, and `float16` seems reasonable for practically any kind of embeddings and any modern piece of hardware, so we can change that part as well 🤗 --- libs/community/langchain_community/utils/math.py | 2 +- .../elasticsearch/langchain_elasticsearch/_utilities.py | 2 +- libs/partners/mongodb/langchain_mongodb/utils.py | 2 +- libs/partners/pinecone/langchain_pinecone/_utilities.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/utils/math.py b/libs/community/langchain_community/utils/math.py index 99d47368197..2522c1255c6 100644 --- a/libs/community/langchain_community/utils/math.py +++ b/libs/community/langchain_community/utils/math.py @@ -29,7 +29,7 @@ def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: Z = 1 - simd.cdist(X, Y, metric="cosine") if isinstance(Z, float): return np.array([Z]) - return Z + return np.array(Z) except ImportError: logger.info( "Unable to import simsimd, defaulting to NumPy implementation. 
If you want " diff --git a/libs/partners/elasticsearch/langchain_elasticsearch/_utilities.py b/libs/partners/elasticsearch/langchain_elasticsearch/_utilities.py index 0280708736f..33b302241ea 100644 --- a/libs/partners/elasticsearch/langchain_elasticsearch/_utilities.py +++ b/libs/partners/elasticsearch/langchain_elasticsearch/_utilities.py @@ -79,7 +79,7 @@ def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: Z = 1 - simd.cdist(X, Y, metric="cosine") if isinstance(Z, float): return np.array([Z]) - return Z + return np.array(Z) except ImportError: X_norm = np.linalg.norm(X, axis=1) Y_norm = np.linalg.norm(Y, axis=1) diff --git a/libs/partners/mongodb/langchain_mongodb/utils.py b/libs/partners/mongodb/langchain_mongodb/utils.py index feb34ad1c23..854b2bc939a 100644 --- a/libs/partners/mongodb/langchain_mongodb/utils.py +++ b/libs/partners/mongodb/langchain_mongodb/utils.py @@ -38,7 +38,7 @@ def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: Z = 1 - simd.cdist(X, Y, metric="cosine") if isinstance(Z, float): return np.array([Z]) - return Z + return np.array(Z) except ImportError: logger.info( "Unable to import simsimd, defaulting to NumPy implementation. If you want " diff --git a/libs/partners/pinecone/langchain_pinecone/_utilities.py b/libs/partners/pinecone/langchain_pinecone/_utilities.py index 37d61e9fb11..5ad9e407fcd 100644 --- a/libs/partners/pinecone/langchain_pinecone/_utilities.py +++ b/libs/partners/pinecone/langchain_pinecone/_utilities.py @@ -69,7 +69,7 @@ def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: Z = 1 - simd.cdist(X, Y, metric="cosine") if isinstance(Z, float): return np.array([Z]) - return Z + return np.array(Z) except ImportError: X_norm = np.linalg.norm(X, axis=1) Y_norm = np.linalg.norm(Y, axis=1)