From ee640d6bd352b43a2d417617c0c2486c7d1b0997 Mon Sep 17 00:00:00 2001
From: Thomas van Dongen
Date: Thu, 12 Dec 2024 00:50:56 +0100
Subject: [PATCH] community: fixed bug in model2vec embedding code (#28670)

This PR fixes a bug with the current implementation for Model2Vec embeddings where `embed_documents` does not work as expected.

- **Description**: the current implementation uses `encode_as_sequence` for encoding documents. This is incorrect, as `encode_as_sequence` creates token embeddings and not mean embeddings. The normal `encode` function handles both single and batched inputs and should be used instead. The return type was also incorrect, as encode returns a NumPy array. This PR converts the embedding to a list so that the output is consistent with the Embeddings ABC.
---
 libs/community/langchain_community/embeddings/model2vec.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libs/community/langchain_community/embeddings/model2vec.py b/libs/community/langchain_community/embeddings/model2vec.py
index 8cba54b0901..223f611b0ed 100644
--- a/libs/community/langchain_community/embeddings/model2vec.py
+++ b/libs/community/langchain_community/embeddings/model2vec.py
@@ -6,7 +6,7 @@ from langchain_core.embeddings import Embeddings
 
 
 class Model2vecEmbeddings(Embeddings):
-    """model2v embedding models.
+    """Model2Vec embedding models.
 
     Install model2vec first, run 'pip install -U model2vec'.
     The github repository for model2vec is : https://github.com/MinishLab/model2vec
@@ -51,7 +51,7 @@ class Model2vecEmbeddings(Embeddings):
             List of embeddings, one for each text.
         """
 
-        return self._model.encode_as_sequence(texts)
+        return self._model.encode(texts).tolist()
 
     def embed_query(self, text: str) -> List[float]:
         """Embed a query using the model2vec embeddings model.
@@ -63,4 +63,4 @@ class Model2vecEmbeddings(Embeddings):
             Embeddings for the text.
         """
 
-        return self._model.encode(text)
+        return self._model.encode(text).tolist()