community: fixed bug in model2vec embedding code (#28670)

This PR fixes a bug with the current implementation for Model2Vec
embeddings where `embed_documents` does not work as expected.

- **Description**: the current implementation uses `encode_as_sequence`
for encoding documents. This is incorrect, as `encode_as_sequence`
creates token embeddings and not mean embeddings. The normal `encode`
function handles both single and batched inputs and should be used
instead. The return type was also incorrect: `encode` returns a NumPy
array, whereas the `Embeddings` ABC declares list return types. This PR
calls `.tolist()` on the result so that the output is consistent with
the `Embeddings` ABC.
This commit is contained in:
Thomas van Dongen 2024-12-12 00:50:56 +01:00 committed by GitHub
parent b20230c800
commit ee640d6bd3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -6,7 +6,7 @@ from langchain_core.embeddings import Embeddings
 class Model2vecEmbeddings(Embeddings):
-    """model2v embedding models.
+    """Model2Vec embedding models.
     Install model2vec first, run 'pip install -U model2vec'.
     The github repository for model2vec is : https://github.com/MinishLab/model2vec
@@ -51,7 +51,7 @@ class Model2vecEmbeddings(Embeddings):
             List of embeddings, one for each text.
         """
-        return self._model.encode_as_sequence(texts)
+        return self._model.encode(texts).tolist()
     def embed_query(self, text: str) -> List[float]:
         """Embed a query using the model2vec embeddings model.
@@ -63,4 +63,4 @@ class Model2vecEmbeddings(Embeddings):
             Embeddings for the text.
         """
-        return self._model.encode(text)
+        return self._model.encode(text).tolist()