# Module path: libs/community/langchain_community/embeddings/text2vec.py
#
# Reconstructed and revised from the following patch (the original patch text
# had been collapsed onto two lines):
#   Commit:  d7c14cb6f9871056e17171fa161c7d73d0691b3f
#   From:    hulitaitai <146365078+hulitaitai@users.noreply.github.com>
#   Date:    Tue, 26 Mar 2024 23:06:58 +0800
#   Subject: [PATCH] community[minor]: Add embeddings integration for
#            text2vec (#19267)
#   Co-authored-by: Bagatur
#   Co-authored-by: Eugene Yurtsev
#
# Commit message (typos fixed): adds a class that allows using the open-source
# "text2vec" embedding model through LangChain. Install the model package by
# running `pip install -U text2vec`. A usage example is given in the class
# docstring below.
"""Wrapper around text2vec embedding models."""

from typing import Any, List, Optional

from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel


def _to_plain_list(vectors: Any) -> Any:
    """Convert an encoder result into (nested) plain Python lists.

    Objects exposing a ``tolist()`` method are converted through it; lists and
    tuples are converted element by element; anything else is returned
    unchanged.
    """
    to_list = getattr(vectors, "tolist", None)
    if callable(to_list):
        return to_list()
    if isinstance(vectors, (list, tuple)):
        return [_to_plain_list(item) for item in vectors]
    return vectors


class Text2vecEmbeddings(Embeddings, BaseModel):
    """text2vec embedding models.

    Install text2vec first, run 'pip install -U text2vec'.

    Example:
        .. code-block:: python

            from langchain_community.embeddings.text2vec import Text2vecEmbeddings

            embedding = Text2vecEmbeddings()
            embedding.embed_documents([
                "This is a CoSENT(Cosine Sentence) model.",
                "It maps sentences to a 768 dimensional dense vector space.",
            ])
            embedding.embed_query(
                "It can be used for text matching or semantic search."
            )
    """

    # Model name or local path; forwarded to ``SentenceModel`` only when given.
    model_name_or_path: Optional[str] = None
    # The three settings below reach ``SentenceModel`` only when passed
    # explicitly as keyword arguments to ``__init__``; the defaults declared
    # here are not themselves pushed into the model.
    encoder_type: Any = "MEAN"
    max_seq_length: int = 256
    device: Optional[str] = None
    # The object whose ``encode`` method produces the embeddings.
    model: Any = None

    def __init__(
        self,
        *,
        model: Any = None,
        model_name_or_path: Optional[str] = None,
        **kwargs: Any,
    ):
        """Create the wrapper, building a ``SentenceModel`` if none is given.

        Args:
            model: A ready-to-use encoder exposing ``encode``. When provided,
                it is used as-is and ``text2vec`` is not imported.
            model_name_or_path: Optional model identifier handed to
                ``SentenceModel`` when a model has to be constructed.
            **kwargs: Extra keyword arguments. They are passed both to
                ``SentenceModel`` (when one is constructed) and to the
                pydantic initializer.

        Raises:
            ImportError: If a model must be constructed but ``text2vec`` is
                not installed.
        """
        if model is None:
            # Import lazily so a caller-supplied model works without the
            # optional dependency being installed.
            try:
                from text2vec import SentenceModel
            except ImportError as e:
                raise ImportError(
                    "Unable to import text2vec, please install with "
                    "`pip install -U text2vec`."
                ) from e

            model_kwargs = {}
            if model_name_or_path is not None:
                model_kwargs["model_name_or_path"] = model_name_or_path
            model = SentenceModel(**model_kwargs, **kwargs)
        super().__init__(model=model, model_name_or_path=model_name_or_path, **kwargs)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed documents using the text2vec embeddings model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list without calling the model.
        """
        if not texts:
            return []
        # NOTE(review): text2vec's ``encode`` is expected to return an array
        # by default; convert so the declared return type holds.
        return _to_plain_list(self.model.encode(texts))

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using the text2vec embeddings model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        # Same conversion as in embed_documents, for a single vector.
        return _to_plain_list(self.model.encode(text))