mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-08 04:25:46 +00:00
Dev2049/hf emb encode kwargs (#3925)
Thanks @amogkam for the addition! Refactored slightly. Co-authored-by: Amog Kamsetty <amogkam@users.noreply.github.com>
This commit is contained in:
parent
ffc87233a1
commit
5db6b796cf
@ -36,6 +36,8 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
|
|||||||
Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
|
Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
|
||||||
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
|
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
|
||||||
"""Key word arguments to pass to the model."""
|
"""Key word arguments to pass to the model."""
|
||||||
|
encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
|
||||||
|
"""Key word arguments to pass when calling the `encode` method of the model."""
|
||||||
|
|
||||||
def __init__(self, **kwargs: Any):
|
def __init__(self, **kwargs: Any):
|
||||||
"""Initialize the sentence_transformer."""
|
"""Initialize the sentence_transformer."""
|
||||||
@ -68,7 +70,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
|
|||||||
List of embeddings, one for each text.
|
List of embeddings, one for each text.
|
||||||
"""
|
"""
|
||||||
texts = list(map(lambda x: x.replace("\n", " "), texts))
|
texts = list(map(lambda x: x.replace("\n", " "), texts))
|
||||||
embeddings = self.client.encode(texts)
|
embeddings = self.client.encode(texts, **self.encode_kwargs)
|
||||||
return embeddings.tolist()
|
return embeddings.tolist()
|
||||||
|
|
||||||
def embed_query(self, text: str) -> List[float]:
|
def embed_query(self, text: str) -> List[float]:
|
||||||
@ -81,7 +83,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
|
|||||||
Embeddings for the text.
|
Embeddings for the text.
|
||||||
"""
|
"""
|
||||||
text = text.replace("\n", " ")
|
text = text.replace("\n", " ")
|
||||||
embedding = self.client.encode(text)
|
embedding = self.client.encode(text, **self.encode_kwargs)
|
||||||
return embedding.tolist()
|
return embedding.tolist()
|
||||||
|
|
||||||
|
|
||||||
@ -89,7 +91,7 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
|
|||||||
"""Wrapper around sentence_transformers embedding models.
|
"""Wrapper around sentence_transformers embedding models.
|
||||||
|
|
||||||
To use, you should have the ``sentence_transformers``
|
To use, you should have the ``sentence_transformers``
|
||||||
and ``InstructorEmbedding`` python package installed.
|
and ``InstructorEmbedding`` python packages installed.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
@ -108,7 +110,7 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
|
|||||||
"""Model name to use."""
|
"""Model name to use."""
|
||||||
cache_folder: Optional[str] = None
|
cache_folder: Optional[str] = None
|
||||||
"""Path to store models.
|
"""Path to store models.
|
||||||
Can be also set by SENTENCE_TRANSFORMERS_HOME enviroment variable."""
|
Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
|
||||||
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
|
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
|
||||||
"""Key word arguments to pass to the model."""
|
"""Key word arguments to pass to the model."""
|
||||||
embed_instruction: str = DEFAULT_EMBED_INSTRUCTION
|
embed_instruction: str = DEFAULT_EMBED_INSTRUCTION
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
"""Test huggingface embeddings."""
|
"""Test huggingface embeddings."""
|
||||||
import unittest
|
|
||||||
|
|
||||||
from langchain.embeddings.huggingface import (
|
from langchain.embeddings.huggingface import (
|
||||||
HuggingFaceEmbeddings,
|
HuggingFaceEmbeddings,
|
||||||
@ -7,7 +6,6 @@ from langchain.embeddings.huggingface import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@unittest.skip("This test causes a segfault.")
|
|
||||||
def test_huggingface_embedding_documents() -> None:
|
def test_huggingface_embedding_documents() -> None:
|
||||||
"""Test huggingface embeddings."""
|
"""Test huggingface embeddings."""
|
||||||
documents = ["foo bar"]
|
documents = ["foo bar"]
|
||||||
@ -17,11 +15,10 @@ def test_huggingface_embedding_documents() -> None:
|
|||||||
assert len(output[0]) == 768
|
assert len(output[0]) == 768
|
||||||
|
|
||||||
|
|
||||||
@unittest.skip("This test causes a segfault.")
|
|
||||||
def test_huggingface_embedding_query() -> None:
|
def test_huggingface_embedding_query() -> None:
|
||||||
"""Test huggingface embeddings."""
|
"""Test huggingface embeddings."""
|
||||||
document = "foo bar"
|
document = "foo bar"
|
||||||
embedding = HuggingFaceEmbeddings()
|
embedding = HuggingFaceEmbeddings(encode_kwargs={"batch_size": 16})
|
||||||
output = embedding.embed_query(document)
|
output = embedding.embed_query(document)
|
||||||
assert len(output) == 768
|
assert len(output) == 768
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user