mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 16:13:25 +00:00
community[patch]: update OctoAIEmbeddings to subclass OpenAIEmbeddings (#21805)
This commit is contained in:
parent
ded53297e0
commit
1bc0ea5496
@ -1,100 +1,86 @@
|
|||||||
from typing import Any, Dict, List, Mapping, Optional
|
from typing import Dict
|
||||||
|
|
||||||
from langchain_core.embeddings import Embeddings
|
from langchain_core.pydantic_v1 import Field, SecretStr, root_validator
|
||||||
from langchain_core.pydantic_v1 import BaseModel, Extra, Field, root_validator
|
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
|
||||||
from langchain_core.utils import get_from_dict_or_env
|
|
||||||
|
|
||||||
DEFAULT_EMBED_INSTRUCTION = "Represent this input: "
|
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
||||||
DEFAULT_QUERY_INSTRUCTION = "Represent the question for retrieving similar documents: "
|
from langchain_community.utils.openai import is_openai_v1
|
||||||
|
|
||||||
|
DEFAULT_API_BASE = "https://text.octoai.run/v1/"
|
||||||
|
DEFAULT_MODEL = "thenlper/gte-large"
|
||||||
|
|
||||||
|
|
||||||
class OctoAIEmbeddings(BaseModel, Embeddings):
|
class OctoAIEmbeddings(OpenAIEmbeddings):
|
||||||
"""OctoAI Compute Service embedding models.
|
"""OctoAI Compute Service embedding models.
|
||||||
|
|
||||||
The environment variable ``OCTOAI_API_TOKEN`` should be set
|
See https://octo.ai/ for information about OctoAI.
|
||||||
with your API token, or it can be passed
|
|
||||||
as a named parameter to the constructor.
|
To use, you should have the ``openai`` python package installed and the
|
||||||
|
environment variable ``OCTOAI_API_TOKEN`` set with your API token.
|
||||||
|
Alternatively, you can use the octoai_api_token keyword argument.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
endpoint_url: Optional[str] = Field(None, description="Endpoint URL to use.")
|
octoai_api_token: SecretStr = Field(default=None)
|
||||||
model_kwargs: Optional[dict] = Field(
|
"""OctoAI Endpoints API keys."""
|
||||||
None, description="Keyword arguments to pass to the model."
|
endpoint_url: str = Field(default=DEFAULT_API_BASE)
|
||||||
)
|
"""Base URL path for API requests."""
|
||||||
octoai_api_token: Optional[str] = Field(None, description="OCTOAI API Token")
|
model: str = Field(default=DEFAULT_MODEL)
|
||||||
embed_instruction: str = Field(
|
"""Model name to use."""
|
||||||
DEFAULT_EMBED_INSTRUCTION,
|
tiktoken_enabled: bool = False
|
||||||
description="Instruction to use for embedding documents.",
|
"""Set this to False for non-OpenAI implementations of the embeddings API"""
|
||||||
)
|
|
||||||
query_instruction: str = Field(
|
|
||||||
DEFAULT_QUERY_INSTRUCTION, description="Instruction to use for embedding query."
|
|
||||||
)
|
|
||||||
|
|
||||||
class Config:
|
|
||||||
"""Configuration for this pydantic object."""
|
|
||||||
|
|
||||||
extra = Extra.forbid
|
|
||||||
|
|
||||||
@root_validator(allow_reuse=True)
|
|
||||||
def validate_environment(cls, values: Dict) -> Dict:
|
|
||||||
"""Ensure that the API key and python package exist in environment."""
|
|
||||||
values["octoai_api_token"] = get_from_dict_or_env(
|
|
||||||
values, "octoai_api_token", "OCTOAI_API_TOKEN"
|
|
||||||
)
|
|
||||||
values["endpoint_url"] = get_from_dict_or_env(
|
|
||||||
values, "endpoint_url", "https://text.octoai.run/v1/embeddings"
|
|
||||||
)
|
|
||||||
return values
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _identifying_params(self) -> Mapping[str, Any]:
|
def _llm_type(self) -> str:
|
||||||
"""Return the identifying parameters."""
|
"""Return type of embeddings model."""
|
||||||
return {
|
return "octoai-embeddings"
|
||||||
"endpoint_url": self.endpoint_url,
|
|
||||||
"model_kwargs": self.model_kwargs or {},
|
|
||||||
}
|
|
||||||
|
|
||||||
def _compute_embeddings(
|
@property
|
||||||
self, texts: List[str], instruction: str
|
def lc_secrets(self) -> Dict[str, str]:
|
||||||
) -> List[List[float]]:
|
return {"octoai_api_token": "OCTOAI_API_TOKEN"}
|
||||||
"""Compute embeddings using an OctoAI instruct model."""
|
|
||||||
from octoai import client
|
|
||||||
|
|
||||||
embedding = []
|
@root_validator()
|
||||||
embeddings = []
|
def validate_environment(cls, values: dict) -> dict:
|
||||||
octoai_client = client.Client(token=self.octoai_api_token)
|
"""Validate that api key and python package exists in environment."""
|
||||||
|
values["endpoint_url"] = get_from_dict_or_env(
|
||||||
|
values,
|
||||||
|
"endpoint_url",
|
||||||
|
"ENDPOINT_URL",
|
||||||
|
default=DEFAULT_API_BASE,
|
||||||
|
)
|
||||||
|
values["octoai_api_token"] = convert_to_secret_str(
|
||||||
|
get_from_dict_or_env(values, "octoai_api_token", "OCTOAI_API_TOKEN")
|
||||||
|
)
|
||||||
|
values["model"] = get_from_dict_or_env(
|
||||||
|
values,
|
||||||
|
"model",
|
||||||
|
"MODEL",
|
||||||
|
default=DEFAULT_MODEL,
|
||||||
|
)
|
||||||
|
|
||||||
for text in texts:
|
try:
|
||||||
parameter_payload = {
|
import openai
|
||||||
"sentence": str([text]),
|
|
||||||
"input": str([text]),
|
|
||||||
"instruction": str([instruction]),
|
|
||||||
"model": "thenlper/gte-large",
|
|
||||||
"parameters": self.model_kwargs or {},
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
if is_openai_v1():
|
||||||
resp_json = octoai_client.infer(self.endpoint_url, parameter_payload)
|
client_params = {
|
||||||
if "embeddings" in resp_json:
|
"api_key": values["octoai_api_token"].get_secret_value(),
|
||||||
embedding = resp_json["embeddings"]
|
"base_url": values["endpoint_url"],
|
||||||
elif "data" in resp_json:
|
}
|
||||||
json_data = resp_json["data"]
|
if not values.get("client"):
|
||||||
for item in json_data:
|
values["client"] = openai.OpenAI(**client_params).embeddings
|
||||||
if "embedding" in item:
|
if not values.get("async_client"):
|
||||||
embedding = item["embedding"]
|
values["async_client"] = openai.AsyncOpenAI(
|
||||||
|
**client_params
|
||||||
|
).embeddings
|
||||||
|
else:
|
||||||
|
values["openai_api_base"] = values["endpoint_url"]
|
||||||
|
values["openai_api_key"] = values["octoai_api_token"].get_secret_value()
|
||||||
|
values["client"] = openai.Embedding
|
||||||
|
values["async_client"] = openai.Embedding
|
||||||
|
|
||||||
except Exception as e:
|
except ImportError:
|
||||||
raise ValueError(f"Error raised by the inference endpoint: {e}") from e
|
raise ImportError(
|
||||||
|
"Could not import openai python package. "
|
||||||
|
"Please install it with `pip install openai`."
|
||||||
|
)
|
||||||
|
|
||||||
embeddings.append(embedding)
|
return values
|
||||||
|
|
||||||
return embeddings
|
|
||||||
|
|
||||||
def embed_documents(self, texts: List[str]) -> List[List[float]]:
|
|
||||||
"""Compute document embeddings using an OctoAI instruct model."""
|
|
||||||
texts = list(map(lambda x: x.replace("\n", " "), texts))
|
|
||||||
return self._compute_embeddings(texts, self.embed_instruction)
|
|
||||||
|
|
||||||
def embed_query(self, text: str) -> List[float]:
|
|
||||||
"""Compute query embedding using an OctoAI instruct model."""
|
|
||||||
text = text.replace("\n", " ")
|
|
||||||
return self._compute_embeddings([text], self.query_instruction)[0]
|
|
||||||
|
@ -8,27 +8,15 @@ from langchain_community.embeddings.octoai_embeddings import (
|
|||||||
def test_octoai_embedding_documents() -> None:
|
def test_octoai_embedding_documents() -> None:
|
||||||
"""Test octoai embeddings."""
|
"""Test octoai embeddings."""
|
||||||
documents = ["foo bar"]
|
documents = ["foo bar"]
|
||||||
embedding = OctoAIEmbeddings(
|
embedding = OctoAIEmbeddings()
|
||||||
endpoint_url="<endpoint_url>",
|
|
||||||
octoai_api_token="<octoai_api_token>",
|
|
||||||
embed_instruction="Represent this input: ",
|
|
||||||
query_instruction="Represent this input: ",
|
|
||||||
model_kwargs=None,
|
|
||||||
)
|
|
||||||
output = embedding.embed_documents(documents)
|
output = embedding.embed_documents(documents)
|
||||||
assert len(output) == 1
|
assert len(output) == 1
|
||||||
assert len(output[0]) == 768
|
assert len(output[0]) == 1024
|
||||||
|
|
||||||
|
|
||||||
def test_octoai_embedding_query() -> None:
|
def test_octoai_embedding_query() -> None:
|
||||||
"""Test octoai embeddings."""
|
"""Test octoai embeddings."""
|
||||||
document = "foo bar"
|
document = "foo bar"
|
||||||
embedding = OctoAIEmbeddings(
|
embedding = OctoAIEmbeddings()
|
||||||
endpoint_url="<endpoint_url>",
|
|
||||||
octoai_api_token="<octoai_api_token>",
|
|
||||||
embed_instruction="Represent this input: ",
|
|
||||||
query_instruction="Represent this input: ",
|
|
||||||
model_kwargs=None,
|
|
||||||
)
|
|
||||||
output = embedding.embed_query(document)
|
output = embedding.embed_query(document)
|
||||||
assert len(output) == 768
|
assert len(output) == 1024
|
||||||
|
Loading…
Reference in New Issue
Block a user