diff --git a/docs/docs/integrations/text_embedding/clova.ipynb b/docs/docs/integrations/text_embedding/clova.ipynb
new file mode 100644
index 00000000000..cc5aeeed5c9
--- /dev/null
+++ b/docs/docs/integrations/text_embedding/clova.ipynb
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Clova Embeddings\n",
+    "[Clova](https://api.ncloud-docs.com/docs/ai-naver-clovastudio-summary) offers an embeddings service.\n",
+    "\n",
+    "This example goes over how to use LangChain to interact with Clova inference for text embedding.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"CLOVA_EMB_API_KEY\"] = \"\"\n",
+    "os.environ[\"CLOVA_EMB_APIGW_API_KEY\"] = \"\"\n",
+    "os.environ[\"CLOVA_EMB_APP_ID\"] = \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.embeddings import ClovaEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = ClovaEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_text = \"This is a test query.\"\n",
+    "query_result = embeddings.embed_query(query_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document_text = [\"This is a test doc1.\", \"This is a test doc2.\"]\n",
+    "document_result = embeddings.embed_documents(document_text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/libs/community/langchain_community/embeddings/__init__.py b/libs/community/langchain_community/embeddings/__init__.py
index 200741fc6e0..b6e3f883613 100644
--- a/libs/community/langchain_community/embeddings/__init__.py
+++ b/libs/community/langchain_community/embeddings/__init__.py
@@ -43,6 +43,9 @@ if TYPE_CHECKING:
     from langchain_community.embeddings.clarifai import (
         ClarifaiEmbeddings,
     )
+    from langchain_community.embeddings.clova import (
+        ClovaEmbeddings,
+    )
     from langchain_community.embeddings.cohere import (
         CohereEmbeddings,
     )
@@ -232,6 +235,7 @@ __all__ = [
     "BedrockEmbeddings",
     "BookendEmbeddings",
     "ClarifaiEmbeddings",
+    "ClovaEmbeddings",
     "CohereEmbeddings",
     "DashScopeEmbeddings",
     "DatabricksEmbeddings",
@@ -309,6 +313,7 @@ _module_lookup = {
     "BedrockEmbeddings": "langchain_community.embeddings.bedrock",
     "BookendEmbeddings": "langchain_community.embeddings.bookend",
     "ClarifaiEmbeddings": "langchain_community.embeddings.clarifai",
+    "ClovaEmbeddings": "langchain_community.embeddings.clova",
     "CohereEmbeddings": "langchain_community.embeddings.cohere",
     "DashScopeEmbeddings": "langchain_community.embeddings.dashscope",
     "DatabricksEmbeddings": "langchain_community.embeddings.databricks",
diff --git a/libs/community/langchain_community/embeddings/clova.py b/libs/community/langchain_community/embeddings/clova.py
new file mode 100644
index 00000000000..59b28782e33
--- /dev/null
+++ b/libs/community/langchain_community/embeddings/clova.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+from typing import Dict, List, Optional, cast
+
+import requests
+from langchain_core.embeddings import Embeddings
+from langchain_core.pydantic_v1 import BaseModel, Extra, SecretStr, root_validator
+from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
+
+
+class ClovaEmbeddings(BaseModel, Embeddings):
+    """Embed text with Clova's embedding service.
+
+    To use this service, set the following environment variables with your
+    API tokens and application ID, or pass the values as named parameters
+    to the constructor:
+
+    - ``CLOVA_EMB_API_KEY``: API key for accessing Clova's embedding service.
+    - ``CLOVA_EMB_APIGW_API_KEY``: API gateway key for enhanced security.
+    - ``CLOVA_EMB_APP_ID``: Application ID for identifying your application.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_community.embeddings import ClovaEmbeddings
+            embeddings = ClovaEmbeddings(
+                clova_emb_api_key='your_clova_emb_api_key',
+                clova_emb_apigw_api_key='your_clova_emb_apigw_api_key',
+                app_id='your_app_id'
+            )
+
+            query_text = "This is a test query."
+            query_result = embeddings.embed_query(query_text)
+
+            document_text = "This is a test document."
+            document_result = embeddings.embed_documents([document_text])
+
+    """
+
+    endpoint_url: str = (
+        "https://clovastudio.apigw.ntruss.com/testapp/v1/api-tools/embedding"
+    )
+    """Endpoint URL to use."""
+    model: str = "clir-emb-dolphin"
+    """Embedding model name to use."""
+    clova_emb_api_key: Optional[SecretStr] = None
+    """API key for accessing Clova's embedding service."""
+    clova_emb_apigw_api_key: Optional[SecretStr] = None
+    """API gateway key for enhanced security."""
+    app_id: Optional[SecretStr] = None
+    """Application ID for identifying your application."""
+    timeout: float = 60.0
+    """Seconds to wait for the API to respond before giving up."""
+
+    class Config:
+        extra = Extra.forbid
+
+    @root_validator(pre=True, allow_reuse=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Resolve the API keys and app ID from the arguments or environment.
+
+        Each credential is looked up in ``values`` first and then in its
+        environment variable, and is stored wrapped as a secret string.
+        """
+        values["clova_emb_api_key"] = convert_to_secret_str(
+            get_from_dict_or_env(values, "clova_emb_api_key", "CLOVA_EMB_API_KEY")
+        )
+        values["clova_emb_apigw_api_key"] = convert_to_secret_str(
+            get_from_dict_or_env(
+                values, "clova_emb_apigw_api_key", "CLOVA_EMB_APIGW_API_KEY"
+            )
+        )
+        values["app_id"] = convert_to_secret_str(
+            get_from_dict_or_env(values, "app_id", "CLOVA_EMB_APP_ID")
+        )
+        return values
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Embed a list of texts and return their embeddings.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text.
+        """
+        # The API call below embeds one text per request.
+        return [self._embed_text(text) for text in texts]
+
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Embed a single query text and return its embedding.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        return self._embed_text(text)
+
+    def _embed_text(self, text: str) -> List[float]:
+        """
+        Call the embedding API for one text and return its embedding.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            The embedding found under ``result.embedding`` in the response.
+
+        Raises:
+            ValueError: If the API does not answer with status 200, or the
+                response body does not contain an embedding.
+        """
+        payload = {"text": text}
+
+        # HTTP headers for authorization
+        headers = {
+            "X-NCP-CLOVASTUDIO-API-KEY": cast(
+                SecretStr, self.clova_emb_api_key
+            ).get_secret_value(),
+            "X-NCP-APIGW-API-KEY": cast(
+                SecretStr, self.clova_emb_apigw_api_key
+            ).get_secret_value(),
+            "Content-Type": "application/json",
+        }
+
+        # send request; the timeout keeps a stalled connection from hanging
+        # the caller forever
+        app_id = cast(SecretStr, self.app_id).get_secret_value()
+        response = requests.post(
+            f"{self.endpoint_url}/{self.model}/{app_id}",
+            headers=headers,
+            json=payload,
+            timeout=self.timeout,
+        )
+
+        # check for errors
+        if response.status_code != 200:
+            raise ValueError(
+                f"API request failed with status {response.status_code}: "
+                f"{response.text}"
+            )
+        try:
+            return response.json()["result"]["embedding"]
+        except (KeyError, TypeError, ValueError) as err:
+            raise ValueError(
+                f"API response did not contain an embedding: {response.text}"
+            ) from err
diff --git a/libs/community/tests/unit_tests/embeddings/test_imports.py b/libs/community/tests/unit_tests/embeddings/test_imports.py
index f059e525051..fbf40de973b 100644
--- a/libs/community/tests/unit_tests/embeddings/test_imports.py
+++ b/libs/community/tests/unit_tests/embeddings/test_imports.py
@@ -1,6 +1,7 @@
 from langchain_community.embeddings import __all__, _module_lookup
 
 EXPECTED_ALL = [
+    "ClovaEmbeddings",
     "OpenAIEmbeddings",
     "AnyscaleEmbeddings",
"AzureOpenAIEmbeddings",