community: Add Naver chat model & embeddings (#25162)

Reopened as a personal repo outside the organization.

## Description
- Naver HyperCLOVA X community package 
  - Add chat model & embeddings
  - Add unit test & integration test
  - Add chat model & embeddings docs
- I changed the partner
package (https://github.com/langchain-ai/langchain/pull/24252) to a
community package in this PR
- Could this
embeddings integration (https://github.com/langchain-ai/langchain/pull/21890) be
deprecated? We are replacing it with the embedding
model (**ClovaXEmbeddings**) in this PR.

Twitter handle: None. (If needed, contact
joonha.jeon@navercorp.com.)

---
You can check our previous discussion below:

> one question on namespaces - would it make sense to have these in
.clova namespaces instead of .naver?

I would like to keep it as is, unless it is essential to unify the
package name.
(ClovaX is a branding for the model, and I plan to add other models and
components. They need to be managed as separate classes.)

> also, could you clarify the difference between ClovaEmbeddings and
ClovaXEmbeddings?

There are 3 embedding models currently in service, and all of them are
supported in this PR. In addition, all the functionality of CLOVA
Studio that serves the actual models — such as distinguishing between test
apps and service apps — is supported. The existing PR does not support
this because those details are hard-coded.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Vadym Barda <vadym@langchain.dev>
This commit is contained in:
CLOVA Studio 개발
2024-10-25 05:54:13 +09:00
committed by GitHub
parent 6227396e20
commit 846a75284f
17 changed files with 1871 additions and 1 deletions

View File

@@ -0,0 +1,71 @@
"""Test ChatNaver chat model."""
from langchain_core.messages import AIMessage, AIMessageChunk
from langchain_community.chat_models import ChatClovaX
def test_stream() -> None:
    """Streaming should yield AIMessageChunk objects with string content."""
    chat = ChatClovaX()
    for chunk in chat.stream("I'm Clova"):
        assert isinstance(chunk, AIMessageChunk)
        assert isinstance(chunk.content, str)
async def test_astream() -> None:
    """Async streaming should yield AIMessageChunk objects with string content."""
    chat = ChatClovaX()
    async for chunk in chat.astream("I'm Clova"):
        assert isinstance(chunk, AIMessageChunk)
        assert isinstance(chunk.content, str)
async def test_abatch() -> None:
    """Test batch tokens from ChatClovaX via abatch."""
    # NOTE: the original docstring said "streaming", copy-pasted from
    # test_astream; this test exercises the async batch API.
    llm = ChatClovaX()
    result = await llm.abatch(["I'm Clova", "I'm not Clova"])
    for message in result:
        assert isinstance(message, AIMessage)
        assert isinstance(message.content, str)
async def test_abatch_tags() -> None:
    """abatch with a tagged config should still return AIMessages."""
    chat = ChatClovaX()
    prompts = ["I'm Clova", "I'm not Clova"]
    responses = await chat.abatch(prompts, config={"tags": ["foo"]})
    for response in responses:
        assert isinstance(response, AIMessage)
        assert isinstance(response.content, str)
def test_batch() -> None:
    """batch should return one AIMessage per prompt."""
    chat = ChatClovaX()
    responses = chat.batch(["I'm Clova", "I'm not Clova"])
    for response in responses:
        assert isinstance(response, AIMessage)
        assert isinstance(response.content, str)
async def test_ainvoke() -> None:
    """ainvoke with a tagged config should return a single AIMessage."""
    chat = ChatClovaX()
    response = await chat.ainvoke("I'm Clova", config={"tags": ["foo"]})
    assert isinstance(response, AIMessage)
    assert isinstance(response.content, str)
def test_invoke() -> None:
    """invoke with a tagged config should return a single AIMessage."""
    chat = ChatClovaX()
    response = chat.invoke("I'm Clova", config={"tags": ["foo"]})
    assert isinstance(response, AIMessage)
    assert isinstance(response.content, str)

View File

@@ -0,0 +1,37 @@
"""Test Naver embeddings."""
from langchain_community.embeddings import ClovaXEmbeddings
def test_embedding_documents() -> None:
    """Test ClovaX document embeddings (docstring previously said "cohere")."""
    documents = ["foo bar"]
    embedding = ClovaXEmbeddings()
    output = embedding.embed_documents(documents)
    # One vector per input document, each non-empty.
    assert len(output) == 1
    assert len(output[0]) > 0
async def test_aembedding_documents() -> None:
    """Test async ClovaX document embeddings (docstring previously said "cohere")."""
    documents = ["foo bar"]
    embedding = ClovaXEmbeddings()
    output = await embedding.aembed_documents(documents)
    # One vector per input document, each non-empty.
    assert len(output) == 1
    assert len(output[0]) > 0
def test_embedding_query() -> None:
    """Test ClovaX query embedding (docstring previously said "cohere")."""
    document = "foo bar"
    embedding = ClovaXEmbeddings()
    output = embedding.embed_query(document)
    assert len(output) > 0
async def test_aembedding_query() -> None:
    """Test async ClovaX query embedding (docstring previously said "cohere")."""
    document = "foo bar"
    embedding = ClovaXEmbeddings()
    output = await embedding.aembed_query(document)
    assert len(output) > 0

View File

@@ -6,6 +6,7 @@ EXPECTED_ALL = [
"ChatAnthropic",
"ChatAnyscale",
"ChatBaichuan",
"ChatClovaX",
"ChatCohere",
"ChatCoze",
"ChatDatabricks",

View File

@@ -0,0 +1,197 @@
"""Test chat model integration."""
import json
import os
from typing import Any, AsyncGenerator, Generator, cast
from unittest.mock import patch
import pytest
from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.messages import (
AIMessage,
HumanMessage,
SystemMessage,
)
from pydantic import SecretStr
from langchain_community.chat_models import ChatClovaX
from langchain_community.chat_models.naver import (
_convert_message_to_naver_chat_message,
_convert_naver_chat_message_to_message,
)
# Dummy credentials so ChatClovaX() can be constructed in unit tests
# without real CLOVA Studio keys.
os.environ["NCP_CLOVASTUDIO_API_KEY"] = "test_api_key"
os.environ["NCP_APIGW_API_KEY"] = "test_gw_key"
def test_initialization_api_key() -> None:
    """Constructor aliases should populate the secret key fields."""
    chat_model = ChatClovaX(api_key="foo", apigw_api_key="bar")  # type: ignore[arg-type]
    studio_key = cast(SecretStr, chat_model.ncp_clovastudio_api_key)
    gateway_key = cast(SecretStr, chat_model.ncp_apigw_api_key)
    assert studio_key.get_secret_value() == "foo"
    assert gateway_key.get_secret_value() == "bar"
def test_initialization_model_name() -> None:
    """Both the alias ``model`` and the field ``model_name`` set the model."""
    via_alias = ChatClovaX(model="HCX-DASH-001")  # type: ignore[call-arg]
    assert via_alias.model_name == "HCX-DASH-001"
    via_field = ChatClovaX(model_name="HCX-DASH-001")
    assert via_field.model_name == "HCX-DASH-001"
def test_convert_dict_to_message_human() -> None:
    """A user-role dict round-trips through the message converters."""
    naver_message = {"role": "user", "content": "foo"}
    expected = HumanMessage(content="foo")
    assert _convert_naver_chat_message_to_message(naver_message) == expected
    assert _convert_message_to_naver_chat_message(expected) == naver_message
def test_convert_dict_to_message_ai() -> None:
    """An assistant-role dict round-trips through the message converters."""
    naver_message = {"role": "assistant", "content": "foo"}
    expected = AIMessage(content="foo")
    assert _convert_naver_chat_message_to_message(naver_message) == expected
    assert _convert_message_to_naver_chat_message(expected) == naver_message
def test_convert_dict_to_message_system() -> None:
    """A system-role dict round-trips through the message converters."""
    naver_message = {"role": "system", "content": "foo"}
    expected = SystemMessage(content="foo")
    assert _convert_naver_chat_message_to_message(naver_message) == expected
    assert _convert_message_to_naver_chat_message(expected) == naver_message
@pytest.fixture
def mock_chat_completion_response() -> dict:
    """Canned CLOVA Studio chat-completion payload used by the invoke tests."""
    ai_filter = [
        {"groupName": "curse", "name": "insult", "score": "1"},
        {"groupName": "curse", "name": "discrimination", "score": "0"},
        {"groupName": "unsafeContents", "name": "sexualHarassment", "score": "2"},
    ]
    return {
        "status": {"code": "20000", "message": "OK"},
        "result": {
            "message": {
                "role": "assistant",
                "content": (
                    "Phrases: Record what happened today and prepare for "
                    "tomorrow. The diary will make your life richer."
                ),
            },
            "stopReason": "LENGTH",
            "inputLength": 100,
            "outputLength": 10,
            "aiFilter": ai_filter,
        },
    }
def test_naver_invoke(mock_chat_completion_response: dict) -> None:
    """invoke() should parse the mocked completion payload into an AIMessage."""
    llm = ChatClovaX()
    called = False

    def fake_completion(*args: Any, **kwargs: Any) -> Any:
        # Record that the network layer was (mock-)hit, then return the fixture.
        nonlocal called
        called = True
        return mock_chat_completion_response

    with patch.object(ChatClovaX, "_completion_with_retry", fake_completion):
        response = llm.invoke("Let's test it.")
        expected_content = (
            "Phrases: Record what happened today and prepare for tomorrow. "
            "The diary will make your life richer."
        )
        assert response.content == expected_content
    assert called
async def test_naver_ainvoke(mock_chat_completion_response: dict) -> None:
    """ainvoke() should parse the mocked completion payload into an AIMessage."""
    llm = ChatClovaX()
    called = False

    async def fake_acompletion(*args: Any, **kwargs: Any) -> Any:
        # Record that the async network layer was (mock-)hit, then return the fixture.
        nonlocal called
        called = True
        return mock_chat_completion_response

    with patch.object(ChatClovaX, "_acompletion_with_retry", fake_acompletion):
        response = await llm.ainvoke("Let's test it.")
        expected_content = (
            "Phrases: Record what happened today and prepare for tomorrow. "
            "The diary will make your life richer."
        )
        assert response.content == expected_content
    assert called
def _make_completion_response_from_token(token: str):  # type: ignore[no-untyped-def]
    """Wrap a single token in a CLOVA Studio SSE "token" event."""
    # Imported lazily so the module loads even without httpx_sse installed;
    # only tests marked requires("httpx_sse") call this helper.
    from httpx_sse import ServerSentEvent

    payload = {
        "index": 0,
        "inputLength": 89,
        "outputLength": 1,
        "message": {"content": token, "role": "assistant"},
    }
    return ServerSentEvent(event="token", data=json.dumps(payload))
def mock_chat_stream(*args: Any, **kwargs: Any) -> Generator:
    """Stand-in for _completion_with_retry that streams a fixed reply."""

    def events() -> Generator:
        for piece in ("Hello", " how", " can", " I", " help", "?"):
            yield _make_completion_response_from_token(piece)

    return events()
async def mock_chat_astream(*args: Any, **kwargs: Any) -> AsyncGenerator:
    """Stand-in for _acompletion_with_retry that streams a fixed reply."""

    async def events() -> AsyncGenerator:
        for piece in ("Hello", " how", " can", " I", " help", "?"):
            yield _make_completion_response_from_token(piece)

    return events()
class MyCustomHandler(BaseCallbackHandler):
    """Callback handler that remembers the most recent token it received."""

    # Most recent token passed to on_llm_new_token; read by the stream tests.
    last_token: str = ""

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        self.last_token = token
@patch(
    "langchain_community.chat_models.ChatClovaX._completion_with_retry",
    new=mock_chat_stream,
)
@pytest.mark.requires("httpx_sse")
def test_stream_with_callback() -> None:
    """Every streamed chunk should be delivered to the callback first."""
    handler = MyCustomHandler()
    chat = ChatClovaX(callbacks=[handler])
    for chunk in chat.stream("Hello"):
        assert handler.last_token == chunk.content
@patch(
    "langchain_community.chat_models.ChatClovaX._acompletion_with_retry",
    new=mock_chat_astream,
)
@pytest.mark.requires("httpx_sse")
async def test_astream_with_callback() -> None:
    """Every async-streamed chunk should be delivered to the callback first."""
    handler = MyCustomHandler()
    chat = ChatClovaX(callbacks=[handler])
    async for chunk in chat.astream("Hello"):
        assert handler.last_token == chunk.content

View File

@@ -7,6 +7,7 @@ EXPECTED_ALL = [
"AzureOpenAIEmbeddings",
"BaichuanTextEmbeddings",
"ClarifaiEmbeddings",
"ClovaXEmbeddings",
"CohereEmbeddings",
"DatabricksEmbeddings",
"ElasticsearchEmbeddings",

View File

@@ -0,0 +1,18 @@
"""Test embedding model integration."""
import os
from typing import cast
from pydantic import SecretStr
from langchain_community.embeddings import ClovaXEmbeddings
# Dummy credentials/app id so ClovaXEmbeddings() can be constructed in
# unit tests without real CLOVA Studio access.
os.environ["NCP_CLOVASTUDIO_API_KEY"] = "test_api_key"
os.environ["NCP_APIGW_API_KEY"] = "test_gw_key"
os.environ["NCP_CLOVASTUDIO_APP_ID"] = "test_app_id"
def test_initialization_api_key() -> None:
    """Constructor aliases should populate the secret key fields."""
    embeddings = ClovaXEmbeddings(api_key="foo", apigw_api_key="bar")  # type: ignore[arg-type]
    studio_key = cast(SecretStr, embeddings.ncp_clovastudio_api_key)
    gateway_key = cast(SecretStr, embeddings.ncp_apigw_api_key)
    assert studio_key.get_secret_value() == "foo"
    assert gateway_key.get_secret_value() == "bar"

View File

@@ -43,6 +43,7 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
"SQLAlchemy",
"aiohttp",
"dataclasses-json",
"httpx-sse",
"langchain-core",
"langsmith",
"numpy",