feat(core): Support i18n (#1327)

This commit is contained in:
Fangyin Cheng
2024-03-25 20:15:39 +08:00
committed by GitHub
parent fa06be64c1
commit fcc325d411
179 changed files with 12052 additions and 69512 deletions

View File

@@ -3,10 +3,13 @@
import logging
import os
from abc import ABC, abstractmethod
from typing import Any, Optional, Type
from typing import Any, List, Optional, Type
from dbgpt.component import BaseComponent, SystemApp
from dbgpt.core import Embeddings
from dbgpt.core.awel import DAGVar
from dbgpt.core.awel.flow import ResourceCategory, register_resource
from dbgpt.util.i18n_utils import _
logger = logging.getLogger(__name__)
@@ -221,3 +224,45 @@ class WrappedEmbeddingFactory(EmbeddingFactory):
if embedding_cls:
raise NotImplementedError
return self._model
@register_resource(
    label=_("Default Embeddings"),
    name="default_embeddings",
    category=ResourceCategory.EMBEDDINGS,
    description=_(
        "Default embeddings(using default embedding model of current system)"
    ),
)
class DefaultEmbeddings(Embeddings):
    """Embeddings that delegate to the model built by an embedding factory.

    When no factory is supplied, one is looked up from the current system app
    the first time the embeddings are needed.
    """

    def __init__(self, embedding_factory: Optional[EmbeddingFactory] = None) -> None:
        """Store the optional factory; it may be resolved later on demand.

        Args:
            embedding_factory: Factory used to create the underlying
                embeddings. If omitted, it is resolved on first use.
        """
        self._embedding_factory = embedding_factory

    @property
    def embeddings(self) -> Embeddings:
        """Return the embeddings created by the embedding factory.

        The factory's ``create()`` is invoked on every access. If no factory
        has been set yet, it is obtained from the current system app and
        remembered on this instance.

        Raises:
            ValueError: If no factory is set and there is no current system
                app.
        """
        factory = self._embedding_factory
        if factory:
            return factory.create()

        # No factory yet: fall back to the one registered on the system app.
        current_app = DAGVar.get_current_system_app()
        if not current_app:
            raise ValueError("System app is not initialized")
        factory = EmbeddingFactory.get_instance(current_app)
        self._embedding_factory = factory
        return factory.create()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents via the underlying embeddings."""
        delegate = self.embeddings
        return delegate.embed_documents(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query text via the underlying embeddings."""
        delegate = self.embeddings
        return delegate.embed_query(text)

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Asynchronously embed a list of documents."""
        delegate = self.embeddings
        vectors = await delegate.aembed_documents(texts)
        return vectors

    async def aembed_query(self, text: str) -> List[float]:
        """Asynchronously embed a single query text."""
        delegate = self.embeddings
        vector = await delegate.aembed_query(text)
        return vector

View File

@@ -8,6 +8,8 @@ import requests
from dbgpt._private.pydantic import BaseModel, Extra, Field
from dbgpt.core import Embeddings
from dbgpt.core.awel.flow import Parameter, ResourceCategory, register_resource
from dbgpt.util.i18n_utils import _
DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
@@ -22,6 +24,23 @@ DEFAULT_QUERY_BGE_INSTRUCTION_EN = (
DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:"
@register_resource(
_("HuggingFace Embeddings"),
"huggingface_embeddings",
category=ResourceCategory.EMBEDDINGS,
description=_("HuggingFace sentence_transformers embedding models."),
parameters=[
Parameter.build_from(
_("Model Name"),
"model_name",
str,
optional=True,
default=DEFAULT_MODEL_NAME,
description=_("Model name to use."),
),
# TODO, support more parameters
],
)
class HuggingFaceEmbeddings(BaseModel, Embeddings):
"""HuggingFace sentence_transformers embedding models.
@@ -112,6 +131,38 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
return self.embed_documents([text])[0]
@register_resource(
_("HuggingFace Instructor Embeddings"),
"huggingface_instructor_embeddings",
category=ResourceCategory.EMBEDDINGS,
description=_("HuggingFace Instructor embeddings."),
parameters=[
Parameter.build_from(
_("Model Name"),
"model_name",
str,
optional=True,
default=DEFAULT_INSTRUCT_MODEL,
description=_("Model name to use."),
),
Parameter.build_from(
_("Embed Instruction"),
"embed_instruction",
str,
optional=True,
default=DEFAULT_EMBED_INSTRUCTION,
description=_("Instruction to use for embedding documents."),
),
Parameter.build_from(
_("Query Instruction"),
"query_instruction",
str,
optional=True,
default=DEFAULT_QUERY_INSTRUCTION,
description=_("Instruction to use for embedding query."),
),
],
)
class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
"""Wrapper around sentence_transformers embedding models.
@@ -192,6 +243,7 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
return embedding.tolist()
# TODO: Support AWEL flow
class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
"""HuggingFace BGE sentence_transformers embedding models.
@@ -280,6 +332,28 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
return embedding.tolist()
@register_resource(
_("HuggingFace Inference API Embeddings"),
"huggingface_inference_api_embeddings",
category=ResourceCategory.EMBEDDINGS,
description=_("HuggingFace Inference API embeddings."),
parameters=[
Parameter.build_from(
_("API Key"),
"api_key",
str,
description=_("Your API key for the HuggingFace Inference API."),
),
Parameter.build_from(
_("Model Name"),
"model_name",
str,
optional=True,
default="sentence-transformers/all-MiniLM-L6-v2",
description=_("The name of the model to use for text embeddings."),
),
],
)
class HuggingFaceInferenceAPIEmbeddings(BaseModel, Embeddings):
"""Embed texts using the HuggingFace API.
@@ -371,6 +445,28 @@ def _handle_request_result(res: requests.Response) -> List[List[float]]:
return [result["embedding"] for result in sorted_embeddings]
@register_resource(
_("Jina AI Embeddings"),
"jina_embeddings",
category=ResourceCategory.EMBEDDINGS,
description=_("Jina AI embeddings."),
parameters=[
Parameter.build_from(
_("API Key"),
"api_key",
str,
description=_("Your API key for the Jina AI API."),
),
Parameter.build_from(
_("Model Name"),
"model_name",
str,
optional=True,
default="jina-embeddings-v2-base-en",
description=_("The name of the model to use for text embeddings."),
),
],
)
class JinaEmbeddings(BaseModel, Embeddings):
"""Jina AI embeddings.
@@ -431,6 +527,46 @@ class JinaEmbeddings(BaseModel, Embeddings):
return self.embed_documents([text])[0]
@register_resource(
_("OpenAPI Embeddings"),
"openapi_embeddings",
category=ResourceCategory.EMBEDDINGS,
description=_("OpenAPI embeddings."),
parameters=[
Parameter.build_from(
_("API URL"),
"api_url",
str,
optional=True,
default="http://localhost:8100/api/v1/embeddings",
description=_("The URL of the embeddings API."),
),
Parameter.build_from(
_("API Key"),
"api_key",
str,
optional=True,
default=None,
description=_("Your API key for the Open API."),
),
Parameter.build_from(
_("Model Name"),
"model_name",
str,
optional=True,
default="text2vec",
description=_("The name of the model to use for text embeddings."),
),
Parameter.build_from(
_("Timeout"),
"timeout",
int,
optional=True,
default=60,
description=_("The timeout for the request in seconds."),
),
],
)
class OpenAPIEmbeddings(BaseModel, Embeddings):
"""The OpenAPI embeddings.