feat(core): Support i18n (#1327)

This commit is contained in:
Fangyin Cheng
2024-03-25 20:15:39 +08:00
committed by GitHub
parent fa06be64c1
commit fcc325d411
179 changed files with 12052 additions and 69512 deletions

View File

@@ -10,7 +10,7 @@ from .embedding import ( # noqa: F401
EmbeddingRetrieverOperator,
)
from .evaluation import RetrieverEvaluatorOperator # noqa: F401
from .knowledge import KnowledgeOperator # noqa: F401
from .knowledge import ChunksToStringOperator, KnowledgeOperator # noqa: F401
from .rerank import RerankOperator # noqa: F401
from .rewrite import QueryRewriteOperator # noqa: F401
from .summary import SummaryAssemblerOperator # noqa: F401
@@ -22,6 +22,7 @@ __all__ = [
"EmbeddingRetrieverOperator",
"EmbeddingAssemblerOperator",
"KnowledgeOperator",
"ChunksToStringOperator",
"RerankOperator",
"QueryRewriteOperator",
"SummaryAssemblerOperator",

View File

@@ -4,8 +4,10 @@ from functools import reduce
from typing import List, Optional, Union
from dbgpt.core import Chunk
from dbgpt.core.awel.flow import IOField, OperatorCategory, Parameter, ViewMetadata
from dbgpt.core.interface.operators.retriever import RetrieverOperator
from dbgpt.storage.vector_store.connector import VectorStoreConnector
from dbgpt.util.i18n_utils import _
from ..assembler.embedding import EmbeddingAssembler
from ..chunk_manager import ChunkParameters
@@ -19,6 +21,71 @@ from .assembler import AssemblerOperator
class EmbeddingRetrieverOperator(RetrieverOperator[Union[str, List[str]], List[Chunk]]):
"""The Embedding Retriever Operator."""
metadata = ViewMetadata(
label=_("Embedding Retriever Operator"),
name="embedding_retriever_operator",
description=_("Retrieve candidates from vector store."),
category=OperatorCategory.RAG,
parameters=[
Parameter.build_from(
_("Vector Store Connector"),
"vector_store_connector",
VectorStoreConnector,
description=_("The vector store connector."),
),
Parameter.build_from(
_("Top K"),
"top_k",
int,
description=_("The number of candidates."),
),
Parameter.build_from(
_("Score Threshold"),
"score_threshold",
float,
description=_(
"The score threshold, if score of candidate is less than it, it "
"will be filtered."
),
optional=True,
default=0.3,
),
Parameter.build_from(
_("Query Rewrite"),
"query_rewrite",
QueryRewrite,
description=_("The query rewrite resource."),
optional=True,
default=None,
),
Parameter.build_from(
_("Rerank"),
"rerank",
Ranker,
description=_("The rerank."),
optional=True,
default=None,
),
],
inputs=[
IOField.build_from(
_("Query"),
"query",
str,
description=_("The query to retrieve."),
)
],
outputs=[
IOField.build_from(
_("Candidates"),
"candidates",
Chunk,
description=_("The retrieved candidates."),
is_list=True,
)
],
)
def __init__(
self,
vector_store_connector: VectorStoreConnector,
@@ -53,12 +120,52 @@ class EmbeddingRetrieverOperator(RetrieverOperator[Union[str, List[str]], List[C
class EmbeddingAssemblerOperator(AssemblerOperator[Knowledge, List[Chunk]]):
"""The Embedding Assembler Operator."""
metadata = ViewMetadata(
label=_("Embedding Assembler Operator"),
name="embedding_assembler_operator",
description=_("Load knowledge and assemble embedding chunks to vector store."),
category=OperatorCategory.RAG,
parameters=[
Parameter.build_from(
_("Vector Store Connector"),
"vector_store_connector",
VectorStoreConnector,
description=_("The vector store connector."),
),
Parameter.build_from(
_("Chunk Parameters"),
"chunk_parameters",
ChunkParameters,
description=_("The chunk parameters."),
optional=True,
default=None,
),
],
inputs=[
IOField.build_from(
_("Knowledge"),
"knowledge",
Knowledge,
description=_("The knowledge to be loaded."),
)
],
outputs=[
IOField.build_from(
_("Chunks"),
"chunks",
Chunk,
description=_(
"The assembled chunks, it has been persisted to vector " "store."
),
is_list=True,
)
],
)
def __init__(
self,
vector_store_connector: VectorStoreConnector,
chunk_parameters: Optional[ChunkParameters] = ChunkParameters(
chunk_strategy="CHUNK_BY_SIZE"
),
chunk_parameters: Optional[ChunkParameters] = None,
**kwargs
):
"""Create a new EmbeddingAssemblerOperator.
@@ -68,6 +175,8 @@ class EmbeddingAssemblerOperator(AssemblerOperator[Knowledge, List[Chunk]]):
chunk_parameters (Optional[ChunkParameters], optional): The chunk
parameters. Defaults to ChunkParameters(chunk_strategy="CHUNK_BY_SIZE").
"""
if not chunk_parameters:
chunk_parameters = ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
self._chunk_parameters = chunk_parameters
self._vector_store_connector = vector_store_connector
super().__init__(**kwargs)

View File

@@ -1,7 +1,8 @@
"""Knowledge Operator."""
from typing import Optional
from typing import List, Optional
from dbgpt.core import Chunk
from dbgpt.core.awel import MapOperator
from dbgpt.core.awel.flow import (
IOField,
@@ -12,44 +13,47 @@ from dbgpt.core.awel.flow import (
)
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
from dbgpt.rag.knowledge.factory import KnowledgeFactory
from dbgpt.util.i18n_utils import _
class KnowledgeOperator(MapOperator[str, Knowledge]):
"""Knowledge Factory Operator."""
metadata = ViewMetadata(
label="Knowledge Factory Operator",
label=_("Knowledge Operator"),
name="knowledge_operator",
category=OperatorCategory.RAG,
description="The knowledge operator.",
description=_(
    "The knowledge operator, which can create knowledge from datasource."
),
inputs=[
IOField.build_from(
"knowledge datasource",
_("knowledge datasource"),
"knowledge datasource",
str,
"knowledge datasource",
_("knowledge datasource, which can be a document, url, or text."),
)
],
outputs=[
IOField.build_from(
"Knowledge",
_("Knowledge"),
"Knowledge",
Knowledge,
description="Knowledge",
description=_("Knowledge object."),
)
],
parameters=[
Parameter.build_from(
label="datasource",
label=_("Default datasource"),
name="datasource",
type=str,
optional=True,
default="DOCUMENT",
description="datasource",
default=None,
description=_("Default datasource."),
),
Parameter.build_from(
label="knowledge_type",
name="knowledge type",
label=_("Knowledge type"),
name="knowledge_type",
type=str,
optional=True,
options=[
@@ -64,7 +68,7 @@ class KnowledgeOperator(MapOperator[str, Knowledge]):
),
],
default=KnowledgeType.DOCUMENT.name,
description="knowledge type",
description=_("Knowledge type."),
),
],
documentation_url="https://github.com/openai/openai-python",
@@ -92,3 +96,50 @@ class KnowledgeOperator(MapOperator[str, Knowledge]):
return await self.blocking_func_to_async(
KnowledgeFactory.create, datasource, self._knowledge_type
)
class ChunksToStringOperator(MapOperator[List[Chunk], str]):
    """Operator that concatenates a list of chunks into one string.

    Joins the ``content`` field of every input :class:`Chunk` using a
    configurable separator (newline by default).
    """

    metadata = ViewMetadata(
        label=_("Chunks To String Operator"),
        name="chunks_to_string_operator",
        description=_("Convert chunks to string."),
        category=OperatorCategory.RAG,
        parameters=[
            Parameter.build_from(
                _("Separator"),
                "separator",
                str,
                description=_("The separator between chunks."),
                optional=True,
                default="\n",
            )
        ],
        inputs=[
            IOField.build_from(
                _("Chunks"),
                "chunks",
                Chunk,
                description=_("The input chunks."),
                is_list=True,
            )
        ],
        outputs=[
            IOField.build_from(
                _("String"),
                "string",
                str,
                description=_("The output string."),
            )
        ],
    )

    def __init__(self, separator: str = "\n", **kwargs):
        """Initialize the operator.

        Args:
            separator (str): String inserted between chunk contents.
        """
        self._separator = separator
        super().__init__(**kwargs)

    async def map(self, chunks: List[Chunk]) -> str:
        """Join the content of all chunks with the configured separator."""
        contents = (c.content for c in chunks)
        return self._separator.join(contents)

View File

@@ -6,58 +6,61 @@ from dbgpt.core import LLMClient
from dbgpt.core.awel import MapOperator
from dbgpt.core.awel.flow import IOField, OperatorCategory, Parameter, ViewMetadata
from dbgpt.rag.retriever.rewrite import QueryRewrite
from dbgpt.util.i18n_utils import _
class QueryRewriteOperator(MapOperator[dict, Any]):
"""The Rewrite Operator."""
metadata = ViewMetadata(
label="Query Rewrite Operator",
label=_("Query Rewrite Operator"),
name="query_rewrite_operator",
category=OperatorCategory.RAG,
description="query rewrite operator.",
description=_("Query rewrite operator."),
inputs=[
IOField.build_from("query_context", "query_context", dict, "query context")
IOField.build_from(
_("Query context"), "query_context", dict, _("query context")
)
],
outputs=[
IOField.build_from(
"rewritten queries",
_("Rewritten queries"),
"queries",
str,
is_list=True,
description="rewritten queries",
description=_("Rewritten queries"),
)
],
parameters=[
Parameter.build_from(
"LLM Client",
_("LLM Client"),
"llm_client",
LLMClient,
description="The LLM Client.",
description=_("The LLM Client."),
),
Parameter.build_from(
label="model name",
label=_("Model name"),
name="model_name",
type=str,
optional=True,
default="gpt-3.5-turbo",
description="llm model name",
description=_("LLM model name."),
),
Parameter.build_from(
label="prompt language",
label=_("Prompt language"),
name="language",
type=str,
optional=True,
default="en",
description="prompt language",
description=_("Prompt language."),
),
Parameter.build_from(
label="nums",
label=_("Number of results"),
name="nums",
type=int,
optional=True,
default=5,
description="rewrite query nums",
description=_("Number of rewritten queries."),
),
],
documentation_url="https://github.com/openai/openai-python",

View File

@@ -7,24 +7,25 @@ from dbgpt.core.awel.flow import IOField, OperatorCategory, Parameter, ViewMetad
from dbgpt.rag.assembler.summary import SummaryAssembler
from dbgpt.rag.knowledge.base import Knowledge
from dbgpt.rag.operators.assembler import AssemblerOperator
from dbgpt.util.i18n_utils import _
class SummaryAssemblerOperator(AssemblerOperator[Any, Any]):
"""The summary assembler operator."""
metadata = ViewMetadata(
label="Summary Operator",
label=_("Summary Operator"),
name="summary_assembler_operator",
category=OperatorCategory.RAG,
description="The summary assembler operator.",
description=_("The summary assembler operator."),
inputs=[
IOField.build_from(
"Knowledge", "knowledge", Knowledge, "knowledge datasource"
_("Knowledge"), "knowledge", Knowledge, _("Knowledge datasource")
)
],
outputs=[
IOField.build_from(
"document summary",
_("Document summary"),
"summary",
str,
description="document summary",
@@ -32,44 +33,44 @@ class SummaryAssemblerOperator(AssemblerOperator[Any, Any]):
],
parameters=[
Parameter.build_from(
"LLM Client",
_("LLM Client"),
"llm_client",
LLMClient,
optional=True,
default=None,
description="The LLM Client.",
description=_("The LLM Client."),
),
Parameter.build_from(
label="model name",
label=_("Model name"),
name="model_name",
type=str,
optional=True,
default="gpt-3.5-turbo",
description="llm model name",
description=_("LLM model name"),
),
Parameter.build_from(
label="prompt language",
label=_("prompt language"),
name="language",
type=str,
optional=True,
default="en",
description="prompt language",
description=_("prompt language"),
),
Parameter.build_from(
label="max_iteration_with_llm",
label=_("Max iteration with LLM"),
name="max_iteration_with_llm",
type=int,
optional=True,
default=5,
description="prompt language",
description=_("The maximum number of iterations with the LLM."),
),
Parameter.build_from(
label="concurrency_limit_with_llm",
label=_("Concurrency limit with LLM"),
name="concurrency_limit_with_llm",
type=int,
optional=True,
default=3,
description="The concurrency limit with llm",
description=_("The concurrency limit with llm"),
),
],
documentation_url="https://github.com/openai/openai-python",