fix(ChatData):db metadata charset bug (#698)

1.fix db metadata charset bug
2.fmt
This commit is contained in:
FangYin Cheng 2023-10-19 21:23:07 +08:00 committed by GitHub
commit 9efc4d3167
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
32 changed files with 371 additions and 82 deletions

View File

@ -97,4 +97,10 @@ pip install langchain>=0.0.286
```commandline
pip install --use-pep517 fschat
```
```
##### Q9: alembic.util.exc.CommandError: Target database is not up to date.
delete files in `DB-GPT/pilot/meta_data/alembic/versions/` and reboot.
```commandline
rm -rf DB-GPT/pilot/meta_data/alembic/versions/*
```

View File

@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: DB-GPT 👏👏 0.3.5\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-26 17:47+0800\n"
"POT-Creation-Date: 2023-10-19 19:31+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
@ -20,12 +20,12 @@ msgstr ""
"Generated-By: Babel 2.12.1\n"
#: ../../getting_started/faq/deploy/deploy_faq.md:1
#: ca823e9d6d1d433db7ed15c8273e1b00
#: fb640f7c38744cbf996dcf7f73f325f6
msgid "Installation FAQ"
msgstr "Installation FAQ"
#: ../../getting_started/faq/deploy/deploy_faq.md:5
#: 3803d098c534434f9f513b3a62de54a4
#: 79fd80e469d14d608554d53a0e0ed2e3
#, fuzzy
msgid ""
"Q1: execute `pip install -e .` error, found some package cannot find "
@ -35,18 +35,18 @@ msgstr ""
"cannot find correct version."
#: ../../getting_started/faq/deploy/deploy_faq.md:6
#: b785864f47e643df9a4669d8da6167d6
#: f1f6e3291d1446b5bbcf744cd4c4e89a
msgid "change the pip source."
msgstr "替换pip源."
#: ../../getting_started/faq/deploy/deploy_faq.md:13
#: ../../getting_started/faq/deploy/deploy_faq.md:20
#: c41f026fb1464c71a45d0746c224ecce f70fb69b568d4fc4ad4c4731b2032eaf
#: 68e1b39a08774a81b9061cc5205e4c1c dd34901f446749e998cd34ec5b6c44f4
msgid "or"
msgstr "或者"
#: ../../getting_started/faq/deploy/deploy_faq.md:27
#: d179e3d695764f838dc354eb0d978bb3
#: 0899f0e28dae443b8f912d96c797b79c
msgid ""
"Q2: sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) unable to"
" open database file"
@ -55,86 +55,97 @@ msgstr ""
" open database file"
#: ../../getting_started/faq/deploy/deploy_faq.md:29
#: 55174e8d247a414e8c6c8861d4707a55
#: 3e60d8190e49436b8c40b34a67b7bfb3
msgid "make sure you pull latest code or create directory with mkdir pilot/data"
msgstr "make sure you pull latest code or create directory with mkdir pilot/data"
#: ../../getting_started/faq/deploy/deploy_faq.md:31
#: dbce9e9cae734a5083a6f0fc28bce7cd
#: baeaae20238842d3b8e4ae5b337198e5
msgid "Q3: The model keeps getting killed."
msgstr "Q3: The model keeps getting killed."
#: ../../getting_started/faq/deploy/deploy_faq.md:33
#: 2de5648d2e7546bf85f20f4162003298
#: eb3936307ad64b19b73483ff9ae126f2
msgid ""
"your GPU VRAM size is not enough, try replace your hardware or replace "
"other llms."
msgstr "GPU显存不够, 增加显存或者换一个显存小的模型"
#: ../../getting_started/faq/deploy/deploy_faq.md:35
#: 47810771cd364964b9b5b8fd85bca4ee
#: f6dba770717041699c73b4cd00d48aad
msgid "Q4: How to access website on the public network"
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:37
#: e8c5bac6680648509d528ea6aaf5994e
#: 447d9e9374de44bab6d8a03f2c936676
msgid ""
"You can try to use gradio's [network](https://github.com/gradio-"
"app/gradio/blob/main/gradio/networking.py) to achieve."
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:48
#: bb75ec127f574c00a09d92d5206e9357
#: 5e34dd4dfcf34feeb1815dfa974041d0
msgid "Open `url` with your browser to see the website."
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:50
#: 5fdb87b84bd94385a1a93dab8d41ebe8
#: aaef774ce6124021a3862bc0a25d465f
msgid "Q5: (Windows) execute `pip install -e .` error"
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:52
#: 31eef51e044044f29f3ad08defa9c305
#: ec3945df451c4ec2b32ebb476f45c82b
msgid "The error log like the following:"
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:71
#: aaba0c3060b443e4b9877f70d78321ce
#: 1df09f6d9f9b4c1a8a32d6e271e5ee39
msgid ""
"Download and install `Microsoft C++ Build Tools` from [visual-cpp-build-"
"tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)"
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:75
#: 4c8137546e5c4240884f7ea6d9d922bf
#: 251f47bfa5694242a1c9d81a2022b7a0
msgid "Q6: `Torch not compiled with CUDA enabled`"
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:82
#: 01daf14f8c494219b1d9a5af4449951e
#: bc9dfdfc47924a0e8d3ec535e23bf923
msgid "Install [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive)"
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:83
#: c75e6371911e4d5ca6859e51501c9679
#: b5a632baa42745bdbee5d6ba516d8d8b
msgid ""
"Reinstall PyTorch [start-locally](https://pytorch.org/get-started/locally"
"/#start-locally) with CUDA support."
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:85
#: 7cfb9003e505445ebb9ed3d015e184e2
#: 0092fb91642749f5a55b629017c0de6a
msgid "Q7: ImportError: cannot import name 'PersistentClient' from 'chromadb'."
msgstr "Q7: ImportError: cannot import name 'PersistentClient' from 'chromadb'."
#: ../../getting_started/faq/deploy/deploy_faq.md:91
#: e1d5d5d85ddc480d8d81f7b550848cbf
#: 4aa87418f2a54c138bf3b7ff28a7e776
msgid ""
"Q8: pydantic.error_wrappers.ValidationError:1 validation error for "
"HuggingFaceEmbeddings.model_kwargs extra not permitted"
msgstr "Q8: pydantic.error_wrappers.ValidationError:1 validation error for "
msgstr ""
"Q8: pydantic.error_wrappers.ValidationError:1 validation error for "
"HuggingFaceEmbeddings.model_kwargs extra not permitted"
#: ../../getting_started/faq/deploy/deploy_faq.md:102
#: 6b690ab272af44f6b126cfe5ce1435ef
msgid "Q9: alembic.util.exc.CommandError: Target database is not up to date."
msgstr ""
#: ../../getting_started/faq/deploy/deploy_faq.md:103
#: 223026d3b9124363b695937922d8f8d5
msgid "delete files in `DB-GPT/pilot/meta_data/alembic/versions/` and reboot."
msgstr "删除`DB-GPT/pilot/meta_data/alembic/versions/`目录下文件"
#~ msgid ""
#~ "Q2: When use Mysql, Access denied "
#~ "for user 'root@localhost'(using password :NO)"

View File

@ -9,7 +9,10 @@ from pilot.base_modules.meta_data.meta_data import Base, engine, session
class MyPluginEntity(Base):
__tablename__ = "my_plugin"
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
id = Column(Integer, primary_key=True, comment="autoincrement id")
tenant = Column(String(255), nullable=True, comment="user's tenant")
user_code = Column(String(255), nullable=False, comment="user code")
@ -27,7 +30,7 @@ class MyPluginEntity(Base):
created_at = Column(
DateTime, default=datetime.utcnow, comment="plugin install time"
)
__table_args__ = (UniqueConstraint("user_code", "name", name="uk_name"),)
UniqueConstraint("user_code", "name", name="uk_name")
class MyPluginDao(BaseDao[MyPluginEntity]):

View File

@ -1,7 +1,7 @@
from datetime import datetime
import pytz
from typing import List
from sqlalchemy import Column, Integer, String, Index, DateTime, func, Boolean
from sqlalchemy import Column, Integer, String, Index, DateTime, func, Boolean, DDL
from sqlalchemy import UniqueConstraint
from pilot.base_modules.meta_data.meta_data import Base
@ -9,8 +9,15 @@ from pilot.base_modules.meta_data.base_dao import BaseDao
from pilot.base_modules.meta_data.meta_data import Base, engine, session
char_set_sql = DDL("ALTER TABLE plugin_hub CONVERT TO CHARACTER SET utf8mb4")
class PluginHubEntity(Base):
__tablename__ = "plugin_hub"
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
id = Column(
Integer, primary_key=True, autoincrement=True, comment="autoincrement id"
)
@ -26,10 +33,8 @@ class PluginHubEntity(Base):
created_at = Column(DateTime, default=datetime.utcnow, comment="plugin upload time")
installed = Column(Integer, default=False, comment="plugin already installed count")
__table_args__ = (
UniqueConstraint("name", name="uk_name"),
Index("idx_q_type", "type"),
)
UniqueConstraint("name", name="uk_name")
Index("idx_q_type", "type")
class PluginHubDao(BaseDao[PluginHubEntity]):

View File

@ -52,7 +52,18 @@ class RDBMSDatabase(BaseConnect):
custom_table_info: Optional[dict] = None,
view_support: bool = False,
):
"""Create engine from database URI."""
"""Create engine from database URI.
Args:
- engine: Engine sqlalchemy.engine
- schema: Optional[str].
- metadata: Optional[MetaData]
- ignore_tables: Optional[List[str]]
- include_tables: Optional[List[str]]
- sample_rows_in_table_info: int default:3,
- indexes_in_table_info: bool = False,
- custom_table_info: Optional[dict] = None,
- view_support: bool = False,
"""
self._engine = engine
self._schema = schema
if include_tables and ignore_tables:
@ -92,6 +103,15 @@ class RDBMSDatabase(BaseConnect):
engine_args: Optional[dict] = None,
**kwargs: Any,
) -> RDBMSDatabase:
"""Construct a SQLAlchemy engine from uri database.
Args:
host (str): database host.
port (int): database port.
user (str): database user.
pwd (str): database password.
db_name (str): database name.
engine_args (Optional[dict]):other engine_args.
"""
db_url: str = (
cls.driver
+ "://"

View File

@ -21,7 +21,13 @@ class CSVEmbedding(SourceEmbedding):
source_reader: Optional = None,
text_splitter: Optional[TextSplitter] = None,
):
"""Initialize with csv path."""
"""Initialize with csv path.
Args:
- file_path: data source path
- vector_store_config: vector store config params.
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
"""
super().__init__(
file_path, vector_store_config, source_reader=None, text_splitter=None
)

View File

@ -28,7 +28,16 @@ class EmbeddingEngine:
text_splitter: Optional[TextSplitter] = None,
embedding_factory: EmbeddingFactory = None,
):
"""Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source"""
"""Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source
Args:
- model_name: model_name
- vector_store_config: vector store config: Dict
- knowledge_type: Optional[KnowledgeType]
- knowledge_source: Optional[str]
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
- embedding_factory: EmbeddingFactory
"""
self.knowledge_source = knowledge_source
self.model_name = model_name
self.vector_store_config = vector_store_config
@ -65,6 +74,11 @@ class EmbeddingEngine:
)
def similar_search(self, text, topk):
"""vector db similar search
Args:
- text: query text
- topk: top k
"""
vector_client = VectorStoreConnector(
self.vector_store_config["vector_store_type"], self.vector_store_config
)
@ -75,12 +89,17 @@ class EmbeddingEngine:
return ans
def vector_exist(self):
"""vector db is exist"""
vector_client = VectorStoreConnector(
self.vector_store_config["vector_store_type"], self.vector_store_config
)
return vector_client.vector_name_exists()
def delete_by_ids(self, ids):
"""delete vector db by ids
Args:
- ids: vector ids
"""
vector_client = VectorStoreConnector(
self.vector_store_config["vector_store_type"], self.vector_store_config
)

View File

@ -23,7 +23,13 @@ class PDFEmbedding(SourceEmbedding):
source_reader: Optional = None,
text_splitter: Optional[TextSplitter] = None,
):
"""Initialize pdf word path."""
"""Initialize pdf word path.
Args:
- file_path: data source path
- vector_store_config: vector store config params.
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
"""
super().__init__(
file_path, vector_store_config, source_reader=None, text_splitter=None
)

View File

@ -23,7 +23,13 @@ class PPTEmbedding(SourceEmbedding):
source_reader: Optional = None,
text_splitter: Optional[TextSplitter] = None,
):
"""Initialize ppt word path."""
"""Initialize ppt word path.
Args:
- file_path: data source path
- vector_store_config: vector store config params.
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
"""
super().__init__(
file_path, vector_store_config, source_reader=None, text_splitter=None
)

View File

@ -29,7 +29,14 @@ class SourceEmbedding(ABC):
text_splitter: Optional[TextSplitter] = None,
embedding_args: Optional[Dict] = None,
):
"""Initialize with Loader url, model_name, vector_store_config"""
"""Initialize with Loader url, model_name, vector_store_config
Args:
- file_path: data source path
- vector_store_config: vector store config params.
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
- embedding_args: Optional
"""
self.file_path = file_path
self.vector_store_config = vector_store_config
self.source_reader = source_reader or None
@ -44,21 +51,33 @@ class SourceEmbedding(ABC):
@register
def data_process(self, text):
"""pre process data."""
"""pre process data.
Args:
- text: raw text
"""
@register
def text_splitter(self, text_splitter: TextSplitter):
"""add text split chunk"""
"""add text split chunk
Args:
- text_splitter: TextSplitter
"""
pass
@register
def text_to_vector(self, docs):
"""transform vector"""
"""transform vector
Args:
- docs: List[Document]
"""
pass
@register
def index_to_store(self, docs):
"""index to vector store"""
"""index to vector store
Args:
- docs: List[Document]
"""
self.vector_client = VectorStoreConnector(
self.vector_store_config["vector_store_type"], self.vector_store_config
)
@ -66,7 +85,10 @@ class SourceEmbedding(ABC):
@register
def similar_search(self, doc, topk):
"""vector store similarity_search"""
"""vector store similarity_search
Args:
- doc: query text
- topk: top k results
"""
self.vector_client = VectorStoreConnector(
self.vector_store_config["vector_store_type"], self.vector_store_config
)
@ -82,6 +104,7 @@ class SourceEmbedding(ABC):
return self.vector_client.vector_name_exists()
def source_embedding(self):
"""read()->data_process()->text_split()->index_to_store()"""
if "read" in registered_methods:
text = self.read()
if "data_process" in registered_methods:

View File

@ -20,7 +20,13 @@ class StringEmbedding(SourceEmbedding):
source_reader: Optional = None,
text_splitter: Optional[TextSplitter] = None,
):
"""Initialize raw text word path."""
"""Initialize raw text word path.
Args:
- file_path: data source path
- vector_store_config: vector store config params.
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
"""
super().__init__(
file_path=file_path,
vector_store_config=vector_store_config,

View File

@ -22,7 +22,13 @@ class URLEmbedding(SourceEmbedding):
source_reader: Optional = None,
text_splitter: Optional[TextSplitter] = None,
):
"""Initialize url word path."""
"""Initialize url word path.
Args:
- file_path: data source path
- vector_store_config: vector store config params.
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
"""
super().__init__(
file_path, vector_store_config, source_reader=None, text_splitter=None
)

View File

@ -23,7 +23,13 @@ class WordEmbedding(SourceEmbedding):
source_reader: Optional = None,
text_splitter: Optional[TextSplitter] = None,
):
"""Initialize with word path."""
"""Initialize with word path.
Args:
- file_path: data source path
- vector_store_config: vector store config params.
- source_reader: Optional[BaseLoader]
- text_splitter: Optional[TextSplitter]
"""
super().__init__(
file_path, vector_store_config, source_reader=None, text_splitter=None
)

View File

@ -10,6 +10,10 @@ class ChatHistoryEntity(Base):
id = Column(
Integer, primary_key=True, autoincrement=True, comment="autoincrement id"
)
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
conv_uid = Column(
String(255),
unique=False,
@ -21,12 +25,10 @@ class ChatHistoryEntity(Base):
user_name = Column(String(255), nullable=True, comment="interlocutor")
messages = Column(Text, nullable=True, comment="Conversation details")
__table_args__ = (
UniqueConstraint("conv_uid", name="uk_conversation"),
Index("idx_q_user", "user_name"),
Index("idx_q_mode", "chat_mode"),
Index("idx_q_conv", "summary"),
)
UniqueConstraint("conv_uid", name="uk_conversation")
Index("idx_q_user", "user_name")
Index("idx_q_mode", "chat_mode")
Index("idx_q_conv", "summary")
class ChatHistoryDao(BaseDao[ChatHistoryEntity]):

View File

@ -9,6 +9,10 @@ from pilot.openapi.api_v1.feedback.feed_back_model import FeedBackBody
class ChatFeedBackEntity(Base):
__tablename__ = "chat_feed_back"
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
id = Column(Integer, primary_key=True)
conv_uid = Column(String(128))
conv_index = Column(Integer)

View File

@ -21,6 +21,12 @@ CFG = Config()
class BaseChat(ABC):
"""DB-GPT Chat Service Base Module
Include:
stream_call():scene + prompt -> stream response
nostream_call():scene + prompt -> nostream response
"""
chat_scene: str = None
llm_model: Any = None
# By default, keep the last two rounds of conversation records as the context
@ -32,6 +38,14 @@ class BaseChat(ABC):
arbitrary_types_allowed = True
def __init__(self, chat_param: Dict):
"""Chat Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) select param
"""
self.chat_session_id = chat_param["chat_session_id"]
self.chat_mode = chat_param["chat_mode"]
self.current_user_input: str = chat_param["current_user_input"]

View File

@ -18,10 +18,20 @@ logger = logging.getLogger("chat_agent")
class ChatAgent(BaseChat):
"""Chat With Agent through plugin"""
chat_scene: str = ChatScene.ChatAgent.value()
chat_retention_rounds = 0
def __init__(self, chat_param: Dict):
"""Chat Agent Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) agent plugin
"""
if not chat_param["select_param"]:
raise ValueError("Please select a Plugin!")
self.select_plugins = chat_param["select_param"].split(",")

View File

@ -19,10 +19,17 @@ CFG = Config()
class ChatDashboard(BaseChat):
chat_scene: str = ChatScene.ChatDashboard.value()
report_name: str
"""Number of results to return from the query"""
"""Chat Dashboard to generate dashboard chart"""
def __init__(self, chat_param: Dict):
""" """
"""Chat Dashboard Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) dbname
"""
self.db_name = chat_param["select_param"]
chat_param["chat_mode"] = ChatScene.ChatDashboard
super().__init__(chat_param=chat_param)

View File

@ -19,10 +19,20 @@ CFG = Config()
class ChatExcel(BaseChat):
"""a Excel analyzer to analyze Excel Data"""
chat_scene: str = ChatScene.ChatExcel.value()
chat_retention_rounds = 1
def __init__(self, chat_param: Dict):
"""Chat Excel Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) file path
"""
chat_mode = ChatScene.ChatExcel
self.select_param = chat_param["select_param"]

View File

@ -15,6 +15,14 @@ class ChatWithDbAutoExecute(BaseChat):
"""Number of results to return from the query"""
def __init__(self, chat_param: Dict):
"""Chat Data Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) dbname
"""
chat_mode = ChatScene.ChatWithDbExecute
self.db_name = chat_param["select_param"]
chat_param["chat_mode"] = chat_mode
@ -31,6 +39,9 @@ class ChatWithDbAutoExecute(BaseChat):
self.top_k: int = 200
def generate_input_values(self):
"""
generate input values
"""
try:
from pilot.summary.db_summary_client import DBSummaryClient
except ImportError:

View File

@ -12,10 +12,17 @@ CFG = Config()
class ChatWithDbQA(BaseChat):
chat_scene: str = ChatScene.ChatWithDbQA.value()
"""Number of results to return from the query"""
"""As a DBA, Chat DB Module, chat with combine DB meta schema """
def __init__(self, chat_param: Dict):
""" """
"""Chat DB Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) dbname
"""
self.db_name = chat_param["select_param"]
chat_param["chat_mode"] = ChatScene.ChatWithDbQA
super().__init__(chat_param=chat_param)

View File

@ -11,11 +11,21 @@ CFG = Config()
class ChatWithPlugin(BaseChat):
"""Chat With Plugin"""
chat_scene: str = ChatScene.ChatExecution.value()
plugins_prompt_generator: PluginPromptGenerator
select_plugin: str = None
def __init__(self, chat_param: Dict):
"""Chat Dashboard Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) plugin selector
"""
self.plugin_selector = chat_param["select_param"]
chat_param["chat_mode"] = ChatScene.ChatExecution
super().__init__(chat_param=chat_param)

View File

@ -19,10 +19,17 @@ CFG = Config()
class ChatKnowledge(BaseChat):
chat_scene: str = ChatScene.ChatKnowledge.value()
"""Number of results to return from the query"""
"""KBQA Chat Module"""
def __init__(self, chat_param: Dict):
""" """
"""Chat Knowledge Module Initialization
Args:
- chat_param: Dict
- chat_session_id: (str) chat session_id
- current_user_input: (str) current user input
- model_name:(str) llm model name
- select_param:(str) space name
"""
from pilot.embedding_engine.embedding_engine import EmbeddingEngine
from pilot.embedding_engine.embedding_factory import EmbeddingFactory

View File

@ -21,6 +21,7 @@ def signal_handler(sig, frame):
def async_db_summary(system_app: SystemApp):
"""async db schema into vector db"""
from pilot.summary.db_summary_client import DBSummaryClient
client = DBSummaryClient(system_app=system_app)

View File

@ -115,6 +115,9 @@ def _get_webserver_params(args: List[str] = None):
def initialize_app(param: WebWerverParameters = None, args: List[str] = None):
"""Initialize app
If you use gunicorn as a process manager, initialize_app can be invoked in the `on_starting` hook.
Args:
param:WebWerverParameters
args:List[str]
"""
if not param:
param = _get_webserver_params(args)

View File

@ -12,6 +12,10 @@ CFG = Config()
class DocumentChunkEntity(Base):
__tablename__ = "document_chunk"
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
id = Column(Integer, primary_key=True)
document_id = Column(Integer)
doc_name = Column(String(100))

View File

@ -11,6 +11,10 @@ CFG = Config()
class KnowledgeDocumentEntity(Base):
__tablename__ = "knowledge_document"
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
id = Column(Integer, primary_key=True)
doc_name = Column(String(100))
doc_type = Column(String(100))
@ -24,6 +28,8 @@ class KnowledgeDocumentEntity(Base):
gmt_created = Column(DateTime)
gmt_modified = Column(DateTime)
__table_args__ = {"mysql_charset": "utf8mb4"}
def __repr__(self):
return f"KnowledgeDocumentEntity(id={self.id}, doc_name='{self.doc_name}', doc_type='{self.doc_type}', chunk_size='{self.chunk_size}', status='{self.status}', last_sync='{self.last_sync}', content='{self.content}', result='{self.result}', gmt_created='{self.gmt_created}', gmt_modified='{self.gmt_modified}')"

View File

@ -57,12 +57,21 @@ class SyncStatus(Enum):
# @singleton
class KnowledgeService:
"""KnowledgeService
Knowledge Management Service:
-knowledge_space management
-knowledge_document management
-embedding management
"""
def __init__(self):
pass
"""create knowledge space"""
def create_knowledge_space(self, request: KnowledgeSpaceRequest):
"""create knowledge space
Args:
- request: KnowledgeSpaceRequest
"""
query = KnowledgeSpaceEntity(
name=request.name,
)
@ -72,9 +81,11 @@ class KnowledgeService:
knowledge_space_dao.create_knowledge_space(request)
return True
"""create knowledge document"""
def create_knowledge_document(self, space, request: KnowledgeDocumentRequest):
"""create knowledge document
Args:
- request: KnowledgeDocumentRequest
"""
query = KnowledgeDocumentEntity(doc_name=request.doc_name, space=space)
documents = knowledge_document_dao.get_knowledge_documents(query)
if len(documents) > 0:
@ -91,9 +102,11 @@ class KnowledgeService:
)
return knowledge_document_dao.create_knowledge_document(document)
"""get knowledge space"""
def get_knowledge_space(self, request: KnowledgeSpaceRequest):
"""get knowledge space
Args:
- request: KnowledgeSpaceRequest
"""
query = KnowledgeSpaceEntity(
name=request.name, vector_type=request.vector_type, owner=request.owner
)
@ -116,6 +129,10 @@ class KnowledgeService:
return responses
def arguments(self, space_name):
"""show knowledge space arguments
Args:
- space_name: Knowledge Space Name
"""
query = KnowledgeSpaceEntity(name=space_name)
spaces = knowledge_space_dao.get_knowledge_space(query)
if len(spaces) != 1:
@ -128,6 +145,11 @@ class KnowledgeService:
return json.loads(context)
def argument_save(self, space_name, argument_request: SpaceArgumentRequest):
"""save argument
Args:
- space_name: Knowledge Space Name
- argument_request: SpaceArgumentRequest
"""
query = KnowledgeSpaceEntity(name=space_name)
spaces = knowledge_space_dao.get_knowledge_space(query)
if len(spaces) != 1:
@ -136,9 +158,12 @@ class KnowledgeService:
space.context = argument_request.argument
return knowledge_space_dao.update_knowledge_space(space)
"""get knowledge get_knowledge_documents"""
def get_knowledge_documents(self, space, request: DocumentQueryRequest):
"""get knowledge documents
Args:
- space: Knowledge Space Name
- request: DocumentQueryRequest
"""
query = KnowledgeDocumentEntity(
doc_name=request.doc_name,
doc_type=request.doc_type,
@ -153,9 +178,12 @@ class KnowledgeService:
res.page = request.page
return res
"""sync knowledge document chunk into vector store"""
def sync_knowledge_document(self, space_name, sync_request: DocumentSyncRequest):
"""sync knowledge document chunk into vector store
Args:
- space: Knowledge Space Name
- sync_request: DocumentSyncRequest
"""
from pilot.embedding_engine.embedding_engine import EmbeddingEngine
from pilot.embedding_engine.embedding_factory import EmbeddingFactory
from pilot.embedding_engine.pre_text_splitter import PreTextSplitter
@ -249,11 +277,6 @@ class KnowledgeService:
doc.chunk_size = len(chunk_docs)
doc.gmt_modified = datetime.now()
knowledge_document_dao.update_knowledge_document(doc)
# async doc embeddings
# thread = threading.Thread(
# target=self.async_doc_embedding, args=(client, chunk_docs, doc)
# )
# thread.start()
executor = CFG.SYSTEM_APP.get_component(
ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
).create()
@ -277,16 +300,21 @@ class KnowledgeService:
return True
"""update knowledge space"""
def update_knowledge_space(
self, space_id: int, space_request: KnowledgeSpaceRequest
):
"""update knowledge space
Args:
- space_id: space id
- space_request: KnowledgeSpaceRequest
"""
knowledge_space_dao.update_knowledge_space(space_id, space_request)
"""delete knowledge space"""
def delete_space(self, space_name: str):
"""delete knowledge space
Args:
- space_name: knowledge space name
"""
query = KnowledgeSpaceEntity(name=space_name)
spaces = knowledge_space_dao.get_knowledge_space(query)
if len(spaces) == 0:
@ -312,6 +340,11 @@ class KnowledgeService:
return knowledge_space_dao.delete_knowledge_space(space)
def delete_document(self, space_name: str, doc_name: str):
"""delete document
Args:
- space_name: knowledge space name
- doc_name: document name
"""
document_query = KnowledgeDocumentEntity(doc_name=doc_name, space=space_name)
documents = knowledge_document_dao.get_documents(document_query)
if len(documents) != 1:
@ -332,9 +365,11 @@ class KnowledgeService:
# delete document
return knowledge_document_dao.delete(document_query)
"""get document chunks"""
def get_document_chunks(self, request: ChunkQueryRequest):
"""get document chunks
Args:
- request: ChunkQueryRequest
"""
query = DocumentChunkEntity(
id=request.id,
document_id=request.document_id,
@ -350,6 +385,12 @@ class KnowledgeService:
return res
def async_doc_embedding(self, client, chunk_docs, doc):
"""async document embedding into vector db
Args:
- client: EmbeddingEngine Client
- chunk_docs: List[Document]
- doc: doc
"""
logger.info(
f"async_doc_embedding, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to vector store-{CFG.VECTOR_STORE_TYPE}"
)
@ -391,6 +432,10 @@ class KnowledgeService:
return context_template_string
def get_space_context(self, space_name):
"""get space contect
Args:
- space_name: space name
"""
request = KnowledgeSpaceRequest()
request.name = space_name
spaces = self.get_knowledge_space(request)

View File

@ -12,6 +12,10 @@ CFG = Config()
class KnowledgeSpaceEntity(Base):
__tablename__ = "knowledge_space"
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
id = Column(Integer, primary_key=True)
name = Column(String(100))
vector_type = Column(String(100))

View File

@ -16,6 +16,9 @@ CFG = Config()
model_path = LLM_MODEL_CONFIG.get(CFG.LLM_MODEL)
if __name__ == "__main__":
"""run llm server including controller, manager worker
    If you use gunicorn as a process manager, initialize_app can be invoked in the `on_starting` hook.
"""
run_worker_manager(
model_name=CFG.LLM_MODEL,
model_path=model_path,

View File

@ -13,6 +13,10 @@ CFG = Config()
class PromptManageEntity(Base):
__tablename__ = "prompt_manage"
__table_args__ = {
"mysql_charset": "utf8mb4",
"mysql_collate": "utf8mb4_unicode_ci",
}
id = Column(Integer, primary_key=True)
chat_scene = Column(String(100))
sub_chat_scene = Column(String(100))

View File

@ -14,7 +14,11 @@ class VectorStoreConnector:
"""
def __init__(self, vector_store_type, ctx: {}) -> None:
"""initialize vector store connector."""
"""initialize vector store connector.
Args:
- vector_store_type: vector store type Milvus, Chroma, Weaviate
- ctx: vector store config params.
"""
self.ctx = ctx
self._register()
@ -30,20 +34,30 @@ class VectorStoreConnector:
"""load document in vector database."""
return self.client.load_document(docs)
def similar_search(self, docs, topk):
"""similar search in vector database."""
return self.client.similar_search(docs, topk)
def similar_search(self, doc: str, topk: int):
"""similar search in vector database.
Args:
- doc: query text
- topk: topk
"""
return self.client.similar_search(doc, topk)
def vector_name_exists(self):
"""is vector store name exist."""
return self.client.vector_name_exists()
def delete_vector_name(self, vector_name):
"""vector store delete"""
"""vector store delete
Args:
- vector_name: vector store name
"""
return self.client.delete_vector_name(vector_name)
def delete_by_ids(self, ids):
"""vector store delete by ids."""
"""vector store delete by ids.
Args:
- ids: vector ids
"""
return self.client.delete_by_ids(ids=ids)
def _match(self, vector_store_type) -> bool: