From 6a8ee918344fd959595b442766522bc80a40faae Mon Sep 17 00:00:00 2001 From: xuyuan23 <643854343@qq.com> Date: Tue, 13 Jun 2023 15:58:24 +0800 Subject: [PATCH 1/5] add plugin_env file, define plugin config strategy. --- .gitignore | 1 + .plugin_env.template | 14 ++++++++++++++ pilot/configs/__init__.py | 1 + 3 files changed, 16 insertions(+) create mode 100644 .plugin_env.template diff --git a/.gitignore b/.gitignore index 82fa7fe62..faebd410a 100644 --- a/.gitignore +++ b/.gitignore @@ -145,3 +145,4 @@ pilot/nltk_data logswebserver.log.* .history/* +.plugin_env \ No newline at end of file diff --git a/.plugin_env.template b/.plugin_env.template new file mode 100644 index 000000000..92d73a1bf --- /dev/null +++ b/.plugin_env.template @@ -0,0 +1,14 @@ +#################################################################################### +## [DB-GPT-Bytebase-Plugin] ### +##################################################################################### +HOST_NAME={your-host-ip, to execute command operate} +HOST_USER=root +HOST_PASSWORD={your-host-password} +SSH_PORT=22 + +BYTE_BASE_COOKIE={your-bytebase-cookie} +BYTE_BASE_DOMAIN={your-bytebase-server-address} +BYTE_BASE_DEFAULT_DEV_INSTANCE=mysql_dev +BYTE_BASE_DEFAULT_TEST_INSTANCE=mysql_test +BYTE_BASE_DEFAULT_PROD_INSTANCE=mysql_prod +DEFAULT_PROJECT_NAME={your-default-project} diff --git a/pilot/configs/__init__.py b/pilot/configs/__init__.py index 909f8bf4b..44f901bbd 100644 --- a/pilot/configs/__init__.py +++ b/pilot/configs/__init__.py @@ -10,5 +10,6 @@ if "pytest" in sys.argv or "pytest" in sys.modules or os.getenv("CI"): # Load the users .env file into environment variables load_dotenv(verbose=True, override=True) +load_dotenv(".plugin_env") del load_dotenv From 24457dc286b60c3a4ad4b854bc8726ab1bcdef5a Mon Sep 17 00:00:00 2001 From: xuyuan23 <643854343@qq.com> Date: Tue, 13 Jun 2023 17:42:11 +0800 Subject: [PATCH 2/5] reformat file proxy_llm.py --- pilot/model/llm_out/proxy_llm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pilot/model/llm_out/proxy_llm.py b/pilot/model/llm_out/proxy_llm.py index 6dd1bfc2b..e0ec78dd9 100644 --- a/pilot/model/llm_out/proxy_llm.py +++ b/pilot/model/llm_out/proxy_llm.py @@ -66,7 +66,7 @@ def proxyllm_generate_stream(model, tokenizer, params, device, context_len=2048) "messages": history, "temperature": params.get("temperature"), "max_tokens": params.get("max_new_tokens"), - "stream": True + "stream": True, } res = requests.post( @@ -76,12 +76,12 @@ def proxyllm_generate_stream(model, tokenizer, params, device, context_len=2048) text = "" for line in res.iter_lines(): if line: - json_data = line.split(b': ', 1)[1] + json_data = line.split(b": ", 1)[1] decoded_line = json_data.decode("utf-8") - if decoded_line.lower() != '[DONE]'.lower(): + if decoded_line.lower() != "[DONE]".lower(): obj = json.loads(json_data) - if obj['choices'][0]['delta'].get('content') is not None: - content = obj['choices'][0]['delta']['content'] + if obj["choices"][0]["delta"].get("content") is not None: + content = obj["choices"][0]["delta"]["content"] text += content yield text @@ -104,4 +104,4 @@ def proxyllm_generate_stream(model, tokenizer, params, device, context_len=2048) # json_line = json.loads(decoded_line) # print(json_line) # text += json_line["choices"][0]["message"]["content"] - # yield text \ No newline at end of file + # yield text From acc2c5806c01ca8626bd3ec636f6d097b14ed730 Mon Sep 17 00:00:00 2001 From: xuyuan23 <643854343@qq.com> Date: Tue, 13 Jun 2023 17:45:49 +0800 Subject: [PATCH 3/5] remove comment code --- pilot/model/llm_out/proxy_llm.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/pilot/model/llm_out/proxy_llm.py b/pilot/model/llm_out/proxy_llm.py index e0ec78dd9..717311778 100644 --- a/pilot/model/llm_out/proxy_llm.py +++ b/pilot/model/llm_out/proxy_llm.py @@ -84,24 +84,3 @@ def proxyllm_generate_stream(model, tokenizer, params, device, context_len=2048) content = obj["choices"][0]["delta"]["content"] text += content yield text - - # native result. - # payloads = { - # "model": "gpt-3.5-turbo", # just for test, remove this later - # "messages": history, - # "temperature": params.get("temperature"), - # "max_tokens": params.get("max_new_tokens"), - # } - # - # res = requests.post( - # CFG.proxy_server_url, headers=headers, json=payloads, stream=True - # ) - # - # text = "" - # line = res.content - # if line: - # decoded_line = line.decode("utf-8") - # json_line = json.loads(decoded_line) - # print(json_line) - # text += json_line["choices"][0]["message"]["content"] - # yield text From 669e5cc3dcdef8edb9ae5d7ae494c7b015235b7b Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Tue, 13 Jun 2023 19:20:19 +0800 Subject: [PATCH 4/5] update:db chat --- README.md | 2 +- docs/getting_started/concepts.md | 2 - docs/getting_started/tutorials.md | 13 ++++- .../LC_MESSAGES/getting_started/tutorials.po | 58 ++++++++++++++----- .../zh_CN/LC_MESSAGES/modules/knownledge.po | 50 +++++++++------- pilot/language/lang_content_mapping.py | 2 +- 6 files changed, 87 insertions(+), 40 deletions(-) delete mode 100644 docs/getting_started/concepts.md diff --git a/README.md b/README.md index 46c433528..d35a50e1c 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Currently, we have released multiple key features, which are listed below to dem - Automatic execution of SQL and retrieval of query results - Automatic crawling and learning of knowledge - Unified vector storage/indexing of knowledge base - - Support for unstructured data such as PDF, Markdown, CSV, and WebURL + - Support for unstructured data such as PDF, TXT, Markdown, CSV, DOC, PPT, and WebURL - Milti LLMs Support - Supports multiple large language models, currently supporting Vicuna (7b, 13b), ChatGLM-6b (int4, int8), guanaco(7b,13b,33b), Gorilla(7b,13b) diff --git a/docs/getting_started/concepts.md b/docs/getting_started/concepts.md deleted file mode 100644 index e834417d3..000000000 --- a/docs/getting_started/concepts.md +++ /dev/null @@ -1,2 +0,0 @@ -# Concepts - diff --git a/docs/getting_started/tutorials.md b/docs/getting_started/tutorials.md index 99ac15a51..4c2245996 100644 --- a/docs/getting_started/tutorials.md +++ b/docs/getting_started/tutorials.md @@ -3,6 +3,8 @@ This is a collection of DB-GPT tutorials on Medium. +DB-GPT is divided into several functions, including chat with knowledge base, execute SQL, chat with database, and execute plugins. + ###Introduce [What is DB-GPT](https://www.youtube.com/watch?v=QszhVJerc0I) by csunny (https://github.com/csunny/DB-GPT): @@ -12,5 +14,12 @@ This is a collection of DB-GPT tutorials on Medium. [Add new Knowledge demonstration](../../assets/new_knownledge_en.gif) -### DB Plugins -[db plugins demonstration](../../assets/auto_sql_en.gif) \ No newline at end of file +### SQL Generation +[sql generation demonstration](../../assets/demo_en.gif) + +### SQL Execute +[sql execute demonstration](../../assets/auto_sql_en.gif) + + +### Plugins +[db plugins demonstration](../../assets/dbgpt_bytebase_plugin.gif) \ No newline at end of file diff --git a/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po b/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po index 989630ab1..b00cd631a 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po +++ b/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: DB-GPT 0.1.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-06-13 11:38+0800\n" +"POT-Creation-Date: 2023-06-13 18:04+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -27,33 +27,65 @@ msgstr "教程" msgid "This is a collection of DB-GPT tutorials on Medium." msgstr "这是知乎上DB-GPT教程的集合。." -#: ../../getting_started/tutorials.md:6 3915395cc45742519bf0c607eeafc489 +#: ../../getting_started/tutorials.md:6 1c8db33581ea4928905e029a98b9a155 +msgid "" +"DB-GPT is divided into several functions, including chat with knowledge " +"base, execute SQL, chat with database, and execute plugins." +msgstr "" + +#: ../../getting_started/tutorials.md:8 3915395cc45742519bf0c607eeafc489 +#, fuzzy msgid "" "###Introduce [What is DB-" "GPT](https://www.youtube.com/watch?v=QszhVJerc0I) by csunny " -"(https://github.com/csunny/DB-GPT)" -msgstr "###Introduce [什么是DB-GPT](https://www.bilibili.com/video/BV1SM4y1a7Nj/?buvid=551b023900b290f9497610b2155a2668&is_story_h5=false&mid=%2BVyE%2Fwau5woPcUKieCWS0A%3D%3D&p=1&plat_id=116&share_from=ugc&share_medium=iphone&share_plat=ios&share_session_id=5D08B533-82A4-4D40-9615-7826065B4574&share_source=GENERIC&share_tag=s_i×tamp=1686307943&unique_k=bhO3lgQ&up_id=31375446) by csunny (https://github.com/csunny/DB-GPT)" +"(https://github.com/csunny/DB-GPT):" +msgstr "" +"###Introduce [什么是DB-" +"GPT](https://www.bilibili.com/video/BV1SM4y1a7Nj/?buvid=551b023900b290f9497610b2155a2668&is_story_h5=false&mid=%2BVyE%2Fwau5woPcUKieCWS0A%3D%3D&p=1&plat_id=116&share_from=ugc&share_medium=iphone&share_plat=ios&share_session_id=5D08B533-82A4-4D40-9615-7826065B4574&share_source=GENERIC&share_tag=s_i×tamp=1686307943&unique_k=bhO3lgQ&up_id=31375446)" +" by csunny (https://github.com/csunny/DB-GPT)" -#: ../../getting_started/tutorials.md:9 e213736923574b2cb039a457d789c27c +#: ../../getting_started/tutorials.md:11 e213736923574b2cb039a457d789c27c msgid "Knowledge" msgstr "知识库" -#: ../../getting_started/tutorials.md:11 90b5472735a644168d51c054ed882748 +#: ../../getting_started/tutorials.md:13 90b5472735a644168d51c054ed882748 msgid "" "[How to Create your own knowledge repository](https://db-" "gpt.readthedocs.io/en/latest/modules/knownledge.html)" -msgstr "[怎么创建自己的知识库](https://db-" +msgstr "" +"[怎么创建自己的知识库](https://db-" "gpt.readthedocs.io/en/latest/modules/knownledge.html)" -#: ../../getting_started/tutorials.md:13 6a851e1e88ea4bcbaf7ee742a12224ef +#: ../../getting_started/tutorials.md:15 6a851e1e88ea4bcbaf7ee742a12224ef msgid "[Add new Knowledge demonstration](../../assets/new_knownledge_en.gif)" msgstr "[新增知识库演示](../../assets/new_knownledge_en.gif)" -#: ../../getting_started/tutorials.md:15 4487ef393e004e7c936f5104727212a4 -msgid "DB Plugins" +#: ../../getting_started/tutorials.md:17 59887be89d8046e28956f909fcbbc9dc +msgid "SQL Generation" +msgstr "" + +#: ../../getting_started/tutorials.md:18 ee5decd8441d40ae8a240a19c1a5a74a +#, fuzzy +msgid "[sql generation demonstration](../../assets/demo_en.gif)" +msgstr "[sql生成演示](../../assets/demo_en.gif)" + +#: ../../getting_started/tutorials.md:20 5d25c5d307c24c9198f2b52e70f2421c +msgid "SQL Execute" +msgstr "SQL执行" + +#: ../../getting_started/tutorials.md:21 ee5decd8441d40ae8a240a19c1a5a74a +#, fuzzy +msgid "[sql execute demonstration](../../assets/auto_sql_en.gif)" +msgstr "[sql execute 演示](../../assets/auto_sql_en.gif)" + + +#: ../../getting_started/tutorials.md:26 4487ef393e004e7c936f5104727212a4 +#, fuzzy +msgid "Plugins" msgstr "DB Plugins" -#: ../../getting_started/tutorials.md:16 ee5decd8441d40ae8a240a19c1a5a74a -msgid "[db plugins demonstration](../../assets/auto_sql_en.gif)" -msgstr "[db plugins 演示](../../assets/auto_sql_en.gif)" +#: ../../getting_started/tutorials.md:27 ee5decd8441d40ae8a240a19c1a5a74a +#, fuzzy +msgid "[db plugins demonstration](../../assets/dbgpt_bytebase_plugin.gif)" +msgstr "[db plugins 演示](../../assets/dbgpt_bytebase_plugin.gif)" diff --git a/docs/locales/zh_CN/LC_MESSAGES/modules/knownledge.po b/docs/locales/zh_CN/LC_MESSAGES/modules/knownledge.po index fd37a7565..d85aba16d 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/modules/knownledge.po +++ b/docs/locales/zh_CN/LC_MESSAGES/modules/knownledge.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: DB-GPT 0.1.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-06-11 14:10+0800\n" +"POT-Creation-Date: 2023-06-13 18:04+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,73 +17,81 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.11.0\n" +"Generated-By: Babel 2.12.1\n" -#: ../../modules/knownledge.md:1 ac3aa55568c0414a821a42aeed509ab2 +#: ../../modules/knownledge.md:1 8c5aad32a2cc4c97bc988a1f4143097b msgid "Knownledge" msgstr "知识" -#: ../../modules/knownledge.md:3 1d57e3d2d790437ea54730477c67fdfb +#: ../../modules/knownledge.md:3 d739696a9e6240c78db3906d55329636 msgid "" "As the knowledge base is currently the most significant user demand " "scenario, we natively support the construction and processing of " "knowledge bases. At the same time, we also provide multiple knowledge " "base management strategies in this project, such as:" -msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。" -"同时,我们还在本项目中提供了多种知识库管理策略,如:" +msgstr "由于知识库是当前用户需求最显著的场景,我们原生支持知识库的构建和处理。同时,我们还在本项目中提供了多种知识库管理策略,如:" -#: ../../modules/knownledge.md:4 784708fc19334742b73549d92a21ed32 +#: ../../modules/knownledge.md:4 16e03ee1cd454786a736b6960c668c3a msgid "Default built-in knowledge base" msgstr "默认内置知识库" -#: ../../modules/knownledge.md:5 c65ccfabe79348c09e6fc13a10774ffd +#: ../../modules/knownledge.md:5 b5c57a8c773b40d18e344862adf7790e msgid "Custom addition of knowledge bases" msgstr "自定义新增知识库" -#: ../../modules/knownledge.md:6 fc8fded3e3634edfbe6001d9ea1add90 +#: ../../modules/knownledge.md:6 7c05adacafe34780a73fa2bc6748f92f msgid "" "Various usage scenarios such as constructing knowledge bases through " "plugin capabilities and web crawling. Users only need to organize the " "knowledge documents, and they can use our existing capabilities to build " "the knowledge base required for the large model." -msgstr "各种使用场景,例如通过插件功能和爬虫构建知识库。用户只需要组织知识文档," -"并且他们可以使用我们现有的功能来构建大型模型所需的知识库。" +msgstr "各种使用场景,例如通过插件功能和爬虫构建知识库。用户只需要组织知识文档,并且他们可以使用我们现有的功能来构建大型模型所需的知识库。" -#: ../../modules/knownledge.md:9 2fa8ae0edeef4380ab60c43754d93c93 +#: ../../modules/knownledge.md:9 8b196a2a9efb435baf648a99d89e1220 msgid "Create your own knowledge repository" msgstr "创建你自己的知识库" -#: ../../modules/knownledge.md:11 13dc4cea806e42c4887c45bbd84fb063 +#: ../../modules/knownledge.md:11 370071fde98c4c59bb18735364602adf msgid "" "1.Place personal knowledge files or folders in the pilot/datasets " "directory." msgstr "1.将个人知识文件或文件夹放在pilot/datasets目录中。" -#: ../../modules/knownledge.md:13 8dbf51249c9d47749e3fedbf9886479b +#: ../../modules/knownledge.md:13 5ac32a1253c4433e87d64dccb2c8b600 +msgid "" +"We currently support many document formats: txt, pdf, md, html, doc, ppt," +" and url." +msgstr "当前支持txt, pdf, md, doc, ppt, html文档格式" + +#: ../../modules/knownledge.md:15 1782a135e84f4e9f8cb090f8af935428 +msgid "before execution:" +msgstr "在执行之前" + +#: ../../modules/knownledge.md:22 43791873b7e043239e160790bbfc10e1 msgid "" "2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma " "(now only support Chroma and Milvus, if you set Milvus, please set " "MILVUS_URL and MILVUS_PORT)" -msgstr "2.更新你的.env,设置你的向量存储类型,VECTOR_STORE_TYPE=Chroma(现在只支持" -"Chroma和Milvus,如果你设置了Milvus,请设置MILVUS_URL和MILVUS_PORT)" +msgstr "2.更新你的.env,设置你的向量存储类型,VECTOR_STORE_TYPE=Chroma(现在只支持Chroma和Milvus,如果你设置了Milvus,请设置MILVUS_URL和MILVUS_PORT)" -#: ../../modules/knownledge.md:16 e03cce8ad3b14100b8bb22dd98ea49ae +#: ../../modules/knownledge.md:25 197e043db45e444e9c930f29be808f31 msgid "2.Run the knowledge repository script in the tools directory." msgstr "2.在tools目录执行知识入库脚本" -#: ../../modules/knownledge.md:26 a2919580cc324820b1217e31c8b22203 +#: ../../modules/knownledge.md:34 abeb77ed400c4838b2ca8e14dcd89b29 msgid "" "3.Add the knowledge repository in the interface by entering the name of " "your knowledge repository (if not specified, enter \"default\") so you " "can use it for Q&A based on your knowledge base." msgstr "如果选择新增知识库,在界面上新增知识库输入你的知识库名" -#: ../../modules/knownledge.md:28 236317becbb042f2acbf66c499a3b984 +#: ../../modules/knownledge.md:36 dcff9efafd9d441b91c1389af2a49780 msgid "" "Note that the default vector model used is text2vec-large-chinese (which " "is a large model, so if your personal computer configuration is not " "enough, it is recommended to use text2vec-base-chinese). Therefore, " "ensure that you download the model and place it in the models directory." -msgstr "注意,这里默认向量模型是text2vec-large-chinese(模型比较大,如果个人电脑" -"配置不够建议采用text2vec-base-chinese),因此确保需要将模型download下来放到models目录中。" +msgstr "" +"注意,这里默认向量模型是text2vec-large-chinese(模型比较大,如果个人电脑配置不够建议采用text2vec-base-" +"chinese),因此确保需要将模型download下来放到models目录中。" diff --git a/pilot/language/lang_content_mapping.py b/pilot/language/lang_content_mapping.py index e2ea8b4cc..bccc01224 100644 --- a/pilot/language/lang_content_mapping.py +++ b/pilot/language/lang_content_mapping.py @@ -44,7 +44,7 @@ lang_dicts = { "learn_more_markdown": "The service is a research preview intended for non-commercial use only. subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of Vicuna-13B", "model_control_param": "Model Parameters", "sql_generate_mode_direct": "Execute directly", - "sql_generate_mode_none": "chat to db", + "sql_generate_mode_none": "db chat", "max_input_token_size": "Maximum output token size", "please_choose_database": "Please choose database", "sql_generate_diagnostics": "SQL Generation & Diagnostics", From 38ca7c1cb207b9bdf3667e3654e06eecc15256eb Mon Sep 17 00:00:00 2001 From: aries-ckt <916701291@qq.com> Date: Tue, 13 Jun 2023 19:50:10 +0800 Subject: [PATCH 5/5] fix:knowledge init --- pilot/scene/chat_knowledge/default/chat.py | 17 +++++++++++------ tools/knowlege_init.py | 1 - 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pilot/scene/chat_knowledge/default/chat.py b/pilot/scene/chat_knowledge/default/chat.py index 325b03783..3f21b828d 100644 --- a/pilot/scene/chat_knowledge/default/chat.py +++ b/pilot/scene/chat_knowledge/default/chat.py @@ -1,3 +1,5 @@ +from chromadb.errors import NoIndexException + from pilot.scene.base_chat import BaseChat, logger, headers from pilot.scene.base import ChatScene from pilot.common.sql_database import Database @@ -46,12 +48,15 @@ class ChatDefaultKnowledge(BaseChat): ) def generate_input_values(self): - docs = self.knowledge_embedding_client.similar_search( - self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE - ) - context = [d.page_content for d in docs] - context = context[:2000] - input_values = {"context": context, "question": self.current_user_input} + try: + docs = self.knowledge_embedding_client.similar_search( + self.current_user_input, CFG.KNOWLEDGE_SEARCH_TOP_SIZE + ) + context = [d.page_content for d in docs] + context = context[:2000] + input_values = {"context": context, "question": self.current_user_input} + except NoIndexException: + raise ValueError("you have no default knowledge store, please execute python knowledge_init.py") return input_values def do_with_prompt_response(self, prompt_response): diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py index 26338df1c..c9a0c5457 100644 --- a/tools/knowlege_init.py +++ b/tools/knowlege_init.py @@ -43,7 +43,6 @@ if __name__ == "__main__": parser.add_argument("--vector_name", type=str, default="default") args = parser.parse_args() vector_name = args.vector_name - append_mode = args.append store_type = CFG.VECTOR_STORE_TYPE vector_store_config = {"vector_store_name": vector_name} print(vector_store_config)