diff --git a/README.md b/README.md
index 3a7d670cb..46c433528 100644
--- a/README.md
+++ b/README.md
@@ -179,6 +179,14 @@ In the .env configuration file, modify the LANGUAGE parameter to switch between
 
 1.Place personal knowledge files or folders in the pilot/datasets directory.
 
+We currently support many document formats: txt, pdf, md, html, doc, ppt, and url.
+
+Before execution, download the spaCy model used for document splitting:
+
+```
+python -m spacy download zh_core_web_sm
+```
+
 2.set .env configuration set your vector store type, eg:VECTOR_STORE_TYPE=Chroma, now we support Chroma and Milvus(version > 2.1)
 
 3.Run the knowledge repository script in the tools directory.
diff --git a/README.zh.md b/README.zh.md
index af58478a1..cbfb67ac0 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -18,6 +18,8 @@
 
 DB-GPT 是一个开源的以数据库为基础的GPT实验项目,使用本地化的GPT大模型与您的数据和环境进行交互,无数据泄露风险,100% 私密,100% 安全。
 
+[DB-GPT视频介绍](https://www.bilibili.com/video/BV1SM4y1a7Nj/?buvid=551b023900b290f9497610b2155a2668&is_story_h5=false&mid=%2BVyE%2Fwau5woPcUKieCWS0A%3D%3D&p=1&plat_id=116&share_from=ugc&share_medium=iphone&share_plat=ios&share_session_id=5D08B533-82A4-4D40-9615-7826065B4574&share_source=GENERIC&share_tag=s_i&timestamp=1686307943&unique_k=bhO3lgQ&up_id=31375446)
+
 ## 最新发布
 - [2023/06/01]🔥 在Vicuna-13B基础模型的基础上,通过插件实现任务链调用。例如单句创建数据库的实现.[演示](./assets/dbgpt_bytebase_plugin.gif)
@@ -174,6 +176,14 @@ $ python webserver.py
 
 1.将个人知识文件或者文件夹放入pilot/datasets目录中
 
+当前支持的文档格式:txt、pdf、md、html、doc、ppt、url。
+
+在操作之前先执行:
+
+```
+python -m spacy download zh_core_web_sm
+```
+
 2.在.env文件指定你的向量数据库类型,VECTOR_STORE_TYPE(默认Chroma),目前支持Chroma,Milvus(需要设置MILVUS_URL和MILVUS_PORT) 注意Milvus版本需要>2.1
diff --git a/docs/getting_started/tutorials.md b/docs/getting_started/tutorials.md
index 9583cda90..99ac15a51 100644
--- a/docs/getting_started/tutorials.md
+++ b/docs/getting_started/tutorials.md
@@ -3,4 +3,14 @@
 
 This is a collection of DB-GPT tutorials on Medium.
 
-Comming soon...
\ No newline at end of file
+### Introduction
+[What is DB-GPT](https://www.youtube.com/watch?v=QszhVJerc0I) by csunny (https://github.com/csunny/DB-GPT)
+
+### Knowledge
+
+[How to Create your own knowledge repository](https://db-gpt.readthedocs.io/en/latest/modules/knownledge.html)
+
+[Add new Knowledge demonstration](../../assets/new_knownledge_en.gif)
+
+### DB Plugins
+[db plugins demonstration](../../assets/auto_sql_en.gif)
\ No newline at end of file
diff --git a/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po b/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po
index eef93efd5..989630ab1 100644
--- a/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po
+++ b/docs/locales/zh_CN/LC_MESSAGES/getting_started/tutorials.po
@@ -8,7 +8,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: DB-GPT 0.1.0\n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2023-06-11 14:10+0800\n"
+"POT-Creation-Date: 2023-06-13 11:38+0800\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: zh_CN\n"
@@ -17,17 +17,43 @@ msgstr ""
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=utf-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.11.0\n"
+"Generated-By: Babel 2.12.1\n"
 
-#: ../../getting_started/tutorials.md:1 12b03941d64f4bdf96eaaeec0147a387
+#: ../../getting_started/tutorials.md:1 7011a2ab0e7f45ddb1fa85b6479cc442
 msgid "Tutorials"
 msgstr "教程"
 
-#: ../../getting_started/tutorials.md:4 b966c15b01f94a1e84d4b6142b8f4111
+#: ../../getting_started/tutorials.md:4 960f88b9c1b64940bfa0576bab5b0314
 msgid "This is a collection of DB-GPT tutorials on Medium."
msgstr "这是知乎上DB-GPT教程的集合。." -#: ../../getting_started/tutorials.md:6 869431aac3864180acb41b852d48d29e -msgid "Comming soon..." -msgstr "未完待续" +#: ../../getting_started/tutorials.md:6 3915395cc45742519bf0c607eeafc489 +msgid "" +"###Introduce [What is DB-" +"GPT](https://www.youtube.com/watch?v=QszhVJerc0I) by csunny " +"(https://github.com/csunny/DB-GPT)" +msgstr "###Introduce [什么是DB-GPT](https://www.bilibili.com/video/BV1SM4y1a7Nj/?buvid=551b023900b290f9497610b2155a2668&is_story_h5=false&mid=%2BVyE%2Fwau5woPcUKieCWS0A%3D%3D&p=1&plat_id=116&share_from=ugc&share_medium=iphone&share_plat=ios&share_session_id=5D08B533-82A4-4D40-9615-7826065B4574&share_source=GENERIC&share_tag=s_i×tamp=1686307943&unique_k=bhO3lgQ&up_id=31375446) by csunny (https://github.com/csunny/DB-GPT)" + +#: ../../getting_started/tutorials.md:9 e213736923574b2cb039a457d789c27c +msgid "Knowledge" +msgstr "知识库" + +#: ../../getting_started/tutorials.md:11 90b5472735a644168d51c054ed882748 +msgid "" +"[How to Create your own knowledge repository](https://db-" +"gpt.readthedocs.io/en/latest/modules/knownledge.html)" +msgstr "[怎么创建自己的知识库](https://db-" +"gpt.readthedocs.io/en/latest/modules/knownledge.html)" + +#: ../../getting_started/tutorials.md:13 6a851e1e88ea4bcbaf7ee742a12224ef +msgid "[Add new Knowledge demonstration](../../assets/new_knownledge_en.gif)" +msgstr "[新增知识库演示](../../assets/new_knownledge_en.gif)" + +#: ../../getting_started/tutorials.md:15 4487ef393e004e7c936f5104727212a4 +msgid "DB Plugins" +msgstr "DB Plugins" + +#: ../../getting_started/tutorials.md:16 ee5decd8441d40ae8a240a19c1a5a74a +msgid "[db plugins demonstration](../../assets/auto_sql_en.gif)" +msgstr "[db plugins 演示](../../assets/auto_sql_en.gif)" diff --git a/docs/locales/zh_CN/LC_MESSAGES/modules/llms.po b/docs/locales/zh_CN/LC_MESSAGES/modules/llms.po index 08fd5d984..bbb05b046 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/modules/llms.po +++ b/docs/locales/zh_CN/LC_MESSAGES/modules/llms.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: DB-GPT 0.1.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-06-11 14:10+0800\n" +"POT-Creation-Date: 2023-06-13 11:38+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,13 +17,13 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.11.0\n" +"Generated-By: Babel 2.12.1\n" -#: ../../modules/llms.md:1 9c05a511436b4a408e2d1acd2f2568e7 +#: ../../modules/llms.md:1 34386f3fecba48fbbd86718283ba593c msgid "LLMs" msgstr "大语言模型" -#: ../../modules/llms.md:3 c6549cbde17e42e596470a537286cedb +#: ../../modules/llms.md:3 241b39ad980f4cfd90a7f0fdae05a1d2 #, python-format msgid "" "In the underlying large model integration, we have designed an open " @@ -34,23 +34,64 @@ msgid "" " of 85% or higher. We use higher standards to select models, hoping to " "save users the cumbersome testing and evaluation process in the process " "of use." 
-msgstr "在底层大模型接入中,我们设计了开放的接口,支持对接多种大模型。同时对于接入模型的效果," -"我们有非常严格的把控与评审机制。对大模型能力上与ChatGPT对比,在准确率上需要满足85%" -"以上的能力对齐。我们用更高的标准筛选模型,是期望在用户使用过程中,可以省去前面繁琐的测试评估环节。" +msgstr "在底层大模型接入中,我们设计了开放的接口,支持对接多种大模型。同时对于接入模型的效果,我们有非常严格的把控与评审机制。对大模型能力上与ChatGPT对比,在准确率上需要满足85%以上的能力对齐。我们用更高的标准筛选模型,是期望在用户使用过程中,可以省去前面繁琐的测试评估环节。" -#: ../../modules/llms.md:5 1b18ef91924442f7ab7a117aec6122d5 +#: ../../modules/llms.md:5 25175e87a62e41bca86798eb783cefd6 msgid "Multi LLMs Usage" msgstr "多模型使用" -#: ../../modules/llms.md:6 b14256f1768d45ef929be664b8afb31e +#: ../../modules/llms.md:6 8c35341e9ca94202ba779567813f9973 msgid "" "To use multiple models, modify the LLM_MODEL parameter in the .env " "configuration file to switch between the models." msgstr "如果要使用不同的模型,请修改.env配置文件中的LLM MODEL参数以在模型之间切换。" -#: ../../modules/llms.md:8 42cbe90a1a524d8381a0a743ef1a927e +#: ../../modules/llms.md:8 2edf3309a6554f39ad74e19faff09cee msgid "" "Notice: you can create .env file from .env.template, just use command " "like this:" msgstr "注意:你可以从 .env.template 创建 .env 文件。只需使用如下命令:" +#: ../../modules/llms.md:14 5fa7639ef294425e89e13b7c6617fb4b +msgid "" +"now we support models vicuna-13b, vicuna-7b, chatglm-6b, flan-t5-base, " +"guanaco-33b-merged, falcon-40b, gorilla-7b." +msgstr "现在我们支持的模型有vicuna-13b, vicuna-7b, chatglm-6b, flan-t5-base, " +"guanaco-33b-merged, falcon-40b, gorilla-7b." + +#: ../../modules/llms.md:16 96c9a5ad00264bd2a07bdbdec87e471e +msgid "" +"DB-GPT provides a model load adapter and chat adapter. load adapter which" +" allows you to easily adapt load different LLM models by inheriting the " +"BaseLLMAdapter. You just implement match() and loader() method." +msgstr "DB-GPT提供了多模型适配器load adapter和chat adapter.load adapter通过继承BaseLLMAdapter类, 实现match和loader方法允许你适配不同的LLM." + +#: ../../modules/llms.md:18 1033714691464f50900c04c9e1bb5643 +msgid "vicuna llm load adapter" +msgstr "vicuna llm load adapter" + +#: ../../modules/llms.md:35 faa6432575be45bcae5deb1cc7fee3fb +msgid "chatglm load adapter" +msgstr "chatglm load adapter" + +#: ../../modules/llms.md:62 61c4189cabf04e628132c2bf5f02bb50 +msgid "" +"chat adapter which allows you to easily adapt chat different LLM models " +"by inheriting the BaseChatAdpter.you just implement match() and " +"get_generate_stream_func() method" +msgstr "chat adapter通过继承BaseChatAdpter允许你通过实现match和get_generate_stream_func方法允许你适配不同的LLM." 
+ +#: ../../modules/llms.md:64 407a67e4e2c6414b9cde346961d850c0 +msgid "vicuna llm chat adapter" +msgstr "vicuna llm chat adapter" + +#: ../../modules/llms.md:76 53a55238cd90406db58c50dc64465195 +msgid "chatglm llm chat adapter" +msgstr "chatglm llm chat adapter" + +#: ../../modules/llms.md:89 b0c5ff72c05e40b3b301d6b81205fe63 +msgid "" +"if you want to integrate your own model, just need to inheriting " +"BaseLLMAdaper and BaseChatAdpter and implement the methods" +msgstr "如果你想集成自己的模型,只需要继承BaseLLMAdaper和BaseChatAdpter类,然后实现里面的方法即可" + diff --git a/docs/locales/zh_CN/LC_MESSAGES/use_cases/knownledge_based_qa.po b/docs/locales/zh_CN/LC_MESSAGES/use_cases/knownledge_based_qa.po index 26473de7f..00acd9ff2 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/use_cases/knownledge_based_qa.po +++ b/docs/locales/zh_CN/LC_MESSAGES/use_cases/knownledge_based_qa.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: DB-GPT 0.1.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-06-11 14:10+0800\n" +"POT-Creation-Date: 2023-06-13 11:38+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,13 +17,13 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.11.0\n" +"Generated-By: Babel 2.12.1\n" -#: ../../use_cases/knownledge_based_qa.md:1 a03c7a5aa5cc4a3e9bc7bd3734d47176 +#: ../../use_cases/knownledge_based_qa.md:1 ddfe412b92e14324bdc11ffe58114e5f msgid "Knownledge based qa" msgstr "知识问答" -#: ../../use_cases/knownledge_based_qa.md:3 37607733852c4ade97c80fbcca66d573 +#: ../../use_cases/knownledge_based_qa.md:3 48635316cc704a779089ff7b5cb9a836 msgid "" "Chat with your own knowledge is a very interesting thing. In the usage " "scenarios of this chapter, we will introduce how to build your own " @@ -33,25 +33,26 @@ msgid "" "base, which was introduced in the previous knowledge base module. Of " "course, you can also call our provided knowledge embedding API to store " "knowledge." -msgstr "用自己的知识聊天是一件很有趣的事情。在本章的使用场景中," -"我们将介绍如何通过知识库API构建自己的知识库。首先," -"构建知识存储目前可以通过执行“python tool/knowledge_init.py”" -"来初始化您自己的知识库的内容,这在前面的知识库模块中已经介绍过了" -"。当然,你也可以调用我们提供的知识嵌入API来存储知识。" +msgstr "" +"用自己的知识聊天是一件很有趣的事情。在本章的使用场景中,我们将介绍如何通过知识库API构建自己的知识库。首先,构建知识存储目前可以通过执行“python" +" " +"tool/knowledge_init.py”来初始化您自己的知识库的内容,这在前面的知识库模块中已经介绍过了。当然,你也可以调用我们提供的知识嵌入API来存储知识。" -#: ../../use_cases/knownledge_based_qa.md:6 ea5ad6cec29d49228c03d57d255c42fe -msgid "We currently support four document formats: txt, pdf, url, and md." +#: ../../use_cases/knownledge_based_qa.md:6 0a5c68429c9343cf8b88f4f1dddb18eb +#, fuzzy +msgid "" +"We currently support many document formats: txt, pdf, md, html, doc, ppt," +" and url." msgstr "“我们目前支持四种文件格式: txt, pdf, url, 和md。" -#: ../../use_cases/knownledge_based_qa.md:20 01908d4b18b345908004a251462d42b3 +#: ../../use_cases/knownledge_based_qa.md:20 83f3544c06954e5cbc0cc7788f699eb1 msgid "" "Now we currently support vector databases: Chroma (default) and Milvus. " "You can switch between them by modifying the \"VECTOR_STORE_TYPE\" field " "in the .env file." 
-msgstr "“我们目前支持向量数据库:Chroma(默认)和Milvus。" -"你可以通过修改.env文件中的“VECTOR_STORE_TYPE”参数在它们之间切换。" +msgstr "“我们目前支持向量数据库:Chroma(默认)和Milvus。你可以通过修改.env文件中的“VECTOR_STORE_TYPE”参数在它们之间切换。" -#: ../../use_cases/knownledge_based_qa.md:31 f37d80faa3f84c8cb176a59f4ff8140c +#: ../../use_cases/knownledge_based_qa.md:31 ac12f26b81384fc4bf44ccce1c0d86b4 msgid "Below is an example of using the knowledge base API to query knowledge:" msgstr "下面是一个使用知识库API进行查询的例子:" diff --git a/docs/modules/knownledge.md b/docs/modules/knownledge.md index 32a22acf8..c108920b2 100644 --- a/docs/modules/knownledge.md +++ b/docs/modules/knownledge.md @@ -10,6 +10,15 @@ As the knowledge base is currently the most significant user demand scenario, we 1.Place personal knowledge files or folders in the pilot/datasets directory. +We currently support many document formats: txt, pdf, md, html, doc, ppt, and url. + +before execution: + +``` +python -m spacy download zh_core_web_sm + +``` + 2.Update your .env, set your vector store type, VECTOR_STORE_TYPE=Chroma (now only support Chroma and Milvus, if you set Milvus, please set MILVUS_URL and MILVUS_PORT) @@ -19,7 +28,6 @@ As the knowledge base is currently the most significant user demand scenario, we python tools/knowledge_init.py --vector_name : your vector store name default_value:default ---append: append mode, True:append, False: not append default_value:False ``` diff --git a/docs/modules/llms.md b/docs/modules/llms.md index b4d57579f..c83b73af8 100644 --- a/docs/modules/llms.md +++ b/docs/modules/llms.md @@ -8,4 +8,82 @@ To use multiple models, modify the LLM_MODEL parameter in the .env configuration Notice: you can create .env file from .env.template, just use command like this: ``` cp .env.template .env -``` \ No newline at end of file +LLM_MODEL=vicuna-13b +MODEL_SERVER=http://127.0.0.1:8000 +``` +now we support models vicuna-13b, vicuna-7b, chatglm-6b, flan-t5-base, guanaco-33b-merged, falcon-40b, gorilla-7b. + +DB-GPT provides a model load adapter and chat adapter. load adapter which allows you to easily adapt load different LLM models by inheriting the BaseLLMAdapter. You just implement match() and loader() method. 
+
+vicuna llm load adapter
+
+```
+class VicunaLLMAdapater(BaseLLMAdaper):
+    """Vicuna Adapter"""
+
+    def match(self, model_path: str):
+        return "vicuna" in model_path
+
+    def loader(self, model_path: str, from_pretrained_kwagrs: dict):
+        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwagrs
+        )
+        return model, tokenizer
+```
+
+chatglm load adapter
+```
+
+class ChatGLMAdapater(BaseLLMAdaper):
+    """LLM Adapter for THUDM/chatglm-6b"""
+
+    def match(self, model_path: str):
+        return "chatglm" in model_path
+
+    def loader(self, model_path: str, from_pretrained_kwargs: dict):
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+        if DEVICE != "cuda":
+            model = AutoModel.from_pretrained(
+                model_path, trust_remote_code=True, **from_pretrained_kwargs
+            ).float()
+            return model, tokenizer
+        else:
+            model = (
+                AutoModel.from_pretrained(
+                    model_path, trust_remote_code=True, **from_pretrained_kwargs
+                )
+                .half()
+                .cuda()
+            )
+            return model, tokenizer
+```
+The chat adapter lets you adapt the chat flow of different LLM models by inheriting BaseChatAdpter: you just implement the match() and get_generate_stream_func() methods.
+
+vicuna llm chat adapter
+```
+class VicunaChatAdapter(BaseChatAdpter):
+    """Model chat Adapter for vicuna"""
+
+    def match(self, model_path: str):
+        return "vicuna" in model_path
+
+    def get_generate_stream_func(self):
+        return generate_stream
+```
+
+chatglm llm chat adapter
+```
+class ChatGLMChatAdapter(BaseChatAdpter):
+    """Model chat Adapter for ChatGLM"""
+
+    def match(self, model_path: str):
+        return "chatglm" in model_path
+
+    def get_generate_stream_func(self):
+        from pilot.model.llm_out.chatglm_llm import chatglm_generate_stream
+
+        return chatglm_generate_stream
+```
+If you want to integrate your own model, you just need to inherit BaseLLMAdaper and BaseChatAdpter and implement their methods.
\ No newline at end of file
diff --git a/docs/use_cases/knownledge_based_qa.md b/docs/use_cases/knownledge_based_qa.md
index dfd0d345d..3a357aaad 100644
--- a/docs/use_cases/knownledge_based_qa.md
+++ b/docs/use_cases/knownledge_based_qa.md
@@ -3,7 +3,7 @@
 
 Chat with your own knowledge is a very interesting thing. In the usage scenarios of this chapter, we will introduce how to build your own knowledge base through the knowledge base API. Firstly, building a knowledge store can currently be initialized by executing "python tool/knowledge_init.py" to initialize the content of your own knowledge base, which was introduced in the previous knowledge base module. Of course, you can also call our provided knowledge embedding API to store knowledge.
 
-We currently support four document formats: txt, pdf, url, and md.
+We currently support many document formats: txt, pdf, md, html, doc, ppt, and url.
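+
+Each format is routed to an embedding loader by its file extension. A rough sketch of that dispatch, abridged from the KnowledgeEmbeddingType mapping touched in this change (the txt/pdf/md/html entries are elided here, and the lookup shown is a simplification of what KnowledgeEmbedding does internally):
+
+```
+from pilot.source_embedding.csv_embedding import CSVEmbedding
+from pilot.source_embedding.ppt_embedding import PPTEmbedding
+from pilot.source_embedding.word_embedding import WordEmbedding
+
+# Extension -> (embedding class, extra kwargs); ppt/pptx are newly supported.
+KnowledgeEmbeddingType = {
+    ".doc": (WordEmbedding, {}),
+    ".docx": (WordEmbedding, {}),
+    ".csv": (CSVEmbedding, {}),
+    ".ppt": (PPTEmbedding, {}),
+    ".pptx": (PPTEmbedding, {}),
+}
+
+# KnowledgeEmbedding resolves a loader roughly like this:
+file_path = "your file path"
+extension = "." + file_path.rsplit(".", 1)[-1]
+embedding_class, kwargs = KnowledgeEmbeddingType[extension]
+```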
 ```
 vector_store_config = {
 "vector_store_name": name
@@ -11,7 +11,7 @@ vector_store_config = {
 
 file_path = "your file path"
 
-knowledge_embedding_client = KnowledgeEmbedding(file_path=file_path, model_name=LLM_MODEL_CONFIG["text2vec"],local_persist=False, vector_store_config=vector_store_config)
+knowledge_embedding_client = KnowledgeEmbedding(file_path=file_path, model_name=LLM_MODEL_CONFIG["text2vec"], vector_store_config=vector_store_config)
 
 knowledge_embedding_client.knowledge_embedding()
 
@@ -37,7 +37,7 @@ vector_store_config = {
 
 query = "your query"
 
-knowledge_embedding_client = KnowledgeEmbedding(file_path="", model_name=LLM_MODEL_CONFIG["text2vec"], local_persist=False, vector_store_config=vector_store_config)
+knowledge_embedding_client = KnowledgeEmbedding(file_path="", model_name=LLM_MODEL_CONFIG["text2vec"], vector_store_config=vector_store_config)
 
 knowledge_embedding_client.similar_search(query, 10)
 ```
\ No newline at end of file
diff --git a/pilot/common/sql_database.py b/pilot/common/sql_database.py
index bc3aa8340..f7dbd7164 100644
--- a/pilot/common/sql_database.py
+++ b/pilot/common/sql_database.py
@@ -443,6 +443,14 @@ class Database:
         indexes = cursor.fetchall()
         return [(index[2], index[4]) for index in indexes]
 
+    def get_show_create_table(self, table_name):
+        """Get the SHOW CREATE TABLE statement for the specified table."""
+        session = self._db_sessions()
+        cursor = session.execute(text(f"SHOW CREATE TABLE {table_name}"))
+        ans = cursor.fetchall()
+        return ans[0][1]
+
+
     def get_fields(self, table_name):
         """Get column fields about specified table."""
         session = self._db_sessions()
diff --git a/pilot/language/lang_content_mapping.py b/pilot/language/lang_content_mapping.py
index afcfaeaba..e2ea8b4cc 100644
--- a/pilot/language/lang_content_mapping.py
+++ b/pilot/language/lang_content_mapping.py
@@ -7,7 +7,7 @@ lang_dicts = {
         "learn_more_markdown": "该服务是仅供非商业用途的研究预览。受 Vicuna-13B 模型 [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) 的约束",
         "model_control_param": "模型参数",
         "sql_generate_mode_direct": "直接执行结果",
-        "sql_generate_mode_none": "不直接执行结果",
+        "sql_generate_mode_none": "DB问答",
         "max_input_token_size": "最大输出Token数",
         "please_choose_database": "请选择数据",
         "sql_generate_diagnostics": "SQL生成与诊断",
@@ -44,7 +44,7 @@ lang_dicts = {
         "learn_more_markdown": "The service is a research preview intended for non-commercial use only. subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of Vicuna-13B",
         "model_control_param": "Model Parameters",
         "sql_generate_mode_direct": "Execute directly",
-        "sql_generate_mode_none": "Execute without mode",
+        "sql_generate_mode_none": "Chat to DB",
         "max_input_token_size": "Maximum output token size",
         "please_choose_database": "Please choose database",
         "sql_generate_diagnostics": "SQL Generation & Diagnostics",
diff --git a/pilot/scene/chat_db/professional_qa/chat.py b/pilot/scene/chat_db/professional_qa/chat.py
index cb2425ea9..e956bdc8b 100644
--- a/pilot/scene/chat_db/professional_qa/chat.py
+++ b/pilot/scene/chat_db/professional_qa/chat.py
@@ -52,7 +52,7 @@ class ChatWithDbQA(BaseChat):
             raise ValueError("Could not import DBSummaryClient. ")
") if self.db_name: client = DBSummaryClient() - table_info = client.get_similar_tables( + table_info = client.get_db_summary( dbname=self.db_name, query=self.current_user_input, topk=self.top_k ) # table_info = self.database.table_simple_info(self.db_connect) @@ -60,8 +60,8 @@ class ChatWithDbQA(BaseChat): input_values = { "input": self.current_user_input, - "top_k": str(self.top_k), - "dialect": dialect, + # "top_k": str(self.top_k), + # "dialect": dialect, "table_info": table_info, } return input_values diff --git a/pilot/scene/chat_db/professional_qa/prompt.py b/pilot/scene/chat_db/professional_qa/prompt.py index 9cc35b2e4..ff360cb65 100644 --- a/pilot/scene/chat_db/professional_qa/prompt.py +++ b/pilot/scene/chat_db/professional_qa/prompt.py @@ -10,22 +10,44 @@ CFG = Config() PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who very familiar with database related knowledge. """ -PROMPT_SUFFIX = """Only use the following tables generate sql if have any table info: +# PROMPT_SUFFIX = """Only use the following tables generate sql if have any table info: +# {table_info} +# +# Question: {input} +# +# """ + +# _DEFAULT_TEMPLATE = """ +# You are a SQL expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer. +# Unless the user specifies in his question a specific number of examples he wishes to obtain, always limit your query to at most {top_k} results. +# You can order the results by a relevant column to return the most interesting examples in the database. +# Never query for all the columns from a specific table, only ask for a the few relevant columns given the question. +# Pay attention to use only the column names that you can see in the schema description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table. +# +# """ + +_DEFAULT_TEMPLATE_EN = """ +You are a database expert. you will be given metadata information about a database or table, and then provide a brief summary and answer to the question. For example, question: "How many tables are there in database 'db_gpt'?" , answer: "There are 5 tables in database 'db_gpt', which are 'book', 'book_category', 'borrower', 'borrowing', and 'category'. +Based on the database metadata information below, provide users with professional and concise answers to their questions. If the answer cannot be obtained from the provided content, please say: "The information provided in the knowledge base is not sufficient to answer this question." It is forbidden to make up information randomly. +database metadata information: {table_info} - -Question: {input} - +question: +{input} """ -_DEFAULT_TEMPLATE = """ -You are a SQL expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer. -Unless the user specifies in his question a specific number of examples he wishes to obtain, always limit your query to at most {top_k} results. -You can order the results by a relevant column to return the most interesting examples in the database. -Never query for all the columns from a specific table, only ask for a the few relevant columns given the question. -Pay attention to use only the column names that you can see in the schema description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table. 
- +_DEFAULT_TEMPLATE_ZH = """ +你是一位数据库专家。你将获得有关数据库或表的元数据信息,然后提供简要的总结和回答。例如,问题:“数据库 'db_gpt' 中有多少个表?” 答案:“数据库 'db_gpt' 中有 5 个表,分别是 'book'、'book_category'、'borrower'、'borrowing' 和 'category'。” +根据以下数据库元数据信息,为用户提供专业简洁的答案。如果无法从提供的内容中获取答案,请说:“知识库中提供的信息不足以回答此问题。” 禁止随意捏造信息。 +数据库元数据信息: +{table_info} +问题: +{input} """ +_DEFAULT_TEMPLATE = ( + _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == "en" else _DEFAULT_TEMPLATE_ZH +) + PROMPT_SEP = SeparatorStyle.SINGLE.value @@ -33,10 +55,10 @@ PROMPT_NEED_NEED_STREAM_OUT = True prompt = PromptTemplate( template_scene=ChatScene.ChatWithDbQA.value, - input_variables=["input", "table_info", "dialect", "top_k"], + input_variables=["input", "table_info"], response_format=None, template_define=PROMPT_SCENE_DEFINE, - template=_DEFAULT_TEMPLATE + PROMPT_SUFFIX, + template=_DEFAULT_TEMPLATE, stream_out=PROMPT_NEED_NEED_STREAM_OUT, output_parser=NormalChatOutputParser( sep=PROMPT_SEP, is_stream_out=PROMPT_NEED_NEED_STREAM_OUT diff --git a/pilot/scene/chat_knowledge/url/chat.py b/pilot/scene/chat_knowledge/url/chat.py index 88dc7ad0b..ce45602a2 100644 --- a/pilot/scene/chat_knowledge/url/chat.py +++ b/pilot/scene/chat_knowledge/url/chat.py @@ -38,7 +38,7 @@ class ChatUrlKnowledge(BaseChat): ) self.url = url vector_store_config = { - "vector_store_name": url, + "vector_store_name": url.replace(":", ""), "vector_store_path": KNOWLEDGE_UPLOAD_ROOT_PATH, } self.knowledge_embedding_client = KnowledgeEmbedding( diff --git a/pilot/source_embedding/knowledge_embedding.py b/pilot/source_embedding/knowledge_embedding.py index 7ec0de76c..97b515897 100644 --- a/pilot/source_embedding/knowledge_embedding.py +++ b/pilot/source_embedding/knowledge_embedding.py @@ -1,11 +1,13 @@ from typing import Optional +from chromadb.errors import NotEnoughElementsException from langchain.embeddings import HuggingFaceEmbeddings from pilot.configs.config import Config from pilot.source_embedding.csv_embedding import CSVEmbedding from pilot.source_embedding.markdown_embedding import MarkdownEmbedding from pilot.source_embedding.pdf_embedding import PDFEmbedding +from pilot.source_embedding.ppt_embedding import PPTEmbedding from pilot.source_embedding.url_embedding import URLEmbedding from pilot.source_embedding.word_embedding import WordEmbedding from pilot.vector_store.connector import VectorStoreConnector @@ -19,6 +21,8 @@ KnowledgeEmbeddingType = { ".doc": (WordEmbedding, {}), ".docx": (WordEmbedding, {}), ".csv": (CSVEmbedding, {}), + ".ppt": (PPTEmbedding, {}), + ".pptx": (PPTEmbedding, {}), } @@ -42,8 +46,12 @@ class KnowledgeEmbedding: self.knowledge_embedding_client = self.init_knowledge_embedding() self.knowledge_embedding_client.source_embedding() - def knowledge_embedding_batch(self): - self.knowledge_embedding_client.batch_embedding() + def knowledge_embedding_batch(self, docs): + # docs = self.knowledge_embedding_client.read_batch() + self.knowledge_embedding_client.index_to_store(docs) + + def read(self): + return self.knowledge_embedding_client.read_batch() def init_knowledge_embedding(self): if self.file_type == "url": @@ -68,7 +76,11 @@ class KnowledgeEmbedding: vector_client = VectorStoreConnector( CFG.VECTOR_STORE_TYPE, self.vector_store_config ) - return vector_client.similar_search(text, topk) + try: + ans = vector_client.similar_search(text, topk) + except NotEnoughElementsException: + ans = vector_client.similar_search(text, 1) + return ans def vector_exist(self): vector_client = VectorStoreConnector( diff --git a/pilot/source_embedding/markdown_embedding.py 
b/pilot/source_embedding/markdown_embedding.py
index e2851d122..5f6d9526d 100644
--- a/pilot/source_embedding/markdown_embedding.py
+++ b/pilot/source_embedding/markdown_embedding.py
@@ -5,8 +5,8 @@ from typing import List
 
 import markdown
 from bs4 import BeautifulSoup
-from langchain.document_loaders import TextLoader
 from langchain.schema import Document
+from langchain.text_splitter import SpacyTextSplitter
 
 from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
@@ -30,32 +30,8 @@ class MarkdownEmbedding(SourceEmbedding):
     def read(self):
         """Load from markdown path."""
         loader = EncodeTextLoader(self.file_path)
-        text_splitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
-        )
-        return loader.load_and_split(text_splitter)
-
-    @register
-    def read_batch(self):
-        """Load from markdown path."""
-        docments = []
-        for root, _, files in os.walk(self.file_path, topdown=False):
-            for file in files:
-                filename = os.path.join(root, file)
-                loader = TextLoader(filename)
-                # text_splitor = CHNDocumentSplitter(chunk_size=1000, chunk_overlap=20, length_function=len)
-                # docs = loader.load_and_split()
-                docs = loader.load()
-                # 更新metadata数据
-                new_docs = []
-                for doc in docs:
-                    doc.metadata = {
-                        "source": doc.metadata["source"].replace(self.file_path, "")
-                    }
-                    print("doc is embedding ... ", doc.metadata)
-                    new_docs.append(doc)
-                docments += new_docs
-        return docments
+        textsplitter = SpacyTextSplitter(pipeline="zh_core_web_sm", chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=200)
+        return loader.load_and_split(textsplitter)
 
     @register
     def data_process(self, documents: List[Document]):
diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py
index ae8dde974..66b0963d9 100644
--- a/pilot/source_embedding/pdf_embedding.py
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -29,7 +29,7 @@ class PDFEmbedding(SourceEmbedding):
         # pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
         # )
         textsplitter = SpacyTextSplitter(
-            pipeline="zh_core_web_sm", chunk_size=1000, chunk_overlap=200
+            pipeline="zh_core_web_sm", chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=200
         )
         return loader.load_and_split(textsplitter)
 
diff --git a/pilot/source_embedding/ppt_embedding.py b/pilot/source_embedding/ppt_embedding.py
new file mode 100644
index 000000000..869e92395
--- /dev/null
+++ b/pilot/source_embedding/ppt_embedding.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from typing import List
+
+from langchain.document_loaders import UnstructuredPowerPointLoader
+from langchain.schema import Document
+from langchain.text_splitter import SpacyTextSplitter
+
+from pilot.configs.config import Config
+from pilot.source_embedding import SourceEmbedding, register
+
+CFG = Config()
+
+
+class PPTEmbedding(SourceEmbedding):
+    """PPT embedding for reading ppt documents."""
+
+    def __init__(self, file_path, vector_store_config):
+        """Initialize with ppt path."""
+        super().__init__(file_path, vector_store_config)
+        self.file_path = file_path
+        self.vector_store_config = vector_store_config
+
+    @register
+    def read(self):
+        """Load from ppt path."""
+        loader = UnstructuredPowerPointLoader(self.file_path)
+        textsplitter = SpacyTextSplitter(pipeline="zh_core_web_sm", chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=200)
+        return loader.load_and_split(textsplitter)
+
+    @register
+    def data_process(self, documents: List[Document]):
+        i = 0
+        for d in documents:
+            documents[i].page_content = d.page_content.replace("\n",
"") + i += 1 + return documents diff --git a/pilot/source_embedding/source_embedding.py b/pilot/source_embedding/source_embedding.py index 50c7044f9..3d881fcdf 100644 --- a/pilot/source_embedding/source_embedding.py +++ b/pilot/source_embedding/source_embedding.py @@ -2,6 +2,8 @@ # -*- coding: utf-8 -*- from abc import ABC, abstractmethod from typing import Dict, List, Optional + +from chromadb.errors import NotEnoughElementsException from pilot.configs.config import Config from pilot.vector_store.connector import VectorStoreConnector @@ -62,7 +64,11 @@ class SourceEmbedding(ABC): @register def similar_search(self, doc, topk): """vector store similarity_search""" - return self.vector_client.similar_search(doc, topk) + try: + ans = self.vector_client.similar_search(doc, topk) + except NotEnoughElementsException: + ans = self.vector_client.similar_search(doc, 1) + return ans def vector_name_exist(self): return self.vector_client.vector_name_exists() @@ -79,14 +85,11 @@ class SourceEmbedding(ABC): if "index_to_store" in registered_methods: self.index_to_store(text) - def batch_embedding(self): - if "read_batch" in registered_methods: - text = self.read_batch() + def read_batch(self): + if "read" in registered_methods: + text = self.read() if "data_process" in registered_methods: text = self.data_process(text) if "text_split" in registered_methods: self.text_split(text) - if "text_to_vector" in registered_methods: - self.text_to_vector(text) - if "index_to_store" in registered_methods: - self.index_to_store(text) + return text diff --git a/pilot/summary/db_summary_client.py b/pilot/summary/db_summary_client.py index 51f124f62..84fbf1550 100644 --- a/pilot/summary/db_summary_client.py +++ b/pilot/summary/db_summary_client.py @@ -32,13 +32,14 @@ class DBSummaryClient: model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL] ) vector_store_config = { - "vector_store_name": dbname + "_profile", + "vector_store_name": dbname + "_summary", "embeddings": embeddings, } embedding = StringEmbedding( file_path=db_summary_client.get_summery(), vector_store_config=vector_store_config, ) + self.init_db_profile(db_summary_client, dbname, embeddings) if not embedding.vector_name_exist(): if CFG.SUMMARY_CONFIG == "FAST": for vector_table_info in db_summary_client.get_summery(): @@ -69,10 +70,22 @@ class DBSummaryClient: logger.info("db summary embedding success") + def get_db_summary(self, dbname, query, topk): + vector_store_config = { + "vector_store_name": dbname + "_profile", + } + knowledge_embedding_client = KnowledgeEmbedding( + model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL], + vector_store_config=vector_store_config, + ) + table_docs =knowledge_embedding_client.similar_search(query, topk) + ans = [d.page_content for d in table_docs] + return ans + def get_similar_tables(self, dbname, query, topk): """get user query related tables info""" vector_store_config = { - "vector_store_name": dbname + "_profile", + "vector_store_name": dbname + "_summary", } knowledge_embedding_client = KnowledgeEmbedding( model_name=LLM_MODEL_CONFIG[CFG.EMBEDDING_MODEL], @@ -112,6 +125,29 @@ class DBSummaryClient: for dbname in dbs: self.db_summary_embedding(dbname) + def init_db_profile(self, db_summary_client, dbname, embeddings): + profile_store_config = { + "vector_store_name": dbname + "_profile", + "embeddings": embeddings, + } + embedding = StringEmbedding( + file_path=db_summary_client.get_db_summery(), + vector_store_config=profile_store_config, + ) + if not embedding.vector_name_exist(): + docs = [] + 
docs.extend(embedding.read_batch()) + for table_summary in db_summary_client.table_info_json(): + embedding = StringEmbedding( + table_summary, + profile_store_config, + ) + docs.extend(embedding.read_batch()) + embedding.index_to_store(docs) + logger.info("init db profile success...") + + + def _get_llm_response(query, db_input, dbsummary): chat_param = { diff --git a/pilot/summary/mysql_db_summary.py b/pilot/summary/mysql_db_summary.py index a50b24f94..4a578fe2c 100644 --- a/pilot/summary/mysql_db_summary.py +++ b/pilot/summary/mysql_db_summary.py @@ -5,6 +5,43 @@ from pilot.summary.db_summary import DBSummary, TableSummary, FieldSummary, Inde CFG = Config() +# { +# "database_name": "mydatabase", +# "tables": [ +# { +# "table_name": "customers", +# "columns": [ +# {"name": "id", "type": "int(11)", "is_primary_key": true}, +# {"name": "name", "type": "varchar(255)", "is_primary_key": false}, +# {"name": "email", "type": "varchar(255)", "is_primary_key": false} +# ], +# "indexes": [ +# {"name": "PRIMARY", "type": "primary", "columns": ["id"]}, +# {"name": "idx_name", "type": "index", "columns": ["name"]}, +# {"name": "idx_email", "type": "index", "columns": ["email"]} +# ], +# "size_in_bytes": 1024, +# "rows": 1000 +# }, +# { +# "table_name": "orders", +# "columns": [ +# {"name": "id", "type": "int(11)", "is_primary_key": true}, +# {"name": "customer_id", "type": "int(11)", "is_primary_key": false}, +# {"name": "order_date", "type": "date", "is_primary_key": false}, +# {"name": "total_amount", "type": "decimal(10,2)", "is_primary_key": false} +# ], +# "indexes": [ +# {"name": "PRIMARY", "type": "primary", "columns": ["id"]}, +# {"name": "fk_customer_id", "type": "foreign_key", "columns": ["customer_id"], "referenced_table": "customers", "referenced_columns": ["id"]} +# ], +# "size_in_bytes": 2048, +# "rows": 500 +# } +# ], +# "qps": 100, +# "tps": 50 +# } class MysqlSummary(DBSummary): """Get mysql summary template.""" @@ -13,7 +50,7 @@ class MysqlSummary(DBSummary): self.name = name self.type = "MYSQL" self.summery = ( - """database name:{name}, database type:{type}, table infos:{table_info}""" + """{{"database_name": "{name}", "type": "{type}", "tables": "{tables}", "qps": "{qps}", "tps": {tps}}}""" ) self.tables = {} self.tables_info = [] @@ -31,12 +68,14 @@ class MysqlSummary(DBSummary): ) tables = self.db.get_table_names() self.table_comments = self.db.get_table_comments(name) + comment_map = {} for table_comment in self.table_comments: self.tables_info.append( "table name:{table_name},table description:{table_comment}".format( table_name=table_comment[0], table_comment=table_comment[1] ) ) + comment_map[table_comment[0]] = table_comment[1] vector_table = json.dumps( {"table_name": table_comment[0], "table_description": table_comment[1]} @@ -45,11 +84,18 @@ class MysqlSummary(DBSummary): vector_table.encode("utf-8").decode("unicode_escape") ) self.table_columns_info = [] + self.table_columns_json = [] + for table_name in tables: - table_summary = MysqlTableSummary(self.db, name, table_name) + table_summary = MysqlTableSummary(self.db, name, table_name, comment_map) # self.tables[table_name] = table_summary.get_summery() self.tables[table_name] = table_summary.get_columns() self.table_columns_info.append(table_summary.get_columns()) + # self.table_columns_json.append(table_summary.get_summary_json()) + table_profile = "table name:{table_name},table description:{table_comment}".format( + table_name=table_name, table_comment=self.db.get_show_create_table(table_name) + ) + 
self.table_columns_json.append(table_profile) # self.tables_info.append(table_summary.get_summery()) def get_summery(self): @@ -60,23 +106,29 @@ class MysqlSummary(DBSummary): name=self.name, type=self.type, table_info=";".join(self.tables_info) ) + def get_db_summery(self): + return self.summery.format( + name=self.name, type=self.type, tables=";".join(self.vector_tables_info), qps=1000, tps=1000 + ) + def get_table_summary(self): return self.tables def get_table_comments(self): return self.table_comments - def get_columns(self): - return self.table_columns_info + def table_info_json(self): + return self.table_columns_json class MysqlTableSummary(TableSummary): """Get mysql table summary template.""" - def __init__(self, instance, dbname, name): + def __init__(self, instance, dbname, name, comment_map): self.name = name self.dbname = dbname self.summery = """database name:{dbname}, table name:{name}, have columns info: {fields}, have indexes info: {indexes}""" + self.json_summery_template = """{{"table_name": "{name}", "comment": "{comment}", "columns": "{fields}", "indexes": "{indexes}", "size_in_bytes": {size_in_bytes}, "rows": {rows}}}""" self.fields = [] self.fields_info = [] self.indexes = [] @@ -100,6 +152,10 @@ class MysqlTableSummary(TableSummary): self.indexes.append(index_summary) self.indexes_info.append(index_summary.get_summery()) + self.json_summery = self.json_summery_template.format( + name=name, comment=comment_map[name], fields=self.fields_info, indexes=self.indexes_info, size_in_bytes=1000, rows=1000 + ) + def get_summery(self): return self.summery.format( name=self.name, @@ -111,20 +167,24 @@ class MysqlTableSummary(TableSummary): def get_columns(self): return self.column_summery + def get_summary_json(self): + return self.json_summery + class MysqlFieldsSummary(FieldSummary): """Get mysql field summary template.""" def __init__(self, field): self.name = field[0] - self.summery = """column name:{name}, column data type:{data_type}, is nullable:{is_nullable}, default value is:{default_value}, comment is:{comment} """ + # self.summery = """column name:{name}, column data type:{data_type}, is nullable:{is_nullable}, default value is:{default_value}, comment is:{comment} """ + # self.summery = """{"name": {name}, "type": {data_type}, "is_primary_key": {is_nullable}, "comment":{comment}, "default":{default_value}}""" self.data_type = field[1] self.default_value = field[2] self.is_nullable = field[3] self.comment = field[4] def get_summery(self): - return self.summery.format( + return '{{"name": "{name}", "type": "{data_type}", "is_primary_key": "{is_nullable}", "comment": "{comment}", "default": "{default_value}"}}'.format( name=self.name, data_type=self.data_type, is_nullable=self.is_nullable, @@ -138,11 +198,12 @@ class MysqlIndexSummary(IndexSummary): def __init__(self, index): self.name = index[0] - self.summery = """index name:{name}, index bind columns:{bind_fields}""" + # self.summery = """index name:{name}, index bind columns:{bind_fields}""" + self.summery_template = '{{"name": "{name}", "columns": {bind_fields}}}' self.bind_fields = index[1] def get_summery(self): - return self.summery.format(name=self.name, bind_fields=self.bind_fields) + return self.summery_template.format(name=self.name, bind_fields=self.bind_fields) if __name__ == "__main__": diff --git a/tools/knowlege_init.py b/tools/knowlege_init.py index ff13865b4..26338df1c 100644 --- a/tools/knowlege_init.py +++ b/tools/knowlege_init.py @@ -23,7 +23,7 @@ class LocalKnowledgeInit: self.vector_store_config 
= vector_store_config
         self.model_name = LLM_MODEL_CONFIG["text2vec"]
 
-    def knowledge_persist(self, file_path, append_mode):
+    def knowledge_persist(self, file_path):
         """knowledge persist"""
         for root, _, files in os.walk(file_path, topdown=False):
             for file in files:
@@ -41,7 +41,5 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--vector_name", type=str, default="default")
-    parser.add_argument("--append", type=bool, default=False)
     args = parser.parse_args()
     vector_name = args.vector_name
-    append_mode = args.append
 
@@ -49,5 +47,5 @@ if __name__ == "__main__":
     vector_store_config = {"vector_store_name": vector_name}
     print(vector_store_config)
     kv = LocalKnowledgeInit(vector_store_config=vector_store_config)
-    kv.knowledge_persist(file_path=DATASETS_DIR, append_mode=append_mode)
+    kv.knowledge_persist(file_path=DATASETS_DIR)
     print("your knowledge embedding success...")
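
With the --append flag removed, a typical end-to-end run of the knowledge init script now reduces to the two commands below (a sketch based on the docs in this change; --vector_name falls back to "default" if omitted):

```
python -m spacy download zh_core_web_sm
python tools/knowlege_init.py --vector_name default
```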