feat(model): llama.cpp support new GGUF file format (#649)

Close #567 
Close #644
Close #563

**Other**
- Fix an exception raised when stopping DB-GPT
Aries-ckt 2023-10-07 22:50:57 +08:00 committed by GitHub
commit f2427b10f5
10 changed files with 168 additions and 129 deletions

View File

@@ -44,7 +44,7 @@ QUANTIZE_8bit=True
 ## llama-2-70b must be 8
 # llama_cpp_n_gqa=8
 ## Model path
-# llama_cpp_model_path=/data/models/TheBloke/vicuna-7B-v1.5-GGML/vicuna-7b-v1.5.ggmlv3.q4_0.bin
+# llama_cpp_model_path=/data/models/TheBloke/vicuna-13B-v1.5-GGUF/vicuna-13b-v1.5.Q4_K_M.gguf
 #*******************************************************************#
 #** EMBEDDING SETTINGS **#
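The template change above only swaps the example model path from a GGML `.bin` file to a GGUF file. To confirm that a downloaded GGUF file actually loads before wiring it into DB-GPT, a minimal sketch with `llama-cpp-python` (a build new enough for GGUF; the CUDA wheels pinned in `setup.py` below use 0.2.10) could look like this — the path, prompt, and parameter values are placeholders:

```python
# Minimal GGUF smoke test; assumes `pip install "llama-cpp-python>=0.2.10"`.
from llama_cpp import Llama

llm = Llama(
    model_path="models/ggml-model-q4_0.gguf",  # the renamed GGUF file from the docs below
    n_gpu_layers=10,   # mirrors llama_cpp_n_gpu_layers; use a small value if VRAM is tight
    n_ctx=2048,
)
out = llm("Q: What file format does llama.cpp use now? A:", max_tokens=32, stop=["Q:"])
print(out["choices"][0]["text"])
```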

View File

@@ -8,19 +8,19 @@ DB-GPT already supports [llama.cpp](https://github.com/ggerganov/llama.cpp) via
 ### Preparing Model Files
-To use llama.cpp, you need to prepare a ggml format model file, and there are two common ways to obtain it, you can choose either:
+To use llama.cpp, you need to prepare a gguf format model file, and there are two common ways to obtain it, you can choose either:
 1. Download a pre-converted model file.
-Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), you can download the file already converted from [TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.bin`.
+Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5), you can download the file already converted from [TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.gguf`.
 ```bash
-wget https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML/resolve/main/vicuna-7b-v1.5.ggmlv3.q4_K_M.bin -O models/ggml-model-q4_0.bin
+wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
 ```
 2. Convert It Yourself
-You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.bin`.
+You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.gguf`.
 ### Installing Dependencies
@@ -46,9 +46,9 @@ Then you can run it according to [Run](https://db-gpt.readthedocs.io/en/latest/g
 In DB-GPT, the model configuration can be done through `{model name}_{config key}`.
-| Environment Variable Key | default | Prompt Template Name|
+| Environment Variable Key | default | Description |
 |----------|-----------| ----------- |
-| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, If None, the prompt template is automatically determined from model path。 |
+| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-chat,internlm-chat`, If None, the prompt template is automatically determined from model path。 |
 | llama_cpp_model_path | None | Model path |
 | llama_cpp_n_gpu_layers | 1000000000 |Number of layers to offload to the GPU, Set this to 1000000000 to offload all layers to the GPU. If your GPU VRAM is not enough, you can set a low number, eg: `10` |
 | llama_cpp_n_threads | None | Number of threads to use. If None, the number of threads is automatically determined |
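The table documents the `{model name}_{config key}` convention but stops short of a concrete lookup. A hypothetical helper (not DB-GPT's actual parameter loader) that resolves those keys from the environment, using the defaults listed above, might look like:

```python
import os

def llama_cpp_setting(key: str, default=None):
    """Illustrative only: resolve a `{model name}_{config key}` value from the
    environment, e.g. `llama_cpp_prompt_template` or `llama_cpp_n_gpu_layers`."""
    return os.getenv(f"llama_cpp_{key}", default)

prompt_template = llama_cpp_setting("prompt_template")            # None -> auto-detect from model path
model_path = llama_cpp_setting("model_path")                      # default None per the table
n_gpu_layers = int(llama_cpp_setting("n_gpu_layers", 1000000000)) # 1000000000 offloads all layers
```

In practice these keys live in the `.env` file, as in the template hunk at the top of this commit, e.g. `llama_cpp_model_path=/data/models/TheBloke/vicuna-13B-v1.5-GGUF/vicuna-13b-v1.5.Q4_K_M.gguf`.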

View File

@@ -8,7 +8,7 @@ msgid ""
msgstr "" msgstr ""
"Project-Id-Version: DB-GPT 👏👏 0.3.5\n" "Project-Id-Version: DB-GPT 👏👏 0.3.5\n"
"Report-Msgid-Bugs-To: \n" "Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-08-21 16:59+0800\n" "POT-Creation-Date: 2023-10-07 20:28+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n" "Language: zh_CN\n"
@@ -20,274 +20,275 @@ msgstr ""
"Generated-By: Babel 2.12.1\n" "Generated-By: Babel 2.12.1\n"
#: ../../getting_started/install/llm/llama/llama_cpp.md:1 #: ../../getting_started/install/llm/llama/llama_cpp.md:1
#: 24d5c21cd8b44f1d8585ba5c83e34acc #: 95a9a605d97346fb98e0c0977524d354
msgid "llama.cpp" msgid "llama.cpp"
msgstr "llama.cpp" msgstr "llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:5 #: ../../getting_started/install/llm/llama/llama_cpp.md:5
#: 56969ff863d949aa8df55d3bdb6957e7 #: ebe3be273a42492d9832512554b4b7dc
msgid "" msgid ""
"DB-GPT already supports " "DB-GPT already supports "
"[llama.cpp](https://github.com/ggerganov/llama.cpp) via [llama-cpp-" "[llama.cpp](https://github.com/ggerganov/llama.cpp) via [llama-cpp-"
"python](https://github.com/abetlen/llama-cpp-python)." "python](https://github.com/abetlen/llama-cpp-python)."
msgstr "" msgstr ""
"DB-GPT已经通过[llama-cpp-python](https://github.com/abetlen/llama-cpp-"
"python)支持[llama.cpp](https://github.com/ggerganov/llama.cpp)。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:7 #: ../../getting_started/install/llm/llama/llama_cpp.md:7
#: afe223eafcc641779e1580cac574c34a #: 97a4f6f95d6845258e3753803fc117a3
msgid "Running llama.cpp" msgid "Running llama.cpp"
msgstr "运行 llama.cpp" msgstr "运行 llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:9 #: ../../getting_started/install/llm/llama/llama_cpp.md:9
#: 0eaf98a036434eecb2af1fa89f045620 #: 40fcdf93fe3d4542bbd84ed2d5a82623
msgid "Preparing Model Files" msgid "Preparing Model Files"
msgstr "准备模型文件" msgstr "准备模型文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:11 #: ../../getting_started/install/llm/llama/llama_cpp.md:11
#: 4f45be5d9658451fb95f1d5d31dc8778 #: f10bd034d24640d3b83572d50b2a9f71
msgid "" msgid ""
"To use llama.cpp, you need to prepare a ggml format model file, and there" "To use llama.cpp, you need to prepare a gguf format model file, and there"
" are two common ways to obtain it, you can choose either:" " are two common ways to obtain it, you can choose either:"
msgstr "使用llama.cpp, 你需要准备ggml格式的文件,你可以通过以下两种方法获取" msgstr "使用 llama.cpp你需要准备 gguf 格式的文件,你可以通过以下两种方法获取"
#: ../../getting_started/install/llm/llama/llama_cpp.md:13 #: ../../getting_started/install/llm/llama/llama_cpp.md:13
#: 9934596e0f6e466aae63cefbb019e0ec #: fb143586b13849f0bb2b6ae0c9408e95
msgid "Download a pre-converted model file." msgid "Download a pre-converted model file."
msgstr "Download a pre-converted model file." msgstr "下载已转换的模型文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:15 #: ../../getting_started/install/llm/llama/llama_cpp.md:15
#: 33fef76961064a5ca4c86c57111c8bd3 #: a6e89c960ebd4778b8fc72d3d43e9543
msgid "" msgid ""
"Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys" "Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys"
"/vicuna-7b-v1.5), you can download the file already converted from " "/vicuna-13b-v1.5), you can download the file already converted from "
"[TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-" "[TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
"7B-v1.5-GGML), only one file is needed. Download it to the `models` " "13B-v1.5-GGUF), only one file is needed. Download it to the `models` "
"directory and rename it to `ggml-model-q4_0.bin`." "directory and rename it to `ggml-model-q4_0.gguf`."
msgstr "" msgstr ""
"假设您想使用[Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-" "假设您想使用[Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-"
"7b-v1.5)您可以从[TheBloke/vicuna-" "13b-v1.5)您可以从[TheBloke/vicuna-"
"7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-" "13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
"7B-v1.5-GGML)下载已转换的文件只需要一个文件。将其下载到models目录并将其重命名为ggml-model-q4_0.bin。" "13B-v1.5-GGUF)下载已转换的文件只需要一个文件。将其下载到models目录并将其重命名为 `ggml-"
"model-q4_0.gguf`。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:21 #: ../../getting_started/install/llm/llama/llama_cpp.md:21
#: 65fed5b7e95b4205b2b94596a21b6fe8 #: 380ebad2c5a04210a48c5d7a9913413d
msgid "Convert It Yourself" msgid "Convert It Yourself"
msgstr "Convert It Yourself" msgstr "自行转换"
#: ../../getting_started/install/llm/llama/llama_cpp.md:23 #: ../../getting_started/install/llm/llama/llama_cpp.md:23
#: 1421761d320046f79f725e64bd7d854c #: cf39ca73d9c6456794fb240b164b7cbb
msgid "" msgid ""
"You can convert the model file yourself according to the instructions in " "You can convert the model file yourself according to the instructions in "
"[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp" "[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
"#prepare-data--run), and put the converted file in the models directory " "#prepare-data--run), and put the converted file in the models directory "
"and rename it to `ggml-model-q4_0.bin`." "and rename it to `ggml-model-q4_0.gguf`."
msgstr "" msgstr ""
"您可以根据[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp" "您可以根据[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
"#prepare-data--run)中的说明自己转换模型文件然后将转换后的文件放入models目录中并将其重命名为ggml-" "#prepare-data--run)中的说明自行转换模型文件并把转换后的文件放在models目录中并重命名为`ggml-"
"model-q4_0.bin。" "model-q4_0.gguf`。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:25 #: ../../getting_started/install/llm/llama/llama_cpp.md:25
#: 850b1f8ef6be49b192e01c1b7d8f1f26 #: 363cbf1c0b4e4029982519238f776958
msgid "Installing Dependencies" msgid "Installing Dependencies"
msgstr "安装依赖" msgstr "安装依赖"
#: ../../getting_started/install/llm/llama/llama_cpp.md:27 #: ../../getting_started/install/llm/llama/llama_cpp.md:27
#: b323ee4799d745cc9c0a449bd37c371a #: a98c36e3d7df40f3a816c0ee451b6114
msgid "" msgid ""
"llama.cpp is an optional dependency in DB-GPT, and you can manually " "llama.cpp is an optional dependency in DB-GPT, and you can manually "
"install it using the following command:" "install it using the following command:"
msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过下命令进行安装" msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过下命令进行安装"
#: ../../getting_started/install/llm/llama/llama_cpp.md:33 #: ../../getting_started/install/llm/llama/llama_cpp.md:33
#: 75b75c84ffb7476d8501a28bb2719615 #: b0038a8ba36647c6a62eef907cb6d304
msgid "Modifying the Configuration File" msgid "Modifying the Configuration File"
msgstr "修改配置文件" msgstr "修改配置文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:35 #: ../../getting_started/install/llm/llama/llama_cpp.md:35
#: d1f8b3e1ad3441f2aafbfe2519113c2c #: d2002da716744122a44ab4ed2e47e680
msgid "Next, you can directly modify your `.env` file to enable llama.cpp." msgid "Next, you can directly modify your `.env` file to enable llama.cpp."
msgstr "修改`.env`文件使用llama.cpp" msgstr "修改`.env`文件使用llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:42 #: ../../getting_started/install/llm/llama/llama_cpp.md:42
#: 2ddcab3834f646e58a8b3316abf6ce3a #: 97a5fb5d4ed649f5aa0bbb97c32d54b0
msgid "" msgid ""
"Then you can run it according to [Run](https://db-" "Then you can run it according to [Run](https://db-"
"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run)." "gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run)."
msgstr "" msgstr ""
"然后你可以通过[Run](https://db-" "然后你可以根据[运行]"
"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run).来运行" "(https://db-gpt.readthedocs.io/projects/db-gpt-docs-zh-cn/zh_CN/latest/getting_started/install/deploy/deploy.html#run)来运行"
#: ../../getting_started/install/llm/llama/llama_cpp.md:45 #: ../../getting_started/install/llm/llama/llama_cpp.md:45
#: bb9f222d22534827a9fa164b2126d192 #: 0e3771b6aaa141f89c813507f3317bda
msgid "More Configurations" msgid "More Configurations"
msgstr "更多配置文件" msgstr "更多配置文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:47 #: ../../getting_started/install/llm/llama/llama_cpp.md:47
#: 14d016ad5bad451888d01e24f0ca86d9 #: 0802ba524cd1458298fe6f90ae7f2da1
msgid "" msgid ""
"In DB-GPT, the model configuration can be done through `{model " "In DB-GPT, the model configuration can be done through `{model "
"name}_{config key}`." "name}_{config key}`."
msgstr "" msgstr "在DB-GPT中模型配置可以通过`{模型名称}_{配置名}` 来配置。"
"In DB-GPT, the model configuration can be done through `{model "
"name}_{config key}`."
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: a1bf4c1f49bd4d97ac45d4f3aff442c6 #: d461d379a523424fb5885e393498ee14
msgid "Environment Variable Key" msgid "Environment Variable Key"
msgstr "Environment Variable Key" msgstr "环境变量键"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 92692a38219c432fadffb8b3825ce678 #: 0263477d0ddb4914baa0d3584b751086
msgid "default" msgid "default"
msgstr "default" msgstr "默认值"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 72b2d251aa2e4ca09c335b58e1a08de3 #: e5188d0ded6540a0bddb46d480f8b7ac
msgid "Prompt Template Name" msgid "Description"
msgstr "Prompt Template Name" msgstr "描述"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 85a9f89eeb9a4b70b56913354e947329 #: 213b27d0e53d4858b7576dc4f2ab4d7f
msgid "llama_cpp_prompt_template" msgid "llama_cpp_prompt_template"
msgstr "llama_cpp_prompt_template" msgstr "llama_cpp_prompt_template"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 17e9750fbb824dfdaaed5415f6406e35 602016763bb2470d8a8ef700e576407b #: 1cb0320826564a89a3e2f51177f8a6ed 23d93dc7d88e431ba31ff64d239a412f
#: 790caafd5c4c4cecbb4c190745fb994c ceb6c41315ab4c5798ab3c64ee8693eb #: 833d5012411a4ad58b04d50a40a29184 95aa2102191946919158ae668b2e3599
#: cfafab69a2684e27bd55aadfdd4c1575 #: becdd178292a48138dcb445ba3c2a6ec
msgid "None" msgid "None"
msgstr "None" msgstr "None"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 5d02f2d1d5834b1e9e5d6982247fd6c9 #: ac835806c79640aa8cd39edb11d7667c
msgid "" msgid ""
"Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2" "Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2"
",baichuan-chat`, If None, the prompt template is automatically determined" ",baichuan-chat,internlm-chat`, If None, the prompt template is "
" from model path。" "automatically determined from model path。"
msgstr "" msgstr ""
"Prompt template 现在可以支持`zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, " "Prompt template 现在可以支持`zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-"
"如果是None, the prompt template可以自动选择模型路径" "chat,internlm-chat`, 如果是None, 可以根据模型路径来自动获取模型 Prompt template"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 2a95bc11386f45498b3585b194f24c17 #: 41bce5a6bbf2417f8bc40e71c59405ad
msgid "llama_cpp_model_path" msgid "llama_cpp_model_path"
msgstr "llama_cpp_model_path" msgstr "llama_cpp_model_path"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: c02db8a50e7a4df0acb6b75798a3ad4b #: 15df4d19645b40e7a209827f9a325b8f
msgid "Model path" msgid "Model path"
msgstr "Model path" msgstr "模型路径"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 6c92b2ec52634728bcc421670cdda70b #: caf9ddbfb787418d8b167746e3febe8c
msgid "llama_cpp_n_gpu_layers" msgid "llama_cpp_n_gpu_layers"
msgstr "llama_cpp_n_gpu_layers" msgstr "llama_cpp_n_gpu_layers"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 9f1e1b763a0b40d28efd734fe20e1ba7 #: e12e0ed2c01e4d12b41d5da533073c53
msgid "1000000000" msgid "1000000000"
msgstr "1000000000" msgstr "1000000000"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 0f511b7907594c1f9c9818638764f209 #: 1f4a868d3fed4ac78bfa48e13b3a59dc
msgid "" msgid ""
"Number of layers to offload to the GPU, Set this to 1000000000 to offload" "Number of layers to offload to the GPU, Set this to 1000000000 to offload"
" all layers to the GPU. If your GPU VRAM is not enough, you can set a low" " all layers to the GPU. If your GPU VRAM is not enough, you can set a low"
" number, eg: `10`" " number, eg: `10`"
msgstr "要将层转移到GPU上将其设置为1000000000以将所有层转移到GPU上。如果您的GPU VRAM不足可以设置较低的数字例如10。" msgstr "要将多少网络层转移到GPU上将其设置为1000000000以将所有层转移到GPU上。如果您的 GPU 内存不足可以设置较低的数字例如10。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 1ffdfa4eb78d4127b302b6d703852692 #: 306e083489e24f819d67f38e2f155f0f
msgid "llama_cpp_n_threads" msgid "llama_cpp_n_threads"
msgstr "llama_cpp_n_threads" msgstr "llama_cpp_n_threads"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: f14379e7ea16476da403d5085b67db1c #: 0490a543f67f4ecd8588541399846951
msgid "" msgid ""
"Number of threads to use. If None, the number of threads is automatically" "Number of threads to use. If None, the number of threads is automatically"
" determined" " determined"
msgstr "要使用的线程数量。如果为None则线程数量将自动确定。" msgstr "要使用的线程数量。如果为None则线程数量将自动确定。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 41cc1035f6e340e19848452d48a161db #: 2ad3f09e1f894e30ae512e1cd803af52
msgid "llama_cpp_n_batch" msgid "llama_cpp_n_batch"
msgstr "llama_cpp_n_batch" msgstr "llama_cpp_n_batch"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 993c3b9218ee4299beae53bd75a01001 #: c495776868394df5b311087dfc7c55dd
msgid "512" msgid "512"
msgstr "512" msgstr "512"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 0e11d38c9b58478cacdade34de146320 #: b5e69dc488cc4ae78ee9daefcf73c290
msgid "Maximum number of prompt tokens to batch together when calling llama_eval" msgid "Maximum number of prompt tokens to batch together when calling llama_eval"
msgstr "在调用llama_eval时批处理在一起的prompt tokens的最大数量" msgstr "在调用llama_eval时批处理在一起的prompt tokens的最大数量"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 24f5381956d34569aabee4a5d832388b #: 516cfc3ed00c4a6181f37a4649c9f041
msgid "llama_cpp_n_gqa" msgid "llama_cpp_n_gqa"
msgstr "llama_cpp_n_gqa" msgstr "llama_cpp_n_gqa"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 07d05844541c452caaa8d5bf56c3f8a1 #: 51847a305c4341af8614a2ceb7aa658f
msgid "Grouped-query attention. Must be 8 for llama-2 70b." msgid "Grouped-query attention. Must be 8 for llama-2 70b."
msgstr "对于llama-2 70b模型Grouped-query attention必须为8。" msgstr "对于 llama-2 70B 模型Grouped-query attention 必须为8。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 40a1b9750d854bb19dc18b7d530beccf #: 8261108709f341dab19e4fece7682c0c
msgid "llama_cpp_rms_norm_eps" msgid "llama_cpp_rms_norm_eps"
msgstr "llama_cpp_rms_norm_eps" msgstr "llama_cpp_rms_norm_eps"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 6018ee183b9548eabf91e9fc683e7c24 #: 72cc3d9988414f489ddefe3afb332e83
msgid "5e-06" msgid "5e-06"
msgstr "5e-06" msgstr "5e-06"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: eb273c6bcf2c4c47808024008ce230dc #: ebc1baebf57e4009b0fdfa68eb055d80
msgid "5e-6 is a good value for llama-2 models." msgid "5e-6 is a good value for llama-2 models."
msgstr "对于llama-2模型来说5e-6是一个不错的值。" msgstr "对于llama-2模型来说5e-6是一个不错的值。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: f70f3e935b764b6f9544d201ba2aaa05 #: 0cc1199e293741f087c795230d9c8dda
msgid "llama_cpp_cache_capacity" msgid "llama_cpp_cache_capacity"
msgstr "llama_cpp_cache_capacity" msgstr "llama_cpp_cache_capacity"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 70035ec5be244eda9fe93be3df2c66df #: 7d13612da75046b1a3fc0877e229bb91
msgid "Maximum cache capacity. Examples: 2000MiB, 2GiB" msgid "Maximum cache capacity. Examples: 2000MiB, 2GiB"
msgstr "cache capacity最大值. Examples: 2000MiB, 2GiB" msgstr "模型缓存最大值. 例如: 2000MiB, 2GiB"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 164c31b005ae4979938d9bc67e7f2759 #: 53332858d3a8472f8eb59d845c594ffd
msgid "llama_cpp_prefer_cpu" msgid "llama_cpp_prefer_cpu"
msgstr "llama_cpp_prefer_cpu" msgstr "llama_cpp_prefer_cpu"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 28f890f6bee3412e94aeb1326367326e #: 7ff31fe3233a4243840584bc069654cd
msgid "False" msgid "False"
msgstr "False" msgstr "False"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: f8f27b6323384431ba064a720f39f997 #: 62d1dbd4f8254141a697448a7a5f6701
msgid "" msgid ""
"If a GPU is available, it will be preferred by default, unless " "If a GPU is available, it will be preferred by default, unless "
"prefer_cpu=False is configured." "prefer_cpu=False is configured."
msgstr "如果有可用的GPU默认情况下会优先使用GPU除非配置了 prefer_cpu=False。" msgstr "如果有可用的GPU默认情况下会优先使用GPU除非配置了 prefer_cpu=False。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:61 #: ../../getting_started/install/llm/llama/llama_cpp.md:61
#: 0471e56c790047bab422aa47edad0a15 #: 8de97de28d1a40c3b852a1268255ebed
msgid "GPU Acceleration" msgid "GPU Acceleration"
msgstr "GPU 加速" msgstr "GPU 加速"
#: ../../getting_started/install/llm/llama/llama_cpp.md:63 #: ../../getting_started/install/llm/llama/llama_cpp.md:63
#: e95ad40d29004455bebeec8a1a7248c8 #: 8bce74c0ddb5486190ff4d36fd5358be
msgid "" msgid ""
"GPU acceleration is supported by default. If you encounter any issues, " "GPU acceleration is supported by default. If you encounter any issues, "
"you can uninstall the dependent packages with the following command:" "you can uninstall the dependent packages with the following command:"
msgstr "默认情况下支持GPU加速。如果遇到任何问题您可以使用以下命令卸载相关的依赖包" msgstr "默认情况下支持GPU加速。如果遇到任何问题您可以使用以下命令卸载相关的依赖包"
#: ../../getting_started/install/llm/llama/llama_cpp.md:68 #: ../../getting_started/install/llm/llama/llama_cpp.md:68
#: c0caf1420e43437589693ddec96bd50f #: 1f3fe88521614d499cb1d046f8d3c125
msgid "" msgid ""
"Then install `llama-cpp-python` according to the instructions in [llama-" "Then install `llama-cpp-python` according to the instructions in [llama-"
"cpp-python](https://github.com/abetlen/llama-cpp-" "cpp-python](https://github.com/abetlen/llama-cpp-"
@@ -297,24 +298,24 @@ msgstr ""
"python/blob/main/README.md).安装`llama-cpp-python`" "python/blob/main/README.md).安装`llama-cpp-python`"
#: ../../getting_started/install/llm/llama/llama_cpp.md:71 #: ../../getting_started/install/llm/llama/llama_cpp.md:71
#: fe082f65b4e9416c97b18e5005bc0a59 #: fc83106f0a0e4ddfb3c058bec62f4568
msgid "Mac Usage" msgid "Mac Usage"
msgstr "Mac Usage" msgstr "Mac 使用"
#: ../../getting_started/install/llm/llama/llama_cpp.md:73 #: ../../getting_started/install/llm/llama/llama_cpp.md:73
#: 6f30d3fa399f434189fcb03d28a42d2d #: dcf5904a444342c8a768c4da8b777828
msgid "" msgid ""
"Special attention, if you are using Apple Silicon (M1) Mac, it is highly " "Special attention, if you are using Apple Silicon (M1) Mac, it is highly "
"recommended to install arm64 architecture python support, for example:" "recommended to install arm64 architecture python support, for example:"
msgstr "特别注意如果您正在使用苹果芯片M1的Mac电脑强烈建议安装 arm64 架构的 Python 支持,例如:" msgstr "特别注意如果您正在使用苹果芯片M1的Mac电脑强烈建议安装 arm64 架构的 Python 支持,例如:"
#: ../../getting_started/install/llm/llama/llama_cpp.md:80 #: ../../getting_started/install/llm/llama/llama_cpp.md:80
#: 74602bede3c5472fbabc7de47eb2ff7a #: 547369c011a9412589dad1fac7ac3ef9
msgid "Windows Usage" msgid "Windows Usage"
msgstr "Windows使用" msgstr "Windows使用"
#: ../../getting_started/install/llm/llama/llama_cpp.md:82 #: ../../getting_started/install/llm/llama/llama_cpp.md:82
#: ae78332a348b44cb847723a998b98048 #: 506fda57977f4aa8b9fe427e3c66f4d7
msgid "" msgid ""
"The use under the Windows platform has not been rigorously tested and " "The use under the Windows platform has not been rigorously tested and "
"verified, and you are welcome to use it. If you have any problems, you " "verified, and you are welcome to use it. If you have any problems, you "
@@ -323,8 +324,8 @@ msgid ""
"information) directly." "information) directly."
msgstr "" msgstr ""
"在Windows平台上的使用尚未经过严格的测试和验证欢迎您使用。如果您有任何问题可以创建一个[issue](https://github.com" "在Windows平台上的使用尚未经过严格的测试和验证欢迎您使用。如果您有任何问题可以创建一个[issue](https://github.com"
"/eosphoros-ai/DB-GPT/issues)或者[contact us](https://github.com/eosphoros-" "/eosphoros-ai/DB-GPT/issues)或者直接[联系我们](https://github.com/eosphoros-ai"
"ai/DB-GPT/tree/main#contact-information) directly." "/DB-GPT/tree/main#cntact-information)。"
#~ msgid "" #~ msgid ""
#~ "DB-GPT is now supported by " #~ "DB-GPT is now supported by "
@@ -337,3 +338,6 @@ msgstr ""
#~ "cpp-python) through " #~ "cpp-python) through "
#~ "[llama.cpp](https://github.com/ggerganov/llama.cpp)." #~ "[llama.cpp](https://github.com/ggerganov/llama.cpp)."
#~ msgid "Prompt Template Name"
#~ msgstr "Prompt Template Name"

View File

@@ -70,7 +70,8 @@ LLM_MODEL_CONFIG = {
 "baichuan2-13b": os.path.join(MODEL_PATH, "Baichuan2-13B-Chat"),
 # (Llama2 based) We only support WizardLM-13B-V1.2 for now, which is trained from Llama-2 13b, see https://huggingface.co/WizardLM/WizardLM-13B-V1.2
 "wizardlm-13b": os.path.join(MODEL_PATH, "WizardLM-13B-V1.2"),
-"llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.bin"),
+# wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
+"llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.gguf"),
 # https://huggingface.co/internlm/internlm-chat-7b-v1_1, 7b vs 7b-v1.1: https://github.com/InternLM/InternLM/issues/288
 "internlm-7b": os.path.join(MODEL_PATH, "internlm-chat-7b"),
 "internlm-7b-8k": os.path.join(MODEL_PATH, "internlm-chat-7b-8k"),

View File

@@ -382,14 +382,14 @@ class LlamaCppAdapater(BaseLLMAdaper):
 # Just support local model
 return False, None
 if not path.is_file():
-model_paths = list(path.glob("*ggml*.bin"))
+model_paths = list(path.glob("*ggml*.gguf"))
 if not model_paths:
 return False
 model_path = str(model_paths[0])
 logger.warn(
-f"Model path {model_path} is not single file, use first *gglm*.bin model file: {model_path}"
+f"Model path {model_path} is not single file, use first *gglm*.gguf model file: {model_path}"
 )
-if not re.fullmatch(".*ggml.*\.bin", model_path):
+if not re.fullmatch(".*ggml.*\.gguf", model_path):
 return False, None
 return True, model_path
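The adapter only changes its file-extension expectations, but both the glob (`*ggml*.gguf`) and the regex (`.*ggml.*\.gguf`) still require `ggml` to appear in the file name — which is why the docs above ask you to rename downloads to `ggml-model-q4_0.gguf`. A standalone restatement of that check (simplified; the real adapter also returns the resolved model path) makes the behavior easy to try outside DB-GPT:

```python
# Sketch of the path check used by LlamaCppAdapater above; function name is ours.
import re
from pathlib import Path

def is_llama_cpp_model(model_path: str) -> bool:
    path = Path(model_path)
    if path.is_dir():
        # Same rule as the adapter: pick the first *ggml*.gguf file in the directory.
        candidates = list(path.glob("*ggml*.gguf"))
        if not candidates:
            return False
        model_path = str(candidates[0])
    # The name must contain "ggml" and end with ".gguf".
    return re.fullmatch(r".*ggml.*\.gguf", model_path) is not None

print(is_llama_cpp_model("models/ggml-model-q4_0.gguf"))        # True
print(is_llama_cpp_model("models/vicuna-13b-v1.5.Q4_K_M.gguf")) # False (no "ggml" in the name)
```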

View File

@@ -33,7 +33,7 @@ class WorkerManager(ABC):
 """Start worker manager"""
 @abstractmethod
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
 """Stop worker manager"""
 @abstractmethod

View File

@@ -115,14 +115,30 @@ class LocalWorkerManager(WorkerManager):
 for listener in self.start_listeners:
 listener(self)
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
 if not self.run_data.stop_event.is_set():
 logger.info("Stop all workers")
 self.run_data.stop_event.clear()
 stop_tasks = []
-stop_tasks.append(self._stop_all_worker(apply_req=None))
+stop_tasks.append(
+self._stop_all_worker(apply_req=None, ignore_exception=ignore_exception)
+)
 if self.deregister_func:
+# If ignore_exception is True, use exception handling to ignore any exceptions raised from self.deregister_func
+if ignore_exception:
+async def safe_deregister_func(run_data):
+try:
+await self.deregister_func(run_data)
+except Exception as e:
+logger.warning(
+f"Stop worker, ignored exception from deregister_func: {e}"
+)
+stop_tasks.append(safe_deregister_func(self.run_data))
+else:
 stop_tasks.append(self.deregister_func(self.run_data))
 await asyncio.gather(*stop_tasks)
 def after_start(self, listener: Callable[["WorkerManager"], None]):
@@ -424,7 +440,7 @@ class LocalWorkerManager(WorkerManager):
 )
 async def _stop_all_worker(
-self, apply_req: WorkerApplyRequest
+self, apply_req: WorkerApplyRequest, ignore_exception: bool = False
 ) -> WorkerApplyOutput:
 start_time = time.time()
@@ -441,7 +457,19 @@ class LocalWorkerManager(WorkerManager):
 and self.register_func
 and self.deregister_func
 ):
-await self.deregister_func(worker_run_data)
+_deregister_func = self.deregister_func
+if ignore_exception:
+async def safe_deregister_func(run_data):
+try:
+await self.deregister_func(run_data)
+except Exception as e:
+logger.warning(
+f"Stop worker, ignored exception from deregister_func: {e}"
+)
+_deregister_func = safe_deregister_func
+await _deregister_func(worker_run_data)
 await self._apply_worker(apply_req, _stop_worker)
 timecost = time.time() - start_time
@@ -487,8 +515,8 @@ class WorkerManagerAdapter(WorkerManager):
 async def start(self):
 return await self.worker_manager.start()
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
-return await self.worker_manager.stop()
+return await self.worker_manager.stop(ignore_exception=ignore_exception)
 def after_start(self, listener: Callable[["WorkerManager"], None]):
 if listener is not None:
@@ -631,7 +659,9 @@ async def api_model_shutdown(request: WorkerStartupRequest):
 return await worker_manager.model_shutdown(request)
-def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
+def _setup_fastapi(
+worker_params: ModelWorkerParameters, app=None, ignore_exception: bool = False
+):
 if not app:
 app = FastAPI()
 if worker_params.standalone:
@@ -666,7 +696,7 @@ def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
 @app.on_event("shutdown")
 async def startup_event():
-await worker_manager.stop()
+await worker_manager.stop(ignore_exception=ignore_exception)
 return app
@@ -837,7 +867,7 @@ def initialize_worker_manager_in_client(
 worker_params.register = True
 worker_params.port = local_port
 logger.info(f"Worker params: {worker_params}")
-_setup_fastapi(worker_params, app)
+_setup_fastapi(worker_params, app, ignore_exception=True)
 _start_local_worker(worker_manager, worker_params)
 worker_manager.after_start(start_listener)
 _start_local_embedding_worker(
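Both `stop()` and `_stop_all_worker()` apply the same fix: when `ignore_exception` is set, the deregister coroutine is wrapped so that a failing call to the model controller is logged instead of propagating out of shutdown. A self-contained sketch of the pattern (the coroutine names are stand-ins, not DB-GPT APIs):

```python
import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def flaky_deregister(run_data):
    # Stand-in for self.deregister_func: the controller may already be gone at shutdown.
    raise ConnectionError("controller unreachable")

async def stop(run_data, ignore_exception: bool = False):
    deregister = flaky_deregister
    if ignore_exception:
        async def safe_deregister(run_data):
            try:
                await flaky_deregister(run_data)
            except Exception as e:
                logger.warning(f"Stop worker, ignored exception from deregister_func: {e}")
        deregister = safe_deregister
    await deregister(run_data)

asyncio.run(stop({"worker": "llama-cpp"}, ignore_exception=True))    # logs a warning and returns
# asyncio.run(stop({"worker": "llama-cpp"}, ignore_exception=False)) # would raise ConnectionError
```

Passing `ignore_exception=True` only from `initialize_worker_manager_in_client` (via `_setup_fastapi`) keeps strict error reporting for the standalone worker while letting the embedded webserver shut down cleanly.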

View File

@@ -17,7 +17,7 @@ class RemoteWorkerManager(LocalWorkerManager):
 for listener in self.start_listeners:
 listener(self)
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
 pass
 async def _fetch_from_worker(

View File

@@ -44,11 +44,6 @@ static_file_path = os.path.join(os.getcwd(), "server/static")
 CFG = Config()
-def signal_handler():
-print("in order to avoid chroma db atexit problem")
-os._exit(0)
 def swagger_monkey_patch(*args, **kwargs):
 return get_swagger_ui_html(
 *args,
@@ -176,7 +171,6 @@ def run_uvicorn(param: WebWerverParameters):
 port=param.port,
 log_level=logging_str_to_uvicorn_level(param.log_level),
 )
-signal.signal(signal.SIGINT, signal_handler())
 def run_webserver(param: WebWerverParameters = None):
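Note that the removed registration passed `signal_handler()` — the result of calling the function — to `signal.signal`, so the handler body (`print` plus `os._exit(0)`) executed when that line was reached rather than on SIGINT. This commit deletes the handler outright instead of repairing it, relying on the `ignore_exception` shutdown path above. For reference only, a correct registration passes the callable and accepts the `(signum, frame)` arguments:

```python
# For reference only: how a SIGINT handler is normally registered.
import os
import signal

def signal_handler(signum, frame):
    print("in order to avoid chroma db atexit problem")
    os._exit(0)

signal.signal(signal.SIGINT, signal_handler)  # note: no parentheses after the handler name
```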

View File

@@ -14,7 +14,10 @@ from setuptools import find_packages
 with open("README.md", mode="r", encoding="utf-8") as fh:
 long_description = fh.read()
-BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "false").lower() == "true"
+BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "true").lower() == "true"
+LLAMA_CPP_GPU_ACCELERATION = (
+os.getenv("LLAMA_CPP_GPU_ACCELERATION", "true").lower() == "true"
+)
 def parse_requirements(file_name: str) -> List[str]:
@@ -249,21 +252,29 @@ def llama_cpp_python_cuda_requires():
 if not cuda_version:
 print("CUDA not support, use cpu version")
 return
+if not LLAMA_CPP_GPU_ACCELERATION:
+print("Disable GPU acceleration")
+return
+# Supports GPU acceleration
 device = "cu" + cuda_version.replace(".", "")
 os_type, cpu_avx = get_cpu_avx_support()
+print(f"OS: {os_type}, cpu avx: {cpu_avx}")
 supported_os = [OSType.WINDOWS, OSType.LINUX]
 if os_type not in supported_os:
 print(
 f"llama_cpp_python_cuda just support in os: {[r._value_ for r in supported_os]}"
 )
 return
-if cpu_avx == AVXType.AVX2 or AVXType.AVX512:
-cpu_avx = AVXType.AVX
-cpu_avx = cpu_avx._value_
+cpu_device = ""
+if cpu_avx == AVXType.AVX2 or cpu_avx == AVXType.AVX512:
+cpu_device = "avx"
+else:
+cpu_device = "basic"
+device += cpu_device
 base_url = "https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui"
-llama_cpp_version = "0.1.77"
+llama_cpp_version = "0.2.10"
 py_version = "cp310"
-os_pkg_name = "linux_x86_64" if os_type == OSType.LINUX else "win_amd64"
+os_pkg_name = "manylinux_2_31_x86_64" if os_type == OSType.LINUX else "win_amd64"
 extra_index_url = f"{base_url}/llama_cpp_python_cuda-{llama_cpp_version}+{device}-{py_version}-{py_version}-{os_pkg_name}.whl"
 extra_index_url, _ = encode_url(extra_index_url)
 print(f"Install llama_cpp_python_cuda from {extra_index_url}")
@@ -298,7 +309,7 @@ def core_requires():
 "langchain>=0.0.286",
 "SQLAlchemy",
 "pymysql",
-"duckdb",
+"duckdb==0.8.1",
 "duckdb-engine",
 "jsonschema",
 # TODO move transformers to default
@@ -312,7 +323,6 @@ def knowledge_requires():
 """
 setup_spec.extras["knowledge"] = [
 "spacy==3.5.3",
-# "chromadb==0.3.22",
 "chromadb==0.4.10",
 "markdown",
 "bs4",