feat(model): llama.cpp support new GGUF file format (#649)

Close #567 
Close #644
Close #563

**Other**
- Fix an exception raised when stopping DB-GPT
Aries-ckt 2023-10-07 22:50:57 +08:00 committed by GitHub
commit f2427b10f5
10 changed files with 168 additions and 129 deletions

View File

@@ -44,7 +44,7 @@ QUANTIZE_8bit=True
 ## llama-2-70b must be 8
 # llama_cpp_n_gqa=8
 ## Model path
-# llama_cpp_model_path=/data/models/TheBloke/vicuna-7B-v1.5-GGML/vicuna-7b-v1.5.ggmlv3.q4_0.bin
+# llama_cpp_model_path=/data/models/TheBloke/vicuna-13B-v1.5-GGUF/vicuna-13b-v1.5.Q4_K_M.gguf
 #*******************************************************************#
 #** EMBEDDING SETTINGS **#
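The template change above only swaps the example model path from a GGML `.bin` file to a GGUF file. To confirm that a downloaded GGUF file actually loads before wiring it into DB-GPT, a minimal sketch with `llama-cpp-python` (a build new enough for GGUF; the CUDA wheels pinned in `setup.py` below use 0.2.10) could look like this — the path, prompt, and parameter values are placeholders:

```python
# Minimal GGUF smoke test; assumes `pip install "llama-cpp-python>=0.2.10"`.
from llama_cpp import Llama

llm = Llama(
    model_path="models/ggml-model-q4_0.gguf",  # the renamed GGUF file from the docs below
    n_gpu_layers=10,   # mirrors llama_cpp_n_gpu_layers; use a small value if VRAM is tight
    n_ctx=2048,
)
out = llm("Q: What file format does llama.cpp use now? A:", max_tokens=32, stop=["Q:"])
print(out["choices"][0]["text"])
```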

View File

@@ -8,19 +8,19 @@ DB-GPT already supports [llama.cpp](https://github.com/ggerganov/llama.cpp) via
 ### Preparing Model Files
-To use llama.cpp, you need to prepare a ggml format model file, and there are two common ways to obtain it, you can choose either:
+To use llama.cpp, you need to prepare a gguf format model file, and there are two common ways to obtain it, you can choose either:
 1. Download a pre-converted model file.
-Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), you can download the file already converted from [TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.bin`.
+Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5), you can download the file already converted from [TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.gguf`.
 ```bash
-wget https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML/resolve/main/vicuna-7b-v1.5.ggmlv3.q4_K_M.bin -O models/ggml-model-q4_0.bin
+wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
 ```
 2. Convert It Yourself
-You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.bin`.
+You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.gguf`.
 ### Installing Dependencies
@@ -46,9 +46,9 @@ Then you can run it according to [Run](https://db-gpt.readthedocs.io/en/latest/g
 In DB-GPT, the model configuration can be done through `{model name}_{config key}`.
-| Environment Variable Key | default | Prompt Template Name|
+| Environment Variable Key | default | Description |
 |----------|-----------| ----------- |
-| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, If None, the prompt template is automatically determined from model path。 |
+| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-chat,internlm-chat`, If None, the prompt template is automatically determined from model path。 |
 | llama_cpp_model_path | None | Model path |
 | llama_cpp_n_gpu_layers | 1000000000 |Number of layers to offload to the GPU, Set this to 1000000000 to offload all layers to the GPU. If your GPU VRAM is not enough, you can set a low number, eg: `10` |
 | llama_cpp_n_threads | None | Number of threads to use. If None, the number of threads is automatically determined |
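The table documents the `{model name}_{config key}` convention but stops short of a concrete lookup. A hypothetical helper (not DB-GPT's actual parameter loader) that resolves those keys from the environment, using the defaults listed above, might look like:

```python
import os

def llama_cpp_setting(key: str, default=None):
    """Illustrative only: resolve a `{model name}_{config key}` value from the
    environment, e.g. `llama_cpp_prompt_template` or `llama_cpp_n_gpu_layers`."""
    return os.getenv(f"llama_cpp_{key}", default)

prompt_template = llama_cpp_setting("prompt_template")            # None -> auto-detect from model path
model_path = llama_cpp_setting("model_path")                      # default None per the table
n_gpu_layers = int(llama_cpp_setting("n_gpu_layers", 1000000000)) # 1000000000 offloads all layers
```

In practice these keys live in the `.env` file, as in the template hunk at the top of this commit, e.g. `llama_cpp_model_path=/data/models/TheBloke/vicuna-13B-v1.5-GGUF/vicuna-13b-v1.5.Q4_K_M.gguf`.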

View File

@@ -8,7 +8,7 @@ msgid ""
msgstr "" msgstr ""
"Project-Id-Version: DB-GPT 👏👏 0.3.5\n" "Project-Id-Version: DB-GPT 👏👏 0.3.5\n"
"Report-Msgid-Bugs-To: \n" "Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-08-21 16:59+0800\n" "POT-Creation-Date: 2023-10-07 20:28+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n" "Language: zh_CN\n"
@@ -20,274 +20,275 @@ msgstr ""
"Generated-By: Babel 2.12.1\n" "Generated-By: Babel 2.12.1\n"
#: ../../getting_started/install/llm/llama/llama_cpp.md:1 #: ../../getting_started/install/llm/llama/llama_cpp.md:1
#: 24d5c21cd8b44f1d8585ba5c83e34acc #: 95a9a605d97346fb98e0c0977524d354
msgid "llama.cpp" msgid "llama.cpp"
msgstr "llama.cpp" msgstr "llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:5 #: ../../getting_started/install/llm/llama/llama_cpp.md:5
#: 56969ff863d949aa8df55d3bdb6957e7 #: ebe3be273a42492d9832512554b4b7dc
msgid "" msgid ""
"DB-GPT already supports " "DB-GPT already supports "
"[llama.cpp](https://github.com/ggerganov/llama.cpp) via [llama-cpp-" "[llama.cpp](https://github.com/ggerganov/llama.cpp) via [llama-cpp-"
"python](https://github.com/abetlen/llama-cpp-python)." "python](https://github.com/abetlen/llama-cpp-python)."
msgstr "" msgstr ""
"DB-GPT已经通过[llama-cpp-python](https://github.com/abetlen/llama-cpp-"
"python)支持[llama.cpp](https://github.com/ggerganov/llama.cpp)。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:7 #: ../../getting_started/install/llm/llama/llama_cpp.md:7
#: afe223eafcc641779e1580cac574c34a #: 97a4f6f95d6845258e3753803fc117a3
msgid "Running llama.cpp" msgid "Running llama.cpp"
msgstr "运行 llama.cpp" msgstr "运行 llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:9 #: ../../getting_started/install/llm/llama/llama_cpp.md:9
#: 0eaf98a036434eecb2af1fa89f045620 #: 40fcdf93fe3d4542bbd84ed2d5a82623
msgid "Preparing Model Files" msgid "Preparing Model Files"
msgstr "准备模型文件" msgstr "准备模型文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:11 #: ../../getting_started/install/llm/llama/llama_cpp.md:11
#: 4f45be5d9658451fb95f1d5d31dc8778 #: f10bd034d24640d3b83572d50b2a9f71
msgid "" msgid ""
"To use llama.cpp, you need to prepare a ggml format model file, and there" "To use llama.cpp, you need to prepare a gguf format model file, and there"
" are two common ways to obtain it, you can choose either:" " are two common ways to obtain it, you can choose either:"
msgstr "使用llama.cpp, 你需要准备ggml格式的文件,你可以通过以下两种方法获取" msgstr "使用 llama.cpp你需要准备 gguf 格式的文件,你可以通过以下两种方法获取"
#: ../../getting_started/install/llm/llama/llama_cpp.md:13 #: ../../getting_started/install/llm/llama/llama_cpp.md:13
#: 9934596e0f6e466aae63cefbb019e0ec #: fb143586b13849f0bb2b6ae0c9408e95
msgid "Download a pre-converted model file." msgid "Download a pre-converted model file."
msgstr "Download a pre-converted model file." msgstr "下载已转换的模型文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:15 #: ../../getting_started/install/llm/llama/llama_cpp.md:15
#: 33fef76961064a5ca4c86c57111c8bd3 #: a6e89c960ebd4778b8fc72d3d43e9543
msgid "" msgid ""
"Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys" "Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys"
"/vicuna-7b-v1.5), you can download the file already converted from " "/vicuna-13b-v1.5), you can download the file already converted from "
"[TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-" "[TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
"7B-v1.5-GGML), only one file is needed. Download it to the `models` " "13B-v1.5-GGUF), only one file is needed. Download it to the `models` "
"directory and rename it to `ggml-model-q4_0.bin`." "directory and rename it to `ggml-model-q4_0.gguf`."
msgstr "" msgstr ""
"假设您想使用[Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-" "假设您想使用[Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-"
"7b-v1.5)您可以从[TheBloke/vicuna-" "13b-v1.5)您可以从[TheBloke/vicuna-"
"7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-" "13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
"7B-v1.5-GGML)下载已转换的文件只需要一个文件。将其下载到models目录并将其重命名为ggml-model-q4_0.bin。" "13B-v1.5-GGUF)下载已转换的文件只需要一个文件。将其下载到models目录并将其重命名为 `ggml-"
"model-q4_0.gguf`。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:21 #: ../../getting_started/install/llm/llama/llama_cpp.md:21
#: 65fed5b7e95b4205b2b94596a21b6fe8 #: 380ebad2c5a04210a48c5d7a9913413d
msgid "Convert It Yourself" msgid "Convert It Yourself"
msgstr "Convert It Yourself" msgstr "自行转换"
#: ../../getting_started/install/llm/llama/llama_cpp.md:23 #: ../../getting_started/install/llm/llama/llama_cpp.md:23
#: 1421761d320046f79f725e64bd7d854c #: cf39ca73d9c6456794fb240b164b7cbb
msgid "" msgid ""
"You can convert the model file yourself according to the instructions in " "You can convert the model file yourself according to the instructions in "
"[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp" "[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
"#prepare-data--run), and put the converted file in the models directory " "#prepare-data--run), and put the converted file in the models directory "
"and rename it to `ggml-model-q4_0.bin`." "and rename it to `ggml-model-q4_0.gguf`."
msgstr "" msgstr ""
"您可以根据[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp" "您可以根据[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
"#prepare-data--run)中的说明自己转换模型文件然后将转换后的文件放入models目录中并将其重命名为ggml-" "#prepare-data--run)中的说明自行转换模型文件并把转换后的文件放在models目录中并重命名为`ggml-"
"model-q4_0.bin。" "model-q4_0.gguf`。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:25 #: ../../getting_started/install/llm/llama/llama_cpp.md:25
#: 850b1f8ef6be49b192e01c1b7d8f1f26 #: 363cbf1c0b4e4029982519238f776958
msgid "Installing Dependencies" msgid "Installing Dependencies"
msgstr "安装依赖" msgstr "安装依赖"
#: ../../getting_started/install/llm/llama/llama_cpp.md:27 #: ../../getting_started/install/llm/llama/llama_cpp.md:27
#: b323ee4799d745cc9c0a449bd37c371a #: a98c36e3d7df40f3a816c0ee451b6114
msgid "" msgid ""
"llama.cpp is an optional dependency in DB-GPT, and you can manually " "llama.cpp is an optional dependency in DB-GPT, and you can manually "
"install it using the following command:" "install it using the following command:"
msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过下命令进行安装" msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过下命令进行安装"
#: ../../getting_started/install/llm/llama/llama_cpp.md:33 #: ../../getting_started/install/llm/llama/llama_cpp.md:33
#: 75b75c84ffb7476d8501a28bb2719615 #: b0038a8ba36647c6a62eef907cb6d304
msgid "Modifying the Configuration File" msgid "Modifying the Configuration File"
msgstr "修改配置文件" msgstr "修改配置文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:35 #: ../../getting_started/install/llm/llama/llama_cpp.md:35
#: d1f8b3e1ad3441f2aafbfe2519113c2c #: d2002da716744122a44ab4ed2e47e680
msgid "Next, you can directly modify your `.env` file to enable llama.cpp." msgid "Next, you can directly modify your `.env` file to enable llama.cpp."
msgstr "修改`.env`文件使用llama.cpp" msgstr "修改`.env`文件使用llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:42 #: ../../getting_started/install/llm/llama/llama_cpp.md:42
#: 2ddcab3834f646e58a8b3316abf6ce3a #: 97a5fb5d4ed649f5aa0bbb97c32d54b0
msgid "" msgid ""
"Then you can run it according to [Run](https://db-" "Then you can run it according to [Run](https://db-"
"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run)." "gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run)."
msgstr "" msgstr ""
"然后你可以通过[Run](https://db-" "然后你可以根据[运行]"
"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run).来运行" "(https://db-gpt.readthedocs.io/projects/db-gpt-docs-zh-cn/zh_CN/latest/getting_started/install/deploy/deploy.html#run)来运行"
#: ../../getting_started/install/llm/llama/llama_cpp.md:45 #: ../../getting_started/install/llm/llama/llama_cpp.md:45
#: bb9f222d22534827a9fa164b2126d192 #: 0e3771b6aaa141f89c813507f3317bda
msgid "More Configurations" msgid "More Configurations"
msgstr "更多配置文件" msgstr "更多配置文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:47 #: ../../getting_started/install/llm/llama/llama_cpp.md:47
#: 14d016ad5bad451888d01e24f0ca86d9 #: 0802ba524cd1458298fe6f90ae7f2da1
msgid "" msgid ""
"In DB-GPT, the model configuration can be done through `{model " "In DB-GPT, the model configuration can be done through `{model "
"name}_{config key}`." "name}_{config key}`."
msgstr "" msgstr "在DB-GPT中模型配置可以通过`{模型名称}_{配置名}` 来配置。"
"In DB-GPT, the model configuration can be done through `{model "
"name}_{config key}`."
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: a1bf4c1f49bd4d97ac45d4f3aff442c6 #: d461d379a523424fb5885e393498ee14
msgid "Environment Variable Key" msgid "Environment Variable Key"
msgstr "Environment Variable Key" msgstr "环境变量键"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 92692a38219c432fadffb8b3825ce678 #: 0263477d0ddb4914baa0d3584b751086
msgid "default" msgid "default"
msgstr "default" msgstr "默认值"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 72b2d251aa2e4ca09c335b58e1a08de3 #: e5188d0ded6540a0bddb46d480f8b7ac
msgid "Prompt Template Name" msgid "Description"
msgstr "Prompt Template Name" msgstr "描述"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 85a9f89eeb9a4b70b56913354e947329 #: 213b27d0e53d4858b7576dc4f2ab4d7f
msgid "llama_cpp_prompt_template" msgid "llama_cpp_prompt_template"
msgstr "llama_cpp_prompt_template" msgstr "llama_cpp_prompt_template"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 17e9750fbb824dfdaaed5415f6406e35 602016763bb2470d8a8ef700e576407b #: 1cb0320826564a89a3e2f51177f8a6ed 23d93dc7d88e431ba31ff64d239a412f
#: 790caafd5c4c4cecbb4c190745fb994c ceb6c41315ab4c5798ab3c64ee8693eb #: 833d5012411a4ad58b04d50a40a29184 95aa2102191946919158ae668b2e3599
#: cfafab69a2684e27bd55aadfdd4c1575 #: becdd178292a48138dcb445ba3c2a6ec
msgid "None" msgid "None"
msgstr "None" msgstr "None"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 5d02f2d1d5834b1e9e5d6982247fd6c9 #: ac835806c79640aa8cd39edb11d7667c
msgid "" msgid ""
"Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2" "Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2"
",baichuan-chat`, If None, the prompt template is automatically determined" ",baichuan-chat,internlm-chat`, If None, the prompt template is "
" from model path。" "automatically determined from model path。"
msgstr "" msgstr ""
"Prompt template 现在可以支持`zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, " "Prompt template 现在可以支持`zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-"
"如果是None, the prompt template可以自动选择模型路径" "chat,internlm-chat`, 如果是None, 可以根据模型路径来自动获取模型 Prompt template"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 2a95bc11386f45498b3585b194f24c17 #: 41bce5a6bbf2417f8bc40e71c59405ad
msgid "llama_cpp_model_path" msgid "llama_cpp_model_path"
msgstr "llama_cpp_model_path" msgstr "llama_cpp_model_path"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: c02db8a50e7a4df0acb6b75798a3ad4b #: 15df4d19645b40e7a209827f9a325b8f
msgid "Model path" msgid "Model path"
msgstr "Model path" msgstr "模型路径"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 6c92b2ec52634728bcc421670cdda70b #: caf9ddbfb787418d8b167746e3febe8c
msgid "llama_cpp_n_gpu_layers" msgid "llama_cpp_n_gpu_layers"
msgstr "llama_cpp_n_gpu_layers" msgstr "llama_cpp_n_gpu_layers"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 9f1e1b763a0b40d28efd734fe20e1ba7 #: e12e0ed2c01e4d12b41d5da533073c53
msgid "1000000000" msgid "1000000000"
msgstr "1000000000" msgstr "1000000000"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 0f511b7907594c1f9c9818638764f209 #: 1f4a868d3fed4ac78bfa48e13b3a59dc
msgid "" msgid ""
"Number of layers to offload to the GPU, Set this to 1000000000 to offload" "Number of layers to offload to the GPU, Set this to 1000000000 to offload"
" all layers to the GPU. If your GPU VRAM is not enough, you can set a low" " all layers to the GPU. If your GPU VRAM is not enough, you can set a low"
" number, eg: `10`" " number, eg: `10`"
msgstr "要将层转移到GPU上将其设置为1000000000以将所有层转移到GPU上。如果您的GPU VRAM不足可以设置较低的数字例如10。" msgstr "要将多少网络层转移到GPU上将其设置为1000000000以将所有层转移到GPU上。如果您的 GPU 内存不足可以设置较低的数字例如10。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 1ffdfa4eb78d4127b302b6d703852692 #: 306e083489e24f819d67f38e2f155f0f
msgid "llama_cpp_n_threads" msgid "llama_cpp_n_threads"
msgstr "llama_cpp_n_threads" msgstr "llama_cpp_n_threads"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: f14379e7ea16476da403d5085b67db1c #: 0490a543f67f4ecd8588541399846951
msgid "" msgid ""
"Number of threads to use. If None, the number of threads is automatically" "Number of threads to use. If None, the number of threads is automatically"
" determined" " determined"
msgstr "要使用的线程数量。如果为None则线程数量将自动确定。" msgstr "要使用的线程数量。如果为None则线程数量将自动确定。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 41cc1035f6e340e19848452d48a161db #: 2ad3f09e1f894e30ae512e1cd803af52
msgid "llama_cpp_n_batch" msgid "llama_cpp_n_batch"
msgstr "llama_cpp_n_batch" msgstr "llama_cpp_n_batch"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 993c3b9218ee4299beae53bd75a01001 #: c495776868394df5b311087dfc7c55dd
msgid "512" msgid "512"
msgstr "512" msgstr "512"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 0e11d38c9b58478cacdade34de146320 #: b5e69dc488cc4ae78ee9daefcf73c290
msgid "Maximum number of prompt tokens to batch together when calling llama_eval" msgid "Maximum number of prompt tokens to batch together when calling llama_eval"
msgstr "在调用llama_eval时批处理在一起的prompt tokens的最大数量" msgstr "在调用llama_eval时批处理在一起的prompt tokens的最大数量"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 24f5381956d34569aabee4a5d832388b #: 516cfc3ed00c4a6181f37a4649c9f041
msgid "llama_cpp_n_gqa" msgid "llama_cpp_n_gqa"
msgstr "llama_cpp_n_gqa" msgstr "llama_cpp_n_gqa"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 07d05844541c452caaa8d5bf56c3f8a1 #: 51847a305c4341af8614a2ceb7aa658f
msgid "Grouped-query attention. Must be 8 for llama-2 70b." msgid "Grouped-query attention. Must be 8 for llama-2 70b."
msgstr "对于llama-2 70b模型Grouped-query attention必须为8。" msgstr "对于 llama-2 70B 模型Grouped-query attention 必须为8。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 40a1b9750d854bb19dc18b7d530beccf #: 8261108709f341dab19e4fece7682c0c
msgid "llama_cpp_rms_norm_eps" msgid "llama_cpp_rms_norm_eps"
msgstr "llama_cpp_rms_norm_eps" msgstr "llama_cpp_rms_norm_eps"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 6018ee183b9548eabf91e9fc683e7c24 #: 72cc3d9988414f489ddefe3afb332e83
msgid "5e-06" msgid "5e-06"
msgstr "5e-06" msgstr "5e-06"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: eb273c6bcf2c4c47808024008ce230dc #: ebc1baebf57e4009b0fdfa68eb055d80
msgid "5e-6 is a good value for llama-2 models." msgid "5e-6 is a good value for llama-2 models."
msgstr "对于llama-2模型来说5e-6是一个不错的值。" msgstr "对于llama-2模型来说5e-6是一个不错的值。"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: f70f3e935b764b6f9544d201ba2aaa05 #: 0cc1199e293741f087c795230d9c8dda
msgid "llama_cpp_cache_capacity" msgid "llama_cpp_cache_capacity"
msgstr "llama_cpp_cache_capacity" msgstr "llama_cpp_cache_capacity"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 70035ec5be244eda9fe93be3df2c66df #: 7d13612da75046b1a3fc0877e229bb91
msgid "Maximum cache capacity. Examples: 2000MiB, 2GiB" msgid "Maximum cache capacity. Examples: 2000MiB, 2GiB"
msgstr "cache capacity最大值. Examples: 2000MiB, 2GiB" msgstr "模型缓存最大值. 例如: 2000MiB, 2GiB"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 164c31b005ae4979938d9bc67e7f2759 #: 53332858d3a8472f8eb59d845c594ffd
msgid "llama_cpp_prefer_cpu" msgid "llama_cpp_prefer_cpu"
msgstr "llama_cpp_prefer_cpu" msgstr "llama_cpp_prefer_cpu"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: 28f890f6bee3412e94aeb1326367326e #: 7ff31fe3233a4243840584bc069654cd
msgid "False" msgid "False"
msgstr "False" msgstr "False"
#: ../../getting_started/install/llm/llama/llama_cpp.md #: ../../getting_started/install/llm/llama/llama_cpp.md
#: f8f27b6323384431ba064a720f39f997 #: 62d1dbd4f8254141a697448a7a5f6701
msgid "" msgid ""
"If a GPU is available, it will be preferred by default, unless " "If a GPU is available, it will be preferred by default, unless "
"prefer_cpu=False is configured." "prefer_cpu=False is configured."
msgstr "如果有可用的GPU默认情况下会优先使用GPU除非配置了 prefer_cpu=False。" msgstr "如果有可用的GPU默认情况下会优先使用GPU除非配置了 prefer_cpu=False。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:61 #: ../../getting_started/install/llm/llama/llama_cpp.md:61
#: 0471e56c790047bab422aa47edad0a15 #: 8de97de28d1a40c3b852a1268255ebed
msgid "GPU Acceleration" msgid "GPU Acceleration"
msgstr "GPU 加速" msgstr "GPU 加速"
#: ../../getting_started/install/llm/llama/llama_cpp.md:63 #: ../../getting_started/install/llm/llama/llama_cpp.md:63
#: e95ad40d29004455bebeec8a1a7248c8 #: 8bce74c0ddb5486190ff4d36fd5358be
msgid "" msgid ""
"GPU acceleration is supported by default. If you encounter any issues, " "GPU acceleration is supported by default. If you encounter any issues, "
"you can uninstall the dependent packages with the following command:" "you can uninstall the dependent packages with the following command:"
msgstr "默认情况下支持GPU加速。如果遇到任何问题您可以使用以下命令卸载相关的依赖包" msgstr "默认情况下支持GPU加速。如果遇到任何问题您可以使用以下命令卸载相关的依赖包"
#: ../../getting_started/install/llm/llama/llama_cpp.md:68 #: ../../getting_started/install/llm/llama/llama_cpp.md:68
#: c0caf1420e43437589693ddec96bd50f #: 1f3fe88521614d499cb1d046f8d3c125
msgid "" msgid ""
"Then install `llama-cpp-python` according to the instructions in [llama-" "Then install `llama-cpp-python` according to the instructions in [llama-"
"cpp-python](https://github.com/abetlen/llama-cpp-" "cpp-python](https://github.com/abetlen/llama-cpp-"
@@ -297,24 +298,24 @@ msgstr ""
"python/blob/main/README.md).安装`llama-cpp-python`" "python/blob/main/README.md).安装`llama-cpp-python`"
#: ../../getting_started/install/llm/llama/llama_cpp.md:71 #: ../../getting_started/install/llm/llama/llama_cpp.md:71
#: fe082f65b4e9416c97b18e5005bc0a59 #: fc83106f0a0e4ddfb3c058bec62f4568
msgid "Mac Usage" msgid "Mac Usage"
msgstr "Mac Usage" msgstr "Mac 使用"
#: ../../getting_started/install/llm/llama/llama_cpp.md:73 #: ../../getting_started/install/llm/llama/llama_cpp.md:73
#: 6f30d3fa399f434189fcb03d28a42d2d #: dcf5904a444342c8a768c4da8b777828
msgid "" msgid ""
"Special attention, if you are using Apple Silicon (M1) Mac, it is highly " "Special attention, if you are using Apple Silicon (M1) Mac, it is highly "
"recommended to install arm64 architecture python support, for example:" "recommended to install arm64 architecture python support, for example:"
msgstr "特别注意如果您正在使用苹果芯片M1的Mac电脑强烈建议安装 arm64 架构的 Python 支持,例如:" msgstr "特别注意如果您正在使用苹果芯片M1的Mac电脑强烈建议安装 arm64 架构的 Python 支持,例如:"
#: ../../getting_started/install/llm/llama/llama_cpp.md:80 #: ../../getting_started/install/llm/llama/llama_cpp.md:80
#: 74602bede3c5472fbabc7de47eb2ff7a #: 547369c011a9412589dad1fac7ac3ef9
msgid "Windows Usage" msgid "Windows Usage"
msgstr "Windows使用" msgstr "Windows使用"
#: ../../getting_started/install/llm/llama/llama_cpp.md:82 #: ../../getting_started/install/llm/llama/llama_cpp.md:82
#: ae78332a348b44cb847723a998b98048 #: 506fda57977f4aa8b9fe427e3c66f4d7
msgid "" msgid ""
"The use under the Windows platform has not been rigorously tested and " "The use under the Windows platform has not been rigorously tested and "
"verified, and you are welcome to use it. If you have any problems, you " "verified, and you are welcome to use it. If you have any problems, you "
@@ -323,8 +324,8 @@ msgid ""
"information) directly." "information) directly."
msgstr "" msgstr ""
"在Windows平台上的使用尚未经过严格的测试和验证欢迎您使用。如果您有任何问题可以创建一个[issue](https://github.com" "在Windows平台上的使用尚未经过严格的测试和验证欢迎您使用。如果您有任何问题可以创建一个[issue](https://github.com"
"/eosphoros-ai/DB-GPT/issues)或者[contact us](https://github.com/eosphoros-" "/eosphoros-ai/DB-GPT/issues)或者直接[联系我们](https://github.com/eosphoros-ai"
"ai/DB-GPT/tree/main#contact-information) directly." "/DB-GPT/tree/main#cntact-information)。"
#~ msgid "" #~ msgid ""
#~ "DB-GPT is now supported by " #~ "DB-GPT is now supported by "
@@ -337,3 +338,6 @@ msgstr ""
#~ "cpp-python) through " #~ "cpp-python) through "
#~ "[llama.cpp](https://github.com/ggerganov/llama.cpp)." #~ "[llama.cpp](https://github.com/ggerganov/llama.cpp)."
#~ msgid "Prompt Template Name"
#~ msgstr "Prompt Template Name"

View File

@@ -70,7 +70,8 @@ LLM_MODEL_CONFIG = {
 "baichuan2-13b": os.path.join(MODEL_PATH, "Baichuan2-13B-Chat"),
 # (Llama2 based) We only support WizardLM-13B-V1.2 for now, which is trained from Llama-2 13b, see https://huggingface.co/WizardLM/WizardLM-13B-V1.2
 "wizardlm-13b": os.path.join(MODEL_PATH, "WizardLM-13B-V1.2"),
-"llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.bin"),
+# wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
+"llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.gguf"),
 # https://huggingface.co/internlm/internlm-chat-7b-v1_1, 7b vs 7b-v1.1: https://github.com/InternLM/InternLM/issues/288
 "internlm-7b": os.path.join(MODEL_PATH, "internlm-chat-7b"),
 "internlm-7b-8k": os.path.join(MODEL_PATH, "internlm-chat-7b-8k"),

View File

@@ -382,14 +382,14 @@ class LlamaCppAdapater(BaseLLMAdaper):
 # Just support local model
 return False, None
 if not path.is_file():
-model_paths = list(path.glob("*ggml*.bin"))
+model_paths = list(path.glob("*ggml*.gguf"))
 if not model_paths:
 return False
 model_path = str(model_paths[0])
 logger.warn(
-f"Model path {model_path} is not single file, use first *gglm*.bin model file: {model_path}"
+f"Model path {model_path} is not single file, use first *gglm*.gguf model file: {model_path}"
 )
-if not re.fullmatch(".*ggml.*\.bin", model_path):
+if not re.fullmatch(".*ggml.*\.gguf", model_path):
 return False, None
 return True, model_path
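The adapter only changes its file-extension expectations, but both the glob (`*ggml*.gguf`) and the regex (`.*ggml.*\.gguf`) still require `ggml` to appear in the file name — which is why the docs above ask you to rename downloads to `ggml-model-q4_0.gguf`. A standalone restatement of that check (simplified; the real adapter also returns the resolved model path) makes the behavior easy to try outside DB-GPT:

```python
# Sketch of the path check used by LlamaCppAdapater above; function name is ours.
import re
from pathlib import Path

def is_llama_cpp_model(model_path: str) -> bool:
    path = Path(model_path)
    if path.is_dir():
        # Same rule as the adapter: pick the first *ggml*.gguf file in the directory.
        candidates = list(path.glob("*ggml*.gguf"))
        if not candidates:
            return False
        model_path = str(candidates[0])
    # The name must contain "ggml" and end with ".gguf".
    return re.fullmatch(r".*ggml.*\.gguf", model_path) is not None

print(is_llama_cpp_model("models/ggml-model-q4_0.gguf"))        # True
print(is_llama_cpp_model("models/vicuna-13b-v1.5.Q4_K_M.gguf")) # False (no "ggml" in the name)
```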

View File

@@ -33,7 +33,7 @@ class WorkerManager(ABC):
 """Start worker manager"""
 @abstractmethod
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
 """Stop worker manager"""
 @abstractmethod

View File

@@ -115,14 +115,30 @@ class LocalWorkerManager(WorkerManager):
 for listener in self.start_listeners:
 listener(self)
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
 if not self.run_data.stop_event.is_set():
 logger.info("Stop all workers")
 self.run_data.stop_event.clear()
 stop_tasks = []
-stop_tasks.append(self._stop_all_worker(apply_req=None))
+stop_tasks.append(
+self._stop_all_worker(apply_req=None, ignore_exception=ignore_exception)
+)
 if self.deregister_func:
+# If ignore_exception is True, use exception handling to ignore any exceptions raised from self.deregister_func
+if ignore_exception:
+async def safe_deregister_func(run_data):
+try:
+await self.deregister_func(run_data)
+except Exception as e:
+logger.warning(
+f"Stop worker, ignored exception from deregister_func: {e}"
+)
+stop_tasks.append(safe_deregister_func(self.run_data))
+else:
 stop_tasks.append(self.deregister_func(self.run_data))
 await asyncio.gather(*stop_tasks)
 def after_start(self, listener: Callable[["WorkerManager"], None]):
@@ -424,7 +440,7 @@ class LocalWorkerManager(WorkerManager):
 )
 async def _stop_all_worker(
-self, apply_req: WorkerApplyRequest
+self, apply_req: WorkerApplyRequest, ignore_exception: bool = False
 ) -> WorkerApplyOutput:
 start_time = time.time()
@@ -441,7 +457,19 @@ class LocalWorkerManager(WorkerManager):
 and self.register_func
 and self.deregister_func
 ):
-await self.deregister_func(worker_run_data)
+_deregister_func = self.deregister_func
+if ignore_exception:
+async def safe_deregister_func(run_data):
+try:
+await self.deregister_func(run_data)
+except Exception as e:
+logger.warning(
+f"Stop worker, ignored exception from deregister_func: {e}"
+)
+_deregister_func = safe_deregister_func
+await _deregister_func(worker_run_data)
 await self._apply_worker(apply_req, _stop_worker)
 timecost = time.time() - start_time
@@ -487,8 +515,8 @@ class WorkerManagerAdapter(WorkerManager):
 async def start(self):
 return await self.worker_manager.start()
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
-return await self.worker_manager.stop()
+return await self.worker_manager.stop(ignore_exception=ignore_exception)
 def after_start(self, listener: Callable[["WorkerManager"], None]):
 if listener is not None:
@@ -631,7 +659,9 @@ async def api_model_shutdown(request: WorkerStartupRequest):
 return await worker_manager.model_shutdown(request)
-def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
+def _setup_fastapi(
+worker_params: ModelWorkerParameters, app=None, ignore_exception: bool = False
+):
 if not app:
 app = FastAPI()
 if worker_params.standalone:
@@ -666,7 +696,7 @@ def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
 @app.on_event("shutdown")
 async def startup_event():
-await worker_manager.stop()
+await worker_manager.stop(ignore_exception=ignore_exception)
 return app
@@ -837,7 +867,7 @@ def initialize_worker_manager_in_client(
 worker_params.register = True
 worker_params.port = local_port
 logger.info(f"Worker params: {worker_params}")
-_setup_fastapi(worker_params, app)
+_setup_fastapi(worker_params, app, ignore_exception=True)
 _start_local_worker(worker_manager, worker_params)
 worker_manager.after_start(start_listener)
 _start_local_embedding_worker(
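Both `stop()` and `_stop_all_worker()` apply the same fix: when `ignore_exception` is set, the deregister coroutine is wrapped so that a failing call to the model controller is logged instead of propagating out of shutdown. A self-contained sketch of the pattern (the coroutine names are stand-ins, not DB-GPT APIs):

```python
import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def flaky_deregister(run_data):
    # Stand-in for self.deregister_func: the controller may already be gone at shutdown.
    raise ConnectionError("controller unreachable")

async def stop(run_data, ignore_exception: bool = False):
    deregister = flaky_deregister
    if ignore_exception:
        async def safe_deregister(run_data):
            try:
                await flaky_deregister(run_data)
            except Exception as e:
                logger.warning(f"Stop worker, ignored exception from deregister_func: {e}")
        deregister = safe_deregister
    await deregister(run_data)

asyncio.run(stop({"worker": "llama-cpp"}, ignore_exception=True))    # logs a warning and returns
# asyncio.run(stop({"worker": "llama-cpp"}, ignore_exception=False)) # would raise ConnectionError
```

Passing `ignore_exception=True` only from `initialize_worker_manager_in_client` (via `_setup_fastapi`) keeps strict error reporting for the standalone worker while letting the embedded webserver shut down cleanly.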

View File

@@ -17,7 +17,7 @@ class RemoteWorkerManager(LocalWorkerManager):
 for listener in self.start_listeners:
 listener(self)
-async def stop(self):
+async def stop(self, ignore_exception: bool = False):
 pass
 async def _fetch_from_worker(

View File

@@ -44,11 +44,6 @@ static_file_path = os.path.join(os.getcwd(), "server/static")
 CFG = Config()
-def signal_handler():
-print("in order to avoid chroma db atexit problem")
-os._exit(0)
 def swagger_monkey_patch(*args, **kwargs):
 return get_swagger_ui_html(
 *args,
@@ -176,7 +171,6 @@ def run_uvicorn(param: WebWerverParameters):
 port=param.port,
 log_level=logging_str_to_uvicorn_level(param.log_level),
 )
-signal.signal(signal.SIGINT, signal_handler())
 def run_webserver(param: WebWerverParameters = None):
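Note that the removed registration passed `signal_handler()` — the result of calling the function — to `signal.signal`, so the handler body (`print` plus `os._exit(0)`) executed when that line was reached rather than on SIGINT. This commit deletes the handler outright instead of repairing it, relying on the `ignore_exception` shutdown path above. For reference only, a correct registration passes the callable and accepts the `(signum, frame)` arguments:

```python
# For reference only: how a SIGINT handler is normally registered.
import os
import signal

def signal_handler(signum, frame):
    print("in order to avoid chroma db atexit problem")
    os._exit(0)

signal.signal(signal.SIGINT, signal_handler)  # note: no parentheses after the handler name
```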

View File

@@ -14,7 +14,10 @@ from setuptools import find_packages
 with open("README.md", mode="r", encoding="utf-8") as fh:
 long_description = fh.read()
-BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "false").lower() == "true"
+BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "true").lower() == "true"
+LLAMA_CPP_GPU_ACCELERATION = (
+os.getenv("LLAMA_CPP_GPU_ACCELERATION", "true").lower() == "true"
+)
 def parse_requirements(file_name: str) -> List[str]:
@@ -249,21 +252,29 @@ def llama_cpp_python_cuda_requires():
 if not cuda_version:
 print("CUDA not support, use cpu version")
 return
+if not LLAMA_CPP_GPU_ACCELERATION:
+print("Disable GPU acceleration")
+return
+# Supports GPU acceleration
 device = "cu" + cuda_version.replace(".", "")
 os_type, cpu_avx = get_cpu_avx_support()
+print(f"OS: {os_type}, cpu avx: {cpu_avx}")
 supported_os = [OSType.WINDOWS, OSType.LINUX]
 if os_type not in supported_os:
 print(
 f"llama_cpp_python_cuda just support in os: {[r._value_ for r in supported_os]}"
 )
 return
-if cpu_avx == AVXType.AVX2 or AVXType.AVX512:
-cpu_avx = AVXType.AVX
-cpu_avx = cpu_avx._value_
+cpu_device = ""
+if cpu_avx == AVXType.AVX2 or cpu_avx == AVXType.AVX512:
+cpu_device = "avx"
+else:
+cpu_device = "basic"
+device += cpu_device
 base_url = "https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui"
-llama_cpp_version = "0.1.77"
+llama_cpp_version = "0.2.10"
 py_version = "cp310"
-os_pkg_name = "linux_x86_64" if os_type == OSType.LINUX else "win_amd64"
+os_pkg_name = "manylinux_2_31_x86_64" if os_type == OSType.LINUX else "win_amd64"
 extra_index_url = f"{base_url}/llama_cpp_python_cuda-{llama_cpp_version}+{device}-{py_version}-{py_version}-{os_pkg_name}.whl"
 extra_index_url, _ = encode_url(extra_index_url)
 print(f"Install llama_cpp_python_cuda from {extra_index_url}")
@@ -298,7 +309,7 @@ def core_requires():
 "langchain>=0.0.286",
 "SQLAlchemy",
 "pymysql",
-"duckdb",
+"duckdb==0.8.1",
 "duckdb-engine",
 "jsonschema",
 # TODO move transformers to default
@@ -312,7 +323,6 @@ def knowledge_requires():
 """
 setup_spec.extras["knowledge"] = [
 "spacy==3.5.3",
-# "chromadb==0.3.22",
 "chromadb==0.4.10",
 "markdown",
 "bs4",