feat(model): llama.cpp support new GGUF file format (#649)

Close #567 
Close #644
Close #563

**Other**
- Fix exception raised when stopping DB-GPT
Aries-ckt 2023-10-07 22:50:57 +08:00 committed by GitHub
commit f2427b10f5
10 changed files with 168 additions and 129 deletions


@ -44,7 +44,7 @@ QUANTIZE_8bit=True
## llama-2-70b must be 8
# llama_cpp_n_gqa=8
## Model path
# llama_cpp_model_path=/data/models/TheBloke/vicuna-7B-v1.5-GGML/vicuna-7b-v1.5.ggmlv3.q4_0.bin
# llama_cpp_model_path=/data/models/TheBloke/vicuna-13B-v1.5-GGUF/vicuna-13b-v1.5.Q4_K_M.gguf
#*******************************************************************#
#** EMBEDDING SETTINGS **#


@ -8,19 +8,19 @@ DB-GPT already supports [llama.cpp](https://github.com/ggerganov/llama.cpp) via
### Preparing Model Files
To use llama.cpp, you need to prepare a ggml format model file, and there are two common ways to obtain it, you can choose either:
To use llama.cpp, you need to prepare a model file in GGUF format. There are two common ways to obtain one; you can choose either:
1. Download a pre-converted model file.
Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), you can download the file already converted from [TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.bin`.
Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5). You can download the already-converted file from [TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF); only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.gguf`.
```bash
wget https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML/resolve/main/vicuna-7b-v1.5.ggmlv3.q4_K_M.bin -O models/ggml-model-q4_0.bin
wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
```
2. Convert It Yourself
You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.bin`.
You can convert the model file yourself, following the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), then put the converted file in the `models` directory and rename it to `ggml-model-q4_0.gguf`.
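Whichever way you obtain the file, a quick sanity check that it really is GGUF can save a confusing startup error: GGUF files begin with the 4-byte magic `GGUF`. A minimal sketch (the path assumes the rename suggested above):
```python
# Minimal sanity check: GGUF files begin with the 4-byte magic b"GGUF".
# The path below assumes you renamed the file as described above.
from pathlib import Path

model_file = Path("models/ggml-model-q4_0.gguf")
with model_file.open("rb") as f:
    magic = f.read(4)
if magic != b"GGUF":
    raise SystemExit(f"{model_file} does not look like a GGUF file (magic={magic!r})")
print(f"{model_file} looks like a valid GGUF file")
```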
### Installing Dependencies
@ -46,9 +46,9 @@ Then you can run it according to [Run](https://db-gpt.readthedocs.io/en/latest/g
In DB-GPT, the model configuration can be done through `{model name}_{config key}`.
| Environment Variable Key | default | Prompt Template Name|
| Environment Variable Key | default | Description |
|----------|-----------| ----------- |
| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, If None, the prompt template is automatically determined from model path。 |
| llama_cpp_prompt_template | None | Prompt template name. Supported values: `zero_shot, vicuna_v1.1, alpaca, llama-2, baichuan-chat, internlm-chat`. If None, the prompt template is automatically determined from the model path. |
| llama_cpp_model_path | None | Model path |
| llama_cpp_n_gpu_layers | 1000000000 | Number of layers to offload to the GPU. Set this to 1000000000 to offload all layers. If your GPU VRAM is not enough, you can set a lower number, e.g. `10`. |
| llama_cpp_n_threads | None | Number of threads to use. If None, the number of threads is automatically determined |
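As an illustration of the `{model name}_{config key}` convention, here is a minimal sketch of how such keys could be resolved from the environment; the helper below is hypothetical and not DB-GPT's actual parameter loader:
```python
# Hypothetical illustration of the `{model name}_{config key}` convention.
# DB-GPT's real parameter loading lives elsewhere; this only shows the mapping.
import os
from typing import Optional

def get_model_config(model_name: str, config_key: str, default: Optional[str] = None) -> Optional[str]:
    """Read `{model name}_{config key}` from the environment, e.g. llama_cpp_model_path."""
    env_key = f"{model_name}_{config_key}"
    # .env files may use lower- or upper-case keys, so check both forms.
    return os.getenv(env_key, os.getenv(env_key.upper(), default))

# Example: the keys from the table above for the `llama-cpp` model.
model_path = get_model_config("llama_cpp", "model_path")
n_gpu_layers = int(get_model_config("llama_cpp", "n_gpu_layers", "1000000000"))
prompt_template = get_model_config("llama_cpp", "prompt_template")  # None -> auto-detect from path
```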


@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: DB-GPT 👏👏 0.3.5\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-08-21 16:59+0800\n"
"POT-Creation-Date: 2023-10-07 20:28+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
@ -20,274 +20,275 @@ msgstr ""
"Generated-By: Babel 2.12.1\n"
#: ../../getting_started/install/llm/llama/llama_cpp.md:1
#: 24d5c21cd8b44f1d8585ba5c83e34acc
#: 95a9a605d97346fb98e0c0977524d354
msgid "llama.cpp"
msgstr "llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:5
#: 56969ff863d949aa8df55d3bdb6957e7
#: ebe3be273a42492d9832512554b4b7dc
msgid ""
"DB-GPT already supports "
"[llama.cpp](https://github.com/ggerganov/llama.cpp) via [llama-cpp-"
"python](https://github.com/abetlen/llama-cpp-python)."
msgstr ""
"DB-GPT已经通过[llama-cpp-python](https://github.com/abetlen/llama-cpp-"
"python)支持[llama.cpp](https://github.com/ggerganov/llama.cpp)。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:7
#: afe223eafcc641779e1580cac574c34a
#: 97a4f6f95d6845258e3753803fc117a3
msgid "Running llama.cpp"
msgstr "运行 llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:9
#: 0eaf98a036434eecb2af1fa89f045620
#: 40fcdf93fe3d4542bbd84ed2d5a82623
msgid "Preparing Model Files"
msgstr "准备模型文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:11
#: 4f45be5d9658451fb95f1d5d31dc8778
#: f10bd034d24640d3b83572d50b2a9f71
msgid ""
"To use llama.cpp, you need to prepare a ggml format model file, and there"
"To use llama.cpp, you need to prepare a gguf format model file, and there"
" are two common ways to obtain it, you can choose either:"
msgstr "使用llama.cpp, 你需要准备ggml格式的文件,你可以通过以下两种方法获取"
msgstr "使用 llama.cpp你需要准备 gguf 格式的文件,你可以通过以下两种方法获取"
#: ../../getting_started/install/llm/llama/llama_cpp.md:13
#: 9934596e0f6e466aae63cefbb019e0ec
#: fb143586b13849f0bb2b6ae0c9408e95
msgid "Download a pre-converted model file."
msgstr "Download a pre-converted model file."
msgstr "下载已转换的模型文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:15
#: 33fef76961064a5ca4c86c57111c8bd3
#: a6e89c960ebd4778b8fc72d3d43e9543
msgid ""
"Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys"
"/vicuna-7b-v1.5), you can download the file already converted from "
"[TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-"
"7B-v1.5-GGML), only one file is needed. Download it to the `models` "
"directory and rename it to `ggml-model-q4_0.bin`."
"Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys"
"/vicuna-13b-v1.5), you can download the file already converted from "
"[TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
"13B-v1.5-GGUF), only one file is needed. Download it to the `models` "
"directory and rename it to `ggml-model-q4_0.gguf`."
msgstr ""
"假设您想使用[Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-"
"7b-v1.5)您可以从[TheBloke/vicuna-"
"7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-"
"7B-v1.5-GGML)下载已转换的文件只需要一个文件。将其下载到models目录并将其重命名为ggml-model-q4_0.bin。"
"假设您想使用[Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-"
"13b-v1.5)您可以从[TheBloke/vicuna-"
"13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
"13B-v1.5-GGUF)下载已转换的文件只需要一个文件。将其下载到models目录并将其重命名为 `ggml-"
"model-q4_0.gguf`。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:21
#: 65fed5b7e95b4205b2b94596a21b6fe8
#: 380ebad2c5a04210a48c5d7a9913413d
msgid "Convert It Yourself"
msgstr "Convert It Yourself"
msgstr "自行转换"
#: ../../getting_started/install/llm/llama/llama_cpp.md:23
#: 1421761d320046f79f725e64bd7d854c
#: cf39ca73d9c6456794fb240b164b7cbb
msgid ""
"You can convert the model file yourself according to the instructions in "
"[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
"#prepare-data--run), and put the converted file in the models directory "
"and rename it to `ggml-model-q4_0.bin`."
"and rename it to `ggml-model-q4_0.gguf`."
msgstr ""
"您可以根据[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
"#prepare-data--run)中的说明自己转换模型文件然后将转换后的文件放入models目录中并将其重命名为ggml-"
"model-q4_0.bin。"
"#prepare-data--run)中的说明自行转换模型文件并把转换后的文件放在models目录中并重命名为`ggml-"
"model-q4_0.gguf`。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:25
#: 850b1f8ef6be49b192e01c1b7d8f1f26
#: 363cbf1c0b4e4029982519238f776958
msgid "Installing Dependencies"
msgstr "安装依赖"
#: ../../getting_started/install/llm/llama/llama_cpp.md:27
#: b323ee4799d745cc9c0a449bd37c371a
#: a98c36e3d7df40f3a816c0ee451b6114
msgid ""
"llama.cpp is an optional dependency in DB-GPT, and you can manually "
"install it using the following command:"
msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过下命令进行安装"
msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过下命令进行安装"
#: ../../getting_started/install/llm/llama/llama_cpp.md:33
#: 75b75c84ffb7476d8501a28bb2719615
#: b0038a8ba36647c6a62eef907cb6d304
msgid "Modifying the Configuration File"
msgstr "修改配置文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:35
#: d1f8b3e1ad3441f2aafbfe2519113c2c
#: d2002da716744122a44ab4ed2e47e680
msgid "Next, you can directly modify your `.env` file to enable llama.cpp."
msgstr "修改`.env`文件使用llama.cpp"
#: ../../getting_started/install/llm/llama/llama_cpp.md:42
#: 2ddcab3834f646e58a8b3316abf6ce3a
#: 97a5fb5d4ed649f5aa0bbb97c32d54b0
msgid ""
"Then you can run it according to [Run](https://db-"
"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run)."
msgstr ""
"然后你可以通过[Run](https://db-"
"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run).来运行"
"然后你可以根据[运行]"
"(https://db-gpt.readthedocs.io/projects/db-gpt-docs-zh-cn/zh_CN/latest/getting_started/install/deploy/deploy.html#run)来运行"
#: ../../getting_started/install/llm/llama/llama_cpp.md:45
#: bb9f222d22534827a9fa164b2126d192
#: 0e3771b6aaa141f89c813507f3317bda
msgid "More Configurations"
msgstr "更多配置文件"
#: ../../getting_started/install/llm/llama/llama_cpp.md:47
#: 14d016ad5bad451888d01e24f0ca86d9
#: 0802ba524cd1458298fe6f90ae7f2da1
msgid ""
"In DB-GPT, the model configuration can be done through `{model "
"name}_{config key}`."
msgstr ""
"In DB-GPT, the model configuration can be done through `{model "
"name}_{config key}`."
msgstr "在DB-GPT中模型配置可以通过`{模型名称}_{配置名}` 来配置。"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: a1bf4c1f49bd4d97ac45d4f3aff442c6
#: d461d379a523424fb5885e393498ee14
msgid "Environment Variable Key"
msgstr "Environment Variable Key"
msgstr "环境变量键"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 92692a38219c432fadffb8b3825ce678
#: 0263477d0ddb4914baa0d3584b751086
msgid "default"
msgstr "default"
msgstr "默认值"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 72b2d251aa2e4ca09c335b58e1a08de3
msgid "Prompt Template Name"
msgstr "Prompt Template Name"
#: e5188d0ded6540a0bddb46d480f8b7ac
msgid "Description"
msgstr "描述"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 85a9f89eeb9a4b70b56913354e947329
#: 213b27d0e53d4858b7576dc4f2ab4d7f
msgid "llama_cpp_prompt_template"
msgstr "llama_cpp_prompt_template"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 17e9750fbb824dfdaaed5415f6406e35 602016763bb2470d8a8ef700e576407b
#: 790caafd5c4c4cecbb4c190745fb994c ceb6c41315ab4c5798ab3c64ee8693eb
#: cfafab69a2684e27bd55aadfdd4c1575
#: 1cb0320826564a89a3e2f51177f8a6ed 23d93dc7d88e431ba31ff64d239a412f
#: 833d5012411a4ad58b04d50a40a29184 95aa2102191946919158ae668b2e3599
#: becdd178292a48138dcb445ba3c2a6ec
msgid "None"
msgstr "None"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 5d02f2d1d5834b1e9e5d6982247fd6c9
#: ac835806c79640aa8cd39edb11d7667c
msgid ""
"Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2"
",baichuan-chat`, If None, the prompt template is automatically determined"
" from model path。"
"Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2"
",baichuan-chat,internlm-chat`, If None, the prompt template is "
"automatically determined from model path。"
msgstr ""
"Prompt template 现在可以支持`zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, "
"如果是None, the prompt template可以自动选择模型路径"
"Prompt template 现在可以支持`zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-"
"chat,internlm-chat`, 如果是None, 可以根据模型路径来自动获取模型 Prompt template"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 2a95bc11386f45498b3585b194f24c17
#: 41bce5a6bbf2417f8bc40e71c59405ad
msgid "llama_cpp_model_path"
msgstr "llama_cpp_model_path"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: c02db8a50e7a4df0acb6b75798a3ad4b
#: 15df4d19645b40e7a209827f9a325b8f
msgid "Model path"
msgstr "Model path"
msgstr "模型路径"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 6c92b2ec52634728bcc421670cdda70b
#: caf9ddbfb787418d8b167746e3febe8c
msgid "llama_cpp_n_gpu_layers"
msgstr "llama_cpp_n_gpu_layers"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 9f1e1b763a0b40d28efd734fe20e1ba7
#: e12e0ed2c01e4d12b41d5da533073c53
msgid "1000000000"
msgstr "1000000000"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 0f511b7907594c1f9c9818638764f209
#: 1f4a868d3fed4ac78bfa48e13b3a59dc
msgid ""
"Number of layers to offload to the GPU, Set this to 1000000000 to offload"
" all layers to the GPU. If your GPU VRAM is not enough, you can set a low"
" number, eg: `10`"
msgstr "要将层转移到GPU上将其设置为1000000000以将所有层转移到GPU上。如果您的GPU VRAM不足可以设置较低的数字例如10。"
msgstr "要将多少网络层转移到GPU上将其设置为1000000000以将所有层转移到GPU上。如果您的 GPU 内存不足可以设置较低的数字例如10。"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 1ffdfa4eb78d4127b302b6d703852692
#: 306e083489e24f819d67f38e2f155f0f
msgid "llama_cpp_n_threads"
msgstr "llama_cpp_n_threads"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: f14379e7ea16476da403d5085b67db1c
#: 0490a543f67f4ecd8588541399846951
msgid ""
"Number of threads to use. If None, the number of threads is automatically"
" determined"
msgstr "要使用的线程数量。如果为None则线程数量将自动确定。"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 41cc1035f6e340e19848452d48a161db
#: 2ad3f09e1f894e30ae512e1cd803af52
msgid "llama_cpp_n_batch"
msgstr "llama_cpp_n_batch"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 993c3b9218ee4299beae53bd75a01001
#: c495776868394df5b311087dfc7c55dd
msgid "512"
msgstr "512"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 0e11d38c9b58478cacdade34de146320
#: b5e69dc488cc4ae78ee9daefcf73c290
msgid "Maximum number of prompt tokens to batch together when calling llama_eval"
msgstr "在调用llama_eval时批处理在一起的prompt tokens的最大数量"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 24f5381956d34569aabee4a5d832388b
#: 516cfc3ed00c4a6181f37a4649c9f041
msgid "llama_cpp_n_gqa"
msgstr "llama_cpp_n_gqa"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 07d05844541c452caaa8d5bf56c3f8a1
#: 51847a305c4341af8614a2ceb7aa658f
msgid "Grouped-query attention. Must be 8 for llama-2 70b."
msgstr "对于llama-2 70b模型Grouped-query attention必须为8。"
msgstr "对于 llama-2 70B 模型Grouped-query attention 必须为8。"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 40a1b9750d854bb19dc18b7d530beccf
#: 8261108709f341dab19e4fece7682c0c
msgid "llama_cpp_rms_norm_eps"
msgstr "llama_cpp_rms_norm_eps"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 6018ee183b9548eabf91e9fc683e7c24
#: 72cc3d9988414f489ddefe3afb332e83
msgid "5e-06"
msgstr "5e-06"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: eb273c6bcf2c4c47808024008ce230dc
#: ebc1baebf57e4009b0fdfa68eb055d80
msgid "5e-6 is a good value for llama-2 models."
msgstr "对于llama-2模型来说5e-6是一个不错的值。"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: f70f3e935b764b6f9544d201ba2aaa05
#: 0cc1199e293741f087c795230d9c8dda
msgid "llama_cpp_cache_capacity"
msgstr "llama_cpp_cache_capacity"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 70035ec5be244eda9fe93be3df2c66df
#: 7d13612da75046b1a3fc0877e229bb91
msgid "Maximum cache capacity. Examples: 2000MiB, 2GiB"
msgstr "cache capacity最大值. Examples: 2000MiB, 2GiB"
msgstr "模型缓存最大值. 例如: 2000MiB, 2GiB"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 164c31b005ae4979938d9bc67e7f2759
#: 53332858d3a8472f8eb59d845c594ffd
msgid "llama_cpp_prefer_cpu"
msgstr "llama_cpp_prefer_cpu"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: 28f890f6bee3412e94aeb1326367326e
#: 7ff31fe3233a4243840584bc069654cd
msgid "False"
msgstr "False"
#: ../../getting_started/install/llm/llama/llama_cpp.md
#: f8f27b6323384431ba064a720f39f997
#: 62d1dbd4f8254141a697448a7a5f6701
msgid ""
"If a GPU is available, it will be preferred by default, unless "
"prefer_cpu=False is configured."
msgstr "如果有可用的GPU默认情况下会优先使用GPU除非配置了prefer_cpu=False。"
msgstr "如果有可用的GPU默认情况下会优先使用GPU除非配置了 prefer_cpu=False。"
#: ../../getting_started/install/llm/llama/llama_cpp.md:61
#: 0471e56c790047bab422aa47edad0a15
#: 8de97de28d1a40c3b852a1268255ebed
msgid "GPU Acceleration"
msgstr "GPU 加速"
#: ../../getting_started/install/llm/llama/llama_cpp.md:63
#: e95ad40d29004455bebeec8a1a7248c8
#: 8bce74c0ddb5486190ff4d36fd5358be
msgid ""
"GPU acceleration is supported by default. If you encounter any issues, "
"you can uninstall the dependent packages with the following command:"
msgstr "默认情况下支持GPU加速。如果遇到任何问题您可以使用以下命令卸载相关的依赖包"
#: ../../getting_started/install/llm/llama/llama_cpp.md:68
#: c0caf1420e43437589693ddec96bd50f
#: 1f3fe88521614d499cb1d046f8d3c125
msgid ""
"Then install `llama-cpp-python` according to the instructions in [llama-"
"cpp-python](https://github.com/abetlen/llama-cpp-"
@ -297,24 +298,24 @@ msgstr ""
"python/blob/main/README.md).安装`llama-cpp-python`"
#: ../../getting_started/install/llm/llama/llama_cpp.md:71
#: fe082f65b4e9416c97b18e5005bc0a59
#: fc83106f0a0e4ddfb3c058bec62f4568
msgid "Mac Usage"
msgstr "Mac Usage"
msgstr "Mac 使用"
#: ../../getting_started/install/llm/llama/llama_cpp.md:73
#: 6f30d3fa399f434189fcb03d28a42d2d
#: dcf5904a444342c8a768c4da8b777828
msgid ""
"Special attention, if you are using Apple Silicon (M1) Mac, it is highly "
"recommended to install arm64 architecture python support, for example:"
msgstr "特别注意如果您正在使用苹果芯片M1的Mac电脑强烈建议安装arm64架构的Python支持例如"
msgstr "特别注意如果您正在使用苹果芯片M1的Mac电脑强烈建议安装 arm64 架构的 Python 支持,例如:"
#: ../../getting_started/install/llm/llama/llama_cpp.md:80
#: 74602bede3c5472fbabc7de47eb2ff7a
#: 547369c011a9412589dad1fac7ac3ef9
msgid "Windows Usage"
msgstr "Windows使用"
#: ../../getting_started/install/llm/llama/llama_cpp.md:82
#: ae78332a348b44cb847723a998b98048
#: 506fda57977f4aa8b9fe427e3c66f4d7
msgid ""
"The use under the Windows platform has not been rigorously tested and "
"verified, and you are welcome to use it. If you have any problems, you "
@ -323,8 +324,8 @@ msgid ""
"information) directly."
msgstr ""
"在Windows平台上的使用尚未经过严格的测试和验证欢迎您使用。如果您有任何问题可以创建一个[issue](https://github.com"
"/eosphoros-ai/DB-GPT/issues)或者[contact us](https://github.com/eosphoros-"
"ai/DB-GPT/tree/main#contact-information) directly."
"/eosphoros-ai/DB-GPT/issues)或者直接[联系我们](https://github.com/eosphoros-ai"
"/DB-GPT/tree/main#cntact-information)。"
#~ msgid ""
#~ "DB-GPT is now supported by "
@ -337,3 +338,6 @@ msgstr ""
#~ "cpp-python) through "
#~ "[llama.cpp](https://github.com/ggerganov/llama.cpp)."
#~ msgid "Prompt Template Name"
#~ msgstr "Prompt Template Name"


@ -70,7 +70,8 @@ LLM_MODEL_CONFIG = {
"baichuan2-13b": os.path.join(MODEL_PATH, "Baichuan2-13B-Chat"),
# (Llama2 based) We only support WizardLM-13B-V1.2 for now, which is trained from Llama-2 13b, see https://huggingface.co/WizardLM/WizardLM-13B-V1.2
"wizardlm-13b": os.path.join(MODEL_PATH, "WizardLM-13B-V1.2"),
"llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.bin"),
# wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
"llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.gguf"),
# https://huggingface.co/internlm/internlm-chat-7b-v1_1, 7b vs 7b-v1.1: https://github.com/InternLM/InternLM/issues/288
"internlm-7b": os.path.join(MODEL_PATH, "internlm-chat-7b"),
"internlm-7b-8k": os.path.join(MODEL_PATH, "internlm-chat-7b-8k"),


@ -382,14 +382,14 @@ class LlamaCppAdapater(BaseLLMAdaper):
# Just support local model
return False, None
if not path.is_file():
model_paths = list(path.glob("*ggml*.bin"))
model_paths = list(path.glob("*ggml*.gguf"))
if not model_paths:
return False, None
model_path = str(model_paths[0])
logger.warn(
f"Model path {model_path} is not single file, use first *gglm*.bin model file: {model_path}"
f"Model path {model_path} is not single file, use first *gglm*.gguf model file: {model_path}"
)
if not re.fullmatch(".*ggml.*\.bin", model_path):
if not re.fullmatch(".*ggml.*\.gguf", model_path):
return False, None
return True, model_path
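To make the new matching rule concrete, a standalone illustration of the pattern used above (not part of the commit, just an example of which file names the adapter accepts):
```python
# Standalone illustration of the adapter's file-name check after the GGUF switch.
import re

pattern = r".*ggml.*\.gguf"
candidates = [
    "models/ggml-model-q4_0.gguf",         # matches: contains "ggml" and ends with .gguf
    "models/ggml-model-q4_0.bin",          # no longer matches: old GGML extension
    "models/vicuna-13b-v1.5.Q4_K_M.gguf",  # does not match: file name lacks "ggml"
]
for name in candidates:
    print(name, bool(re.fullmatch(pattern, name)))
```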


@ -33,7 +33,7 @@ class WorkerManager(ABC):
"""Start worker manager"""
@abstractmethod
async def stop(self):
async def stop(self, ignore_exception: bool = False):
"""Stop worker manager"""
@abstractmethod


@ -115,14 +115,30 @@ class LocalWorkerManager(WorkerManager):
for listener in self.start_listeners:
listener(self)
async def stop(self):
async def stop(self, ignore_exception: bool = False):
if not self.run_data.stop_event.is_set():
logger.info("Stop all workers")
self.run_data.stop_event.clear()
stop_tasks = []
stop_tasks.append(self._stop_all_worker(apply_req=None))
stop_tasks.append(
self._stop_all_worker(apply_req=None, ignore_exception=ignore_exception)
)
if self.deregister_func:
stop_tasks.append(self.deregister_func(self.run_data))
# If ignore_exception is True, use exception handling to ignore any exceptions raised from self.deregister_func
if ignore_exception:
async def safe_deregister_func(run_data):
try:
await self.deregister_func(run_data)
except Exception as e:
logger.warning(
f"Stop worker, ignored exception from deregister_func: {e}"
)
stop_tasks.append(safe_deregister_func(self.run_data))
else:
stop_tasks.append(self.deregister_func(self.run_data))
await asyncio.gather(*stop_tasks)
def after_start(self, listener: Callable[["WorkerManager"], None]):
@ -424,7 +440,7 @@ class LocalWorkerManager(WorkerManager):
)
async def _stop_all_worker(
self, apply_req: WorkerApplyRequest
self, apply_req: WorkerApplyRequest, ignore_exception: bool = False
) -> WorkerApplyOutput:
start_time = time.time()
@ -441,7 +457,19 @@ class LocalWorkerManager(WorkerManager):
and self.register_func
and self.deregister_func
):
await self.deregister_func(worker_run_data)
_deregister_func = self.deregister_func
if ignore_exception:
async def safe_deregister_func(run_data):
try:
await self.deregister_func(run_data)
except Exception as e:
logger.warning(
f"Stop worker, ignored exception from deregister_func: {e}"
)
_deregister_func = safe_deregister_func
await _deregister_func(worker_run_data)
await self._apply_worker(apply_req, _stop_worker)
timecost = time.time() - start_time
@ -487,8 +515,8 @@ class WorkerManagerAdapter(WorkerManager):
async def start(self):
return await self.worker_manager.start()
async def stop(self):
return await self.worker_manager.stop()
async def stop(self, ignore_exception: bool = False):
return await self.worker_manager.stop(ignore_exception=ignore_exception)
def after_start(self, listener: Callable[["WorkerManager"], None]):
if listener is not None:
@ -631,7 +659,9 @@ async def api_model_shutdown(request: WorkerStartupRequest):
return await worker_manager.model_shutdown(request)
def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
def _setup_fastapi(
worker_params: ModelWorkerParameters, app=None, ignore_exception: bool = False
):
if not app:
app = FastAPI()
if worker_params.standalone:
@ -666,7 +696,7 @@ def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
@app.on_event("shutdown")
async def shutdown_event():
await worker_manager.stop()
await worker_manager.stop(ignore_exception=ignore_exception)
return app
@ -837,7 +867,7 @@ def initialize_worker_manager_in_client(
worker_params.register = True
worker_params.port = local_port
logger.info(f"Worker params: {worker_params}")
_setup_fastapi(worker_params, app)
_setup_fastapi(worker_params, app, ignore_exception=True)
_start_local_worker(worker_manager, worker_params)
worker_manager.after_start(start_listener)
_start_local_embedding_worker(
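The `ignore_exception` handling above boils down to wrapping the deregister coroutine so that shutdown keeps going even if deregistration fails. A distilled, standalone sketch of that pattern follows; the names and objects are illustrative stand-ins, not the repo's API:
```python
# Distilled version of the ignore_exception pattern used in LocalWorkerManager.stop.
# The deregister function and run data are stand-ins, not DB-GPT's real objects.
import asyncio
import logging

logger = logging.getLogger(__name__)

async def deregister(run_data: dict) -> None:
    raise RuntimeError("controller unreachable")  # simulate a failing deregistration

async def stop(ignore_exception: bool = False) -> None:
    run_data = {"worker": "llama-cpp"}
    if ignore_exception:
        async def safe_deregister(data):
            try:
                await deregister(data)
            except Exception as e:
                logger.warning("Stop worker, ignored exception from deregister: %s", e)
        task = safe_deregister(run_data)
    else:
        task = deregister(run_data)
    await asyncio.gather(task)  # with ignore_exception=True this no longer raises

asyncio.run(stop(ignore_exception=True))
```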


@ -17,7 +17,7 @@ class RemoteWorkerManager(LocalWorkerManager):
for listener in self.start_listeners:
listener(self)
async def stop(self):
async def stop(self, ignore_exception: bool = False):
pass
async def _fetch_from_worker(


@ -44,11 +44,6 @@ static_file_path = os.path.join(os.getcwd(), "server/static")
CFG = Config()
def signal_handler():
print("in order to avoid chroma db atexit problem")
os._exit(0)
def swagger_monkey_patch(*args, **kwargs):
return get_swagger_ui_html(
*args,
@ -176,7 +171,6 @@ def run_uvicorn(param: WebWerverParameters):
port=param.port,
log_level=logging_str_to_uvicorn_level(param.log_level),
)
signal.signal(signal.SIGINT, signal_handler())
def run_webserver(param: WebWerverParameters = None):


@ -14,7 +14,10 @@ from setuptools import find_packages
with open("README.md", mode="r", encoding="utf-8") as fh:
long_description = fh.read()
BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "false").lower() == "true"
BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "true").lower() == "true"
LLAMA_CPP_GPU_ACCELERATION = (
os.getenv("LLAMA_CPP_GPU_ACCELERATION", "true").lower() == "true"
)
def parse_requirements(file_name: str) -> List[str]:
@ -249,21 +252,29 @@ def llama_cpp_python_cuda_requires():
if not cuda_version:
print("CUDA not support, use cpu version")
return
if not LLAMA_CPP_GPU_ACCELERATION:
print("Disable GPU acceleration")
return
# Supports GPU acceleration
device = "cu" + cuda_version.replace(".", "")
os_type, cpu_avx = get_cpu_avx_support()
print(f"OS: {os_type}, cpu avx: {cpu_avx}")
supported_os = [OSType.WINDOWS, OSType.LINUX]
if os_type not in supported_os:
print(
f"llama_cpp_python_cuda just support in os: {[r._value_ for r in supported_os]}"
)
return
if cpu_avx == AVXType.AVX2 or AVXType.AVX512:
cpu_avx = AVXType.AVX
cpu_avx = cpu_avx._value_
cpu_device = ""
if cpu_avx == AVXType.AVX2 or cpu_avx == AVXType.AVX512:
cpu_device = "avx"
else:
cpu_device = "basic"
device += cpu_device
base_url = "https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui"
llama_cpp_version = "0.1.77"
llama_cpp_version = "0.2.10"
py_version = "cp310"
os_pkg_name = "linux_x86_64" if os_type == OSType.LINUX else "win_amd64"
os_pkg_name = "manylinux_2_31_x86_64" if os_type == OSType.LINUX else "win_amd64"
extra_index_url = f"{base_url}/llama_cpp_python_cuda-{llama_cpp_version}+{device}-{py_version}-{py_version}-{os_pkg_name}.whl"
extra_index_url, _ = encode_url(extra_index_url)
print(f"Install llama_cpp_python_cuda from {extra_index_url}")
@ -298,7 +309,7 @@ def core_requires():
"langchain>=0.0.286",
"SQLAlchemy",
"pymysql",
"duckdb",
"duckdb==0.8.1",
"duckdb-engine",
"jsonschema",
# TODO move transformers to default
@ -312,7 +323,6 @@ def knowledge_requires():
"""
setup_spec.extras["knowledge"] = [
"spacy==3.5.3",
# "chromadb==0.3.22",
"chromadb==0.4.10",
"markdown",
"bs4",