feat(model): llama.cpp support new GGUF file format (#649)
Close #567, Close #644, Close #563

**Other**
- Fix an exception raised when stopping DB-GPT
Commit: f2427b10f5
@@ -44,7 +44,7 @@ QUANTIZE_8bit=True
 ## llama-2-70b must be 8
 # llama_cpp_n_gqa=8
 ## Model path
-# llama_cpp_model_path=/data/models/TheBloke/vicuna-7B-v1.5-GGML/vicuna-7b-v1.5.ggmlv3.q4_0.bin
+# llama_cpp_model_path=/data/models/TheBloke/vicuna-13B-v1.5-GGUF/vicuna-13b-v1.5.Q4_K_M.gguf

 #*******************************************************************#
 #** EMBEDDING SETTINGS **#
@@ -8,19 +8,19 @@ DB-GPT already supports [llama.cpp](https://github.com/ggerganov/llama.cpp) via
 
 ### Preparing Model Files
 
-To use llama.cpp, you need to prepare a ggml format model file, and there are two common ways to obtain it, you can choose either:
+To use llama.cpp, you need to prepare a gguf format model file, and there are two common ways to obtain it, you can choose either:
 
 1. Download a pre-converted model file.
 
-Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5), you can download the file already converted from [TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.bin`.
+Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5), you can download the file already converted from [TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF), only one file is needed. Download it to the `models` directory and rename it to `ggml-model-q4_0.gguf`.
 
 ```bash
-wget https://huggingface.co/TheBloke/vicuna-7B-v1.5-GGML/resolve/main/vicuna-7b-v1.5.ggmlv3.q4_K_M.bin -O models/ggml-model-q4_0.bin
+wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
 ```
 
 2. Convert It Yourself
 
-You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.bin`.
+You can convert the model file yourself according to the instructions in [llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp#prepare-data--run), and put the converted file in the models directory and rename it to `ggml-model-q4_0.gguf`.
 
 ### Installing Dependencies
 
@@ -46,9 +46,9 @@ Then you can run it according to [Run](https://db-gpt.readthedocs.io/en/latest/g
 
 In DB-GPT, the model configuration can be done through `{model name}_{config key}`.
 
-| Environment Variable Key | default | Prompt Template Name|
+| Environment Variable Key | default | Description |
 |----------|-----------| ----------- |
-| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, If None, the prompt template is automatically determined from model path。 |
+| llama_cpp_prompt_template | None | Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-chat,internlm-chat`, If None, the prompt template is automatically determined from model path。 |
 | llama_cpp_model_path | None | Model path |
 | llama_cpp_n_gpu_layers | 1000000000 |Number of layers to offload to the GPU, Set this to 1000000000 to offload all layers to the GPU. If your GPU VRAM is not enough, you can set a low number, eg: `10` |
 | llama_cpp_n_threads | None | Number of threads to use. If None, the number of threads is automatically determined |
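To illustrate the `{model name}_{config key}` convention documented above, here is a minimal sketch of how such settings could be collected from environment variables. The helper name and the case-insensitive lookup are assumptions for illustration, not DB-GPT's actual loader.

```python
import os
from typing import Any, Dict


def read_model_config(model_name: str, defaults: Dict[str, Any]) -> Dict[str, Any]:
    """Collect `{model name}_{config key}` style settings from the environment.

    `defaults` maps a config key (e.g. "n_gpu_layers") to its default value;
    an environment variable such as `llama_cpp_n_gpu_layers` overrides it.
    """
    config = {}
    for key, default in defaults.items():
        env_name = f"{model_name}_{key}"
        # Environment variables may be set in either case; check both.
        raw = os.getenv(env_name, os.getenv(env_name.upper()))
        config[key] = raw if raw is not None else default
    return config


# Example: defaults taken from the table above.
llama_cpp_config = read_model_config(
    "llama_cpp",
    {
        "prompt_template": None,
        "model_path": None,
        "n_gpu_layers": 1000000000,
        "n_threads": None,
    },
)
print(llama_cpp_config)
```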
@ -8,7 +8,7 @@ msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: DB-GPT 👏👏 0.3.5\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2023-08-21 16:59+0800\n"
|
||||
"POT-Creation-Date: 2023-10-07 20:28+0800\n"
|
||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||
"Language: zh_CN\n"
|
||||
@ -20,274 +20,275 @@ msgstr ""
|
||||
"Generated-By: Babel 2.12.1\n"
|
||||
|
||||
#: ../../getting_started/install/llm/llama/llama_cpp.md:1
|
||||
#: 24d5c21cd8b44f1d8585ba5c83e34acc
|
||||
#: 95a9a605d97346fb98e0c0977524d354
|
||||
msgid "llama.cpp"
|
||||
msgstr "llama.cpp"
|
||||
|
||||
#: ../../getting_started/install/llm/llama/llama_cpp.md:5
|
||||
#: 56969ff863d949aa8df55d3bdb6957e7
|
||||
#: ebe3be273a42492d9832512554b4b7dc
|
||||
msgid ""
|
||||
"DB-GPT already supports "
|
||||
"[llama.cpp](https://github.com/ggerganov/llama.cpp) via [llama-cpp-"
|
||||
"python](https://github.com/abetlen/llama-cpp-python)."
|
||||
msgstr ""
|
||||
"DB-GPT已经通过[llama-cpp-python](https://github.com/abetlen/llama-cpp-"
|
||||
"python)支持[llama.cpp](https://github.com/ggerganov/llama.cpp)。"
|
||||
|
||||
#: ../../getting_started/install/llm/llama/llama_cpp.md:7
|
||||
#: afe223eafcc641779e1580cac574c34a
|
||||
#: 97a4f6f95d6845258e3753803fc117a3
|
||||
msgid "Running llama.cpp"
|
||||
msgstr "运行 llama.cpp"
|
||||
|
||||
#: ../../getting_started/install/llm/llama/llama_cpp.md:9
|
||||
#: 0eaf98a036434eecb2af1fa89f045620
|
||||
#: 40fcdf93fe3d4542bbd84ed2d5a82623
|
||||
msgid "Preparing Model Files"
|
||||
msgstr "准备模型文件"
|
||||
|
||||
#: ../../getting_started/install/llm/llama/llama_cpp.md:11
|
||||
#: 4f45be5d9658451fb95f1d5d31dc8778
|
||||
#: f10bd034d24640d3b83572d50b2a9f71
|
||||
msgid ""
|
||||
"To use llama.cpp, you need to prepare a ggml format model file, and there"
|
||||
"To use llama.cpp, you need to prepare a gguf format model file, and there"
|
||||
" are two common ways to obtain it, you can choose either:"
|
||||
msgstr "使用llama.cpp, 你需要准备ggml格式的文件,你可以通过以下两种方法获取"
|
||||
msgstr "使用 llama.cpp,你需要准备 gguf 格式的文件,你可以通过以下两种方法获取"
|
||||
|
||||
#: ../../getting_started/install/llm/llama/llama_cpp.md:13
|
||||
#: 9934596e0f6e466aae63cefbb019e0ec
|
||||
#: fb143586b13849f0bb2b6ae0c9408e95
|
||||
msgid "Download a pre-converted model file."
|
||||
msgstr "Download a pre-converted model file."
|
||||
msgstr "下载已转换的模型文件"
|
||||
|
||||
#: ../../getting_started/install/llm/llama/llama_cpp.md:15
|
||||
#: 33fef76961064a5ca4c86c57111c8bd3
|
||||
#: a6e89c960ebd4778b8fc72d3d43e9543
|
||||
msgid ""
|
||||
"Suppose you want to use [Vicuna 7B v1.5](https://huggingface.co/lmsys"
|
||||
"/vicuna-7b-v1.5), you can download the file already converted from "
|
||||
"[TheBloke/vicuna-7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-"
|
||||
"7B-v1.5-GGML), only one file is needed. Download it to the `models` "
|
||||
"directory and rename it to `ggml-model-q4_0.bin`."
|
||||
"Suppose you want to use [Vicuna 13B v1.5](https://huggingface.co/lmsys"
|
||||
"/vicuna-13b-v1.5), you can download the file already converted from "
|
||||
"[TheBloke/vicuna-13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
|
||||
"13B-v1.5-GGUF), only one file is needed. Download it to the `models` "
|
||||
"directory and rename it to `ggml-model-q4_0.gguf`."
|
||||
msgstr ""
|
||||
"假设您想使用[Vicuna 7B v1.5](https://huggingface.co/lmsys/vicuna-"
|
||||
"7b-v1.5)您可以从[TheBloke/vicuna-"
|
||||
"7B-v1.5-GGML](https://huggingface.co/TheBloke/vicuna-"
|
||||
"7B-v1.5-GGML)下载已转换的文件,只需要一个文件。将其下载到models目录并将其重命名为ggml-model-q4_0.bin。"
|
||||
"假设您想使用[Vicuna 13B v1.5](https://huggingface.co/lmsys/vicuna-"
|
||||
"13b-v1.5)您可以从[TheBloke/vicuna-"
|
||||
"13B-v1.5-GGUF](https://huggingface.co/TheBloke/vicuna-"
|
||||
"13B-v1.5-GGUF)下载已转换的文件,只需要一个文件。将其下载到models目录并将其重命名为 `ggml-"
|
||||
"model-q4_0.gguf`。"
|
||||
|
||||
 #: ../../getting_started/install/llm/llama/llama_cpp.md:21
-#: 65fed5b7e95b4205b2b94596a21b6fe8
+#: 380ebad2c5a04210a48c5d7a9913413d
 msgid "Convert It Yourself"
-msgstr "Convert It Yourself"
+msgstr "自行转换"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:23
-#: 1421761d320046f79f725e64bd7d854c
+#: cf39ca73d9c6456794fb240b164b7cbb
 msgid ""
 "You can convert the model file yourself according to the instructions in "
 "[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
 "#prepare-data--run), and put the converted file in the models directory "
-"and rename it to `ggml-model-q4_0.bin`."
+"and rename it to `ggml-model-q4_0.gguf`."
 msgstr ""
 "您可以根据[llama.cpp#prepare-data--run](https://github.com/ggerganov/llama.cpp"
-"#prepare-data--run)中的说明自己转换模型文件,然后将转换后的文件放入models目录中,并将其重命名为ggml-"
-"model-q4_0.bin。"
+"#prepare-data--run)中的说明自行转换模型文件,并把转换后的文件放在models目录中,并重命名为`ggml-"
+"model-q4_0.gguf`。"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:25
-#: 850b1f8ef6be49b192e01c1b7d8f1f26
+#: 363cbf1c0b4e4029982519238f776958
 msgid "Installing Dependencies"
 msgstr "安装依赖"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:27
-#: b323ee4799d745cc9c0a449bd37c371a
+#: a98c36e3d7df40f3a816c0ee451b6114
 msgid ""
 "llama.cpp is an optional dependency in DB-GPT, and you can manually "
 "install it using the following command:"
-msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过一下命令进行安装"
+msgstr "llama.cpp在DB-GPT中是可选安装项, 你可以通过以下命令进行安装"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:33
-#: 75b75c84ffb7476d8501a28bb2719615
+#: b0038a8ba36647c6a62eef907cb6d304
 msgid "Modifying the Configuration File"
 msgstr "修改配置文件"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:35
-#: d1f8b3e1ad3441f2aafbfe2519113c2c
+#: d2002da716744122a44ab4ed2e47e680
 msgid "Next, you can directly modify your `.env` file to enable llama.cpp."
 msgstr "修改`.env`文件使用llama.cpp"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:42
-#: 2ddcab3834f646e58a8b3316abf6ce3a
+#: 97a5fb5d4ed649f5aa0bbb97c32d54b0
 msgid ""
 "Then you can run it according to [Run](https://db-"
 "gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run)."
 msgstr ""
-"然后你可以通过[Run](https://db-"
-"gpt.readthedocs.io/en/latest/getting_started/install/deploy/deploy.html#run).来运行"
+"然后你可以根据[运行]"
+"(https://db-gpt.readthedocs.io/projects/db-gpt-docs-zh-cn/zh_CN/latest/getting_started/install/deploy/deploy.html#run)来运行"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:45
-#: bb9f222d22534827a9fa164b2126d192
+#: 0e3771b6aaa141f89c813507f3317bda
 msgid "More Configurations"
 msgstr "更多配置文件"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:47
-#: 14d016ad5bad451888d01e24f0ca86d9
+#: 0802ba524cd1458298fe6f90ae7f2da1
 msgid ""
 "In DB-GPT, the model configuration can be done through `{model "
 "name}_{config key}`."
-msgstr ""
-"In DB-GPT, the model configuration can be done through `{model "
-"name}_{config key}`."
+msgstr "在DB-GPT中,模型配置可以通过`{模型名称}_{配置名}` 来配置。"
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: a1bf4c1f49bd4d97ac45d4f3aff442c6
+#: d461d379a523424fb5885e393498ee14
 msgid "Environment Variable Key"
-msgstr "Environment Variable Key"
+msgstr "环境变量键"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 92692a38219c432fadffb8b3825ce678
+#: 0263477d0ddb4914baa0d3584b751086
 msgid "default"
-msgstr "default"
+msgstr "默认值"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 72b2d251aa2e4ca09c335b58e1a08de3
-msgid "Prompt Template Name"
-msgstr "Prompt Template Name"
+#: e5188d0ded6540a0bddb46d480f8b7ac
+msgid "Description"
+msgstr "描述"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 85a9f89eeb9a4b70b56913354e947329
+#: 213b27d0e53d4858b7576dc4f2ab4d7f
 msgid "llama_cpp_prompt_template"
 msgstr "llama_cpp_prompt_template"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 17e9750fbb824dfdaaed5415f6406e35 602016763bb2470d8a8ef700e576407b
-#: 790caafd5c4c4cecbb4c190745fb994c ceb6c41315ab4c5798ab3c64ee8693eb
-#: cfafab69a2684e27bd55aadfdd4c1575
+#: 1cb0320826564a89a3e2f51177f8a6ed 23d93dc7d88e431ba31ff64d239a412f
+#: 833d5012411a4ad58b04d50a40a29184 95aa2102191946919158ae668b2e3599
+#: becdd178292a48138dcb445ba3c2a6ec
 msgid "None"
 msgstr "None"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 5d02f2d1d5834b1e9e5d6982247fd6c9
+#: ac835806c79640aa8cd39edb11d7667c
 msgid ""
-"Prompt template name, now support: `zero_shot, vicuna_v1.1, llama-2"
-",baichuan-chat`, If None, the prompt template is automatically determined"
-" from model path。"
+"Prompt template name, now support: `zero_shot, vicuna_v1.1,alpaca,llama-2"
+",baichuan-chat,internlm-chat`, If None, the prompt template is "
+"automatically determined from model path。"
 msgstr ""
-"Prompt template 现在可以支持`zero_shot, vicuna_v1.1, llama-2,baichuan-chat`, "
-"如果是None, the prompt template可以自动选择模型路径"
+"Prompt template 现在可以支持`zero_shot, vicuna_v1.1,alpaca,llama-2,baichuan-"
+"chat,internlm-chat`, 如果是None, 可以根据模型路径来自动获取模型 Prompt template"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 2a95bc11386f45498b3585b194f24c17
+#: 41bce5a6bbf2417f8bc40e71c59405ad
 msgid "llama_cpp_model_path"
 msgstr "llama_cpp_model_path"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: c02db8a50e7a4df0acb6b75798a3ad4b
+#: 15df4d19645b40e7a209827f9a325b8f
 msgid "Model path"
-msgstr "Model path"
+msgstr "模型路径"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 6c92b2ec52634728bcc421670cdda70b
+#: caf9ddbfb787418d8b167746e3febe8c
 msgid "llama_cpp_n_gpu_layers"
 msgstr "llama_cpp_n_gpu_layers"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 9f1e1b763a0b40d28efd734fe20e1ba7
+#: e12e0ed2c01e4d12b41d5da533073c53
 msgid "1000000000"
 msgstr "1000000000"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 0f511b7907594c1f9c9818638764f209
+#: 1f4a868d3fed4ac78bfa48e13b3a59dc
 msgid ""
 "Number of layers to offload to the GPU, Set this to 1000000000 to offload"
 " all layers to the GPU. If your GPU VRAM is not enough, you can set a low"
 " number, eg: `10`"
-msgstr "要将层数转移到GPU上,将其设置为1000000000以将所有层转移到GPU上。如果您的GPU VRAM不足,可以设置较低的数字,例如:10。"
+msgstr "要将多少网络层转移到GPU上,将其设置为1000000000以将所有层转移到GPU上。如果您的 GPU 内存不足,可以设置较低的数字,例如:10。"
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 1ffdfa4eb78d4127b302b6d703852692
+#: 306e083489e24f819d67f38e2f155f0f
 msgid "llama_cpp_n_threads"
 msgstr "llama_cpp_n_threads"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: f14379e7ea16476da403d5085b67db1c
+#: 0490a543f67f4ecd8588541399846951
 msgid ""
 "Number of threads to use. If None, the number of threads is automatically"
 " determined"
 msgstr "要使用的线程数量。如果为None,则线程数量将自动确定。"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 41cc1035f6e340e19848452d48a161db
+#: 2ad3f09e1f894e30ae512e1cd803af52
 msgid "llama_cpp_n_batch"
 msgstr "llama_cpp_n_batch"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 993c3b9218ee4299beae53bd75a01001
+#: c495776868394df5b311087dfc7c55dd
 msgid "512"
 msgstr "512"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 0e11d38c9b58478cacdade34de146320
+#: b5e69dc488cc4ae78ee9daefcf73c290
 msgid "Maximum number of prompt tokens to batch together when calling llama_eval"
 msgstr "在调用llama_eval时,批处理在一起的prompt tokens的最大数量"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 24f5381956d34569aabee4a5d832388b
+#: 516cfc3ed00c4a6181f37a4649c9f041
 msgid "llama_cpp_n_gqa"
 msgstr "llama_cpp_n_gqa"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 07d05844541c452caaa8d5bf56c3f8a1
+#: 51847a305c4341af8614a2ceb7aa658f
 msgid "Grouped-query attention. Must be 8 for llama-2 70b."
-msgstr "对于llama-2 70b模型,Grouped-query attention必须为8。"
+msgstr "对于 llama-2 70B 模型,Grouped-query attention 必须为8。"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 40a1b9750d854bb19dc18b7d530beccf
+#: 8261108709f341dab19e4fece7682c0c
 msgid "llama_cpp_rms_norm_eps"
 msgstr "llama_cpp_rms_norm_eps"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 6018ee183b9548eabf91e9fc683e7c24
+#: 72cc3d9988414f489ddefe3afb332e83
 msgid "5e-06"
 msgstr "5e-06"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: eb273c6bcf2c4c47808024008ce230dc
+#: ebc1baebf57e4009b0fdfa68eb055d80
 msgid "5e-6 is a good value for llama-2 models."
 msgstr "对于llama-2模型来说,5e-6是一个不错的值。"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: f70f3e935b764b6f9544d201ba2aaa05
+#: 0cc1199e293741f087c795230d9c8dda
 msgid "llama_cpp_cache_capacity"
 msgstr "llama_cpp_cache_capacity"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 70035ec5be244eda9fe93be3df2c66df
+#: 7d13612da75046b1a3fc0877e229bb91
 msgid "Maximum cache capacity. Examples: 2000MiB, 2GiB"
-msgstr "cache capacity最大值. Examples: 2000MiB, 2GiB"
+msgstr "模型缓存最大值. 例如: 2000MiB, 2GiB"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 164c31b005ae4979938d9bc67e7f2759
+#: 53332858d3a8472f8eb59d845c594ffd
 msgid "llama_cpp_prefer_cpu"
 msgstr "llama_cpp_prefer_cpu"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: 28f890f6bee3412e94aeb1326367326e
+#: 7ff31fe3233a4243840584bc069654cd
 msgid "False"
 msgstr "False"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md
-#: f8f27b6323384431ba064a720f39f997
+#: 62d1dbd4f8254141a697448a7a5f6701
 msgid ""
 "If a GPU is available, it will be preferred by default, unless "
 "prefer_cpu=False is configured."
-msgstr "如果有可用的GPU,默认情况下会优先使用GPU,除非配置了prefer_cpu=False。"
+msgstr "如果有可用的GPU,默认情况下会优先使用GPU,除非配置了 prefer_cpu=False。"
 #: ../../getting_started/install/llm/llama/llama_cpp.md:61
-#: 0471e56c790047bab422aa47edad0a15
+#: 8de97de28d1a40c3b852a1268255ebed
 msgid "GPU Acceleration"
 msgstr "GPU 加速"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:63
-#: e95ad40d29004455bebeec8a1a7248c8
+#: 8bce74c0ddb5486190ff4d36fd5358be
 msgid ""
 "GPU acceleration is supported by default. If you encounter any issues, "
 "you can uninstall the dependent packages with the following command:"
 msgstr "默认情况下支持GPU加速。如果遇到任何问题,您可以使用以下命令卸载相关的依赖包"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:68
-#: c0caf1420e43437589693ddec96bd50f
+#: 1f3fe88521614d499cb1d046f8d3c125
 msgid ""
 "Then install `llama-cpp-python` according to the instructions in [llama-"
 "cpp-python](https://github.com/abetlen/llama-cpp-"
@@ -297,24 +298,24 @@ msgstr ""
 "python/blob/main/README.md).安装`llama-cpp-python`"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:71
-#: fe082f65b4e9416c97b18e5005bc0a59
+#: fc83106f0a0e4ddfb3c058bec62f4568
 msgid "Mac Usage"
-msgstr "Mac Usage"
+msgstr "Mac 使用"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:73
-#: 6f30d3fa399f434189fcb03d28a42d2d
+#: dcf5904a444342c8a768c4da8b777828
 msgid ""
 "Special attention, if you are using Apple Silicon (M1) Mac, it is highly "
 "recommended to install arm64 architecture python support, for example:"
-msgstr "特别注意:如果您正在使用苹果芯片(M1)的Mac电脑,强烈建议安装arm64架构的Python支持,例如:"
+msgstr "特别注意:如果您正在使用苹果芯片(M1)的Mac电脑,强烈建议安装 arm64 架构的 Python 支持,例如:"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:80
-#: 74602bede3c5472fbabc7de47eb2ff7a
+#: 547369c011a9412589dad1fac7ac3ef9
 msgid "Windows Usage"
 msgstr "Windows使用"
 
 #: ../../getting_started/install/llm/llama/llama_cpp.md:82
-#: ae78332a348b44cb847723a998b98048
+#: 506fda57977f4aa8b9fe427e3c66f4d7
 msgid ""
 "The use under the Windows platform has not been rigorously tested and "
 "verified, and you are welcome to use it. If you have any problems, you "
@@ -323,8 +324,8 @@ msgid ""
 "information) directly."
 msgstr ""
 "在Windows平台上的使用尚未经过严格的测试和验证,欢迎您使用。如果您有任何问题,可以创建一个[issue](https://github.com"
-"/eosphoros-ai/DB-GPT/issues)或者[contact us](https://github.com/eosphoros-"
-"ai/DB-GPT/tree/main#contact-information) directly."
+"/eosphoros-ai/DB-GPT/issues)或者直接[联系我们](https://github.com/eosphoros-ai"
+"/DB-GPT/tree/main#cntact-information)。"
 
 #~ msgid ""
 #~ "DB-GPT is now supported by "
@@ -337,3 +338,6 @@ msgstr ""
 #~ "cpp-python) through "
 #~ "[llama.cpp](https://github.com/ggerganov/llama.cpp)."
 
+#~ msgid "Prompt Template Name"
+#~ msgstr "Prompt Template Name"
@@ -70,7 +70,8 @@ LLM_MODEL_CONFIG = {
     "baichuan2-13b": os.path.join(MODEL_PATH, "Baichuan2-13B-Chat"),
     # (Llama2 based) We only support WizardLM-13B-V1.2 for now, which is trained from Llama-2 13b, see https://huggingface.co/WizardLM/WizardLM-13B-V1.2
     "wizardlm-13b": os.path.join(MODEL_PATH, "WizardLM-13B-V1.2"),
-    "llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.bin"),
+    # wget https://huggingface.co/TheBloke/vicuna-13B-v1.5-GGUF/resolve/main/vicuna-13b-v1.5.Q4_K_M.gguf -O models/ggml-model-q4_0.gguf
+    "llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.gguf"),
     # https://huggingface.co/internlm/internlm-chat-7b-v1_1, 7b vs 7b-v1.1: https://github.com/InternLM/InternLM/issues/288
     "internlm-7b": os.path.join(MODEL_PATH, "internlm-chat-7b"),
    "internlm-7b-8k": os.path.join(MODEL_PATH, "internlm-chat-7b-8k"),
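For illustration, a small sketch of how a name-to-path table like `LLM_MODEL_CONFIG` might be consulted; the `MODEL_PATH` default and the lookup helper are assumptions, not the project's code.

```python
import os

# Assumed base directory; in DB-GPT this comes from its configuration module.
MODEL_PATH = os.getenv("MODEL_PATH", "./models")

LLM_MODEL_CONFIG = {
    # Matches the default GGUF file name documented above.
    "llama-cpp": os.path.join(MODEL_PATH, "ggml-model-q4_0.gguf"),
}


def resolve_model_path(name: str) -> str:
    """Return the configured path for a model name, or raise a clear error."""
    try:
        return LLM_MODEL_CONFIG[name]
    except KeyError as exc:
        raise ValueError(f"Unknown model name: {name}") from exc


print(resolve_model_path("llama-cpp"))
```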
@@ -382,14 +382,14 @@ class LlamaCppAdapater(BaseLLMAdaper):
             # Just support local model
             return False, None
         if not path.is_file():
-            model_paths = list(path.glob("*ggml*.bin"))
+            model_paths = list(path.glob("*ggml*.gguf"))
             if not model_paths:
                 return False
             model_path = str(model_paths[0])
             logger.warn(
-                f"Model path {model_path} is not single file, use first *gglm*.bin model file: {model_path}"
+                f"Model path {model_path} is not single file, use first *gglm*.gguf model file: {model_path}"
             )
-        if not re.fullmatch(".*ggml.*\.bin", model_path):
+        if not re.fullmatch(".*ggml.*\.gguf", model_path):
             return False, None
         return True, model_path
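As a standalone illustration of the GGUF resolution flow in the hunk above (glob a directory for `*ggml*.gguf`, then validate the candidate with a regex), here is a minimal sketch. The function name, return convention, and logging setup are assumptions for illustration, not the adapter itself.

```python
import logging
import re
from pathlib import Path
from typing import Optional, Tuple

logger = logging.getLogger(__name__)


def resolve_gguf_model(model_path: str) -> Tuple[bool, Optional[str]]:
    """Return (matched, resolved_path) for a local GGUF model file or directory."""
    path = Path(model_path)
    if path.is_file():
        candidate = str(path)
    else:
        # Directory (or missing path): pick the first *ggml*.gguf match, if any.
        model_paths = list(path.glob("*ggml*.gguf"))
        if not model_paths:
            return False, None
        candidate = str(model_paths[0])
        logger.warning(
            f"Model path {model_path} is not a single file, using first match: {candidate}"
        )
    # Reject anything that does not look like a *ggml*.gguf file.
    if not re.fullmatch(r".*ggml.*\.gguf", candidate):
        return False, None
    return True, candidate


# Example usage (assumes the documented default file name).
print(resolve_gguf_model("models/ggml-model-q4_0.gguf"))
```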
@@ -33,7 +33,7 @@ class WorkerManager(ABC):
         """Start worker manager"""
 
     @abstractmethod
-    async def stop(self):
+    async def stop(self, ignore_exception: bool = False):
         """Stop worker manager"""
 
     @abstractmethod
@@ -115,14 +115,30 @@ class LocalWorkerManager(WorkerManager):
         for listener in self.start_listeners:
             listener(self)
 
-    async def stop(self):
+    async def stop(self, ignore_exception: bool = False):
         if not self.run_data.stop_event.is_set():
             logger.info("Stop all workers")
             self.run_data.stop_event.clear()
             stop_tasks = []
-            stop_tasks.append(self._stop_all_worker(apply_req=None))
+            stop_tasks.append(
+                self._stop_all_worker(apply_req=None, ignore_exception=ignore_exception)
+            )
             if self.deregister_func:
-                stop_tasks.append(self.deregister_func(self.run_data))
+                # If ignore_exception is True, use exception handling to ignore any exceptions raised from self.deregister_func
+                if ignore_exception:
+
+                    async def safe_deregister_func(run_data):
+                        try:
+                            await self.deregister_func(run_data)
+                        except Exception as e:
+                            logger.warning(
+                                f"Stop worker, ignored exception from deregister_func: {e}"
+                            )
+
+                    stop_tasks.append(safe_deregister_func(self.run_data))
+                else:
+                    stop_tasks.append(self.deregister_func(self.run_data))
 
             await asyncio.gather(*stop_tasks)
 
     def after_start(self, listener: Callable[["WorkerManager"], None]):
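The shutdown change above boils down to "optionally swallow deregistration errors while gathering the stop tasks". A self-contained sketch of that pattern is below; the coroutine names are made up for illustration and only the wrapping logic mirrors the code above.

```python
import asyncio
import logging

logger = logging.getLogger(__name__)


async def flaky_deregister(run_data: str) -> None:
    """Stand-in for a deregister call that may fail once the controller is gone."""
    raise ConnectionError(f"controller unreachable while deregistering {run_data}")


async def stop(ignore_exception: bool = False) -> None:
    """Gather shutdown tasks; optionally ignore deregistration failures."""

    async def safe_deregister(run_data: str) -> None:
        try:
            await flaky_deregister(run_data)
        except Exception as e:
            logger.warning(f"Stop worker, ignored exception from deregister_func: {e}")

    deregister = safe_deregister if ignore_exception else flaky_deregister
    await asyncio.gather(deregister("worker-1"))


# stop(ignore_exception=True) completes with a warning;
# plain stop() would propagate the ConnectionError.
asyncio.run(stop(ignore_exception=True))
```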
@@ -424,7 +440,7 @@ class LocalWorkerManager(WorkerManager):
         )
 
     async def _stop_all_worker(
-        self, apply_req: WorkerApplyRequest
+        self, apply_req: WorkerApplyRequest, ignore_exception: bool = False
     ) -> WorkerApplyOutput:
         start_time = time.time()
 
@@ -441,7 +457,19 @@ class LocalWorkerManager(WorkerManager):
                 and self.register_func
                 and self.deregister_func
             ):
-                await self.deregister_func(worker_run_data)
+                _deregister_func = self.deregister_func
+                if ignore_exception:
+
+                    async def safe_deregister_func(run_data):
+                        try:
+                            await self.deregister_func(run_data)
+                        except Exception as e:
+                            logger.warning(
+                                f"Stop worker, ignored exception from deregister_func: {e}"
+                            )
+
+                    _deregister_func = safe_deregister_func
+                await _deregister_func(worker_run_data)
 
         await self._apply_worker(apply_req, _stop_worker)
         timecost = time.time() - start_time
@@ -487,8 +515,8 @@ class WorkerManagerAdapter(WorkerManager):
     async def start(self):
         return await self.worker_manager.start()
 
-    async def stop(self):
-        return await self.worker_manager.stop()
+    async def stop(self, ignore_exception: bool = False):
+        return await self.worker_manager.stop(ignore_exception=ignore_exception)
 
     def after_start(self, listener: Callable[["WorkerManager"], None]):
         if listener is not None:
@@ -631,7 +659,9 @@ async def api_model_shutdown(request: WorkerStartupRequest):
     return await worker_manager.model_shutdown(request)
 
 
-def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
+def _setup_fastapi(
+    worker_params: ModelWorkerParameters, app=None, ignore_exception: bool = False
+):
     if not app:
         app = FastAPI()
     if worker_params.standalone:
@@ -666,7 +696,7 @@ def _setup_fastapi(worker_params: ModelWorkerParameters, app=None):
 
     @app.on_event("shutdown")
     async def startup_event():
-        await worker_manager.stop()
+        await worker_manager.stop(ignore_exception=ignore_exception)
 
     return app
 
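A trimmed sketch of how the `ignore_exception` flag reaches the FastAPI shutdown hook; the worker manager here is a stand-in object and only the wiring mirrors the hunk above.

```python
from fastapi import FastAPI


class DummyWorkerManager:
    """Minimal stand-in exposing the stop() signature used above."""

    async def stop(self, ignore_exception: bool = False) -> None:
        print(f"stopping workers, ignore_exception={ignore_exception}")


def setup_app(app: FastAPI = None, ignore_exception: bool = False) -> FastAPI:
    app = app or FastAPI()
    worker_manager = DummyWorkerManager()

    @app.on_event("shutdown")
    async def _shutdown() -> None:
        # Embedded (in-client) mode passes ignore_exception=True so a failed
        # deregistration does not turn shutdown into a crash.
        await worker_manager.stop(ignore_exception=ignore_exception)

    return app


app = setup_app(ignore_exception=True)
```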
@@ -837,7 +867,7 @@ def initialize_worker_manager_in_client(
     worker_params.register = True
     worker_params.port = local_port
     logger.info(f"Worker params: {worker_params}")
-    _setup_fastapi(worker_params, app)
+    _setup_fastapi(worker_params, app, ignore_exception=True)
     _start_local_worker(worker_manager, worker_params)
     worker_manager.after_start(start_listener)
     _start_local_embedding_worker(
@@ -17,7 +17,7 @@ class RemoteWorkerManager(LocalWorkerManager):
         for listener in self.start_listeners:
             listener(self)
 
-    async def stop(self):
+    async def stop(self, ignore_exception: bool = False):
         pass
 
     async def _fetch_from_worker(
@ -44,11 +44,6 @@ static_file_path = os.path.join(os.getcwd(), "server/static")
|
||||
CFG = Config()
|
||||
|
||||
|
||||
def signal_handler():
|
||||
print("in order to avoid chroma db atexit problem")
|
||||
os._exit(0)
|
||||
|
||||
|
||||
def swagger_monkey_patch(*args, **kwargs):
|
||||
return get_swagger_ui_html(
|
||||
*args,
|
||||
@ -176,7 +171,6 @@ def run_uvicorn(param: WebWerverParameters):
|
||||
port=param.port,
|
||||
log_level=logging_str_to_uvicorn_level(param.log_level),
|
||||
)
|
||||
signal.signal(signal.SIGINT, signal_handler())
|
||||
|
||||
|
||||
def run_webserver(param: WebWerverParameters = None):
|
||||
|
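For reference, the removed registration passed the result of `signal_handler()` to `signal.signal` rather than the handler itself, and the handler took no arguments. A conventional registration would look like the sketch below; it is purely illustrative, since this commit removes the handler rather than replacing it.

```python
import os
import signal


def handle_sigint(signum, frame):
    # Body taken from the removed handler: hard-exit so chroma db's atexit
    # hooks never run.
    print("in order to avoid chroma db atexit problem")
    os._exit(0)


# Pass the function itself; signal handlers receive (signum, frame).
signal.signal(signal.SIGINT, handle_sigint)
```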
setup.py
@@ -14,7 +14,10 @@ from setuptools import find_packages
 with open("README.md", mode="r", encoding="utf-8") as fh:
     long_description = fh.read()
 
-BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "false").lower() == "true"
+BUILD_NO_CACHE = os.getenv("BUILD_NO_CACHE", "true").lower() == "true"
+LLAMA_CPP_GPU_ACCELERATION = (
+    os.getenv("LLAMA_CPP_GPU_ACCELERATION", "true").lower() == "true"
+)
 
 
 def parse_requirements(file_name: str) -> List[str]:
@@ -249,21 +252,29 @@ def llama_cpp_python_cuda_requires():
     if not cuda_version:
         print("CUDA not support, use cpu version")
         return
+    if not LLAMA_CPP_GPU_ACCELERATION:
+        print("Disable GPU acceleration")
+        return
+    # Supports GPU acceleration
     device = "cu" + cuda_version.replace(".", "")
     os_type, cpu_avx = get_cpu_avx_support()
     print(f"OS: {os_type}, cpu avx: {cpu_avx}")
     supported_os = [OSType.WINDOWS, OSType.LINUX]
     if os_type not in supported_os:
         print(
             f"llama_cpp_python_cuda just support in os: {[r._value_ for r in supported_os]}"
         )
         return
-    if cpu_avx == AVXType.AVX2 or AVXType.AVX512:
-        cpu_avx = AVXType.AVX
-    cpu_avx = cpu_avx._value_
+    cpu_device = ""
+    if cpu_avx == AVXType.AVX2 or cpu_avx == AVXType.AVX512:
+        cpu_device = "avx"
+    else:
+        cpu_device = "basic"
+    device += cpu_device
     base_url = "https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui"
-    llama_cpp_version = "0.1.77"
+    llama_cpp_version = "0.2.10"
     py_version = "cp310"
-    os_pkg_name = "linux_x86_64" if os_type == OSType.LINUX else "win_amd64"
+    os_pkg_name = "manylinux_2_31_x86_64" if os_type == OSType.LINUX else "win_amd64"
     extra_index_url = f"{base_url}/llama_cpp_python_cuda-{llama_cpp_version}+{device}-{py_version}-{py_version}-{os_pkg_name}.whl"
     extra_index_url, _ = encode_url(extra_index_url)
     print(f"Install llama_cpp_python_cuda from {extra_index_url}")
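To make the wheel URL scheme concrete, the sketch below reassembles it from the pieces used above, with example values hard-coded (CUDA 11.8, AVX2, Linux); these inputs are assumptions, and the real function detects them at build time and also URL-encodes the result.

```python
# Pieces taken from the hunk above.
base_url = "https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui"
llama_cpp_version = "0.2.10"
py_version = "cp310"

# Assumed example environment: CUDA 11.8 with AVX2 support on Linux.
cuda_version = "11.8"
device = "cu" + cuda_version.replace(".", "") + "avx"
os_pkg_name = "manylinux_2_31_x86_64"

extra_index_url = (
    f"{base_url}/llama_cpp_python_cuda-{llama_cpp_version}+{device}"
    f"-{py_version}-{py_version}-{os_pkg_name}.whl"
)
print(extra_index_url)
```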
@@ -298,7 +309,7 @@ def core_requires():
         "langchain>=0.0.286",
         "SQLAlchemy",
         "pymysql",
-        "duckdb",
+        "duckdb==0.8.1",
         "duckdb-engine",
         "jsonschema",
         # TODO move transformers to default
@@ -312,7 +323,6 @@ def knowledge_requires():
     """
     setup_spec.extras["knowledge"] = [
         "spacy==3.5.3",
-        # "chromadb==0.3.22",
         "chromadb==0.4.10",
         "markdown",
         "bs4",