From a4574aa614cbf2fe169e9eecd24520d4f3b6c699 Mon Sep 17 00:00:00 2001 From: FangYin Cheng Date: Thu, 3 Aug 2023 14:13:50 +0800 Subject: [PATCH] feat: Support vicuna-v1.5 and WizardLM-v1.2 --- README.md | 2 + README.zh.md | 15 ++++--- docker/build_all_images.sh | 7 +++- docs/getting_started/getting_started.md | 39 +++++++++++++++++- pilot/configs/config.py | 4 +- pilot/configs/model_config.py | 5 +++ pilot/model/adapter.py | 6 +++ pilot/model/conversation.py | 15 +++++++ pilot/model/loader.py | 27 ++++++++----- pilot/server/chat_adapter.py | 54 +++++++++++++++---------- pilot/server/llmserver.py | 15 +++---- 11 files changed, 140 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index d0e5e142f..63b29f0aa 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,9 @@ Currently, we have released multiple key features, which are listed below to dem - Support for unstructured data such as PDF, TXT, Markdown, CSV, DOC, PPT, and WebURL - Multi LLMs Support, Supports multiple large language models, currently supporting + - 🔥 Vicuna-v1.5(7b,13b) - 🔥 llama-2(7b,13b,70b) + - WizardLM-v1.2(13b) - Vicuna (7b,13b) - ChatGLM-6b (int4,int8) - ChatGLM2-6b (int4,int8) diff --git a/README.zh.md b/README.zh.md index 043280ae3..becffc81c 100644 --- a/README.zh.md +++ b/README.zh.md @@ -112,12 +112,15 @@ https://github.com/csunny/DB-GPT/assets/13723926/55f31781-1d49-4757-b96e-7ef6d3d - 多模型支持 - 支持多种大语言模型, 当前已支持如下模型: - - Vicuna(7b,13b) - - ChatGLM-6b(int4,int8) - - guanaco(7b,13b,33b) - - Gorilla(7b,13b) - - 🔥 llama-2(7b,13b,70b) - - baichuan(7b,13b) + - 🔥 Vicuna-v1.5(7b,13b) + - 🔥 llama-2(7b,13b,70b) + - WizardLM-v1.2(13b) + - Vicuna (7b,13b) + - ChatGLM-6b (int4,int8) + - ChatGLM2-6b (int4,int8) + - guanaco(7b,13b,33b) + - Gorilla(7b,13b) + - baichuan(7b,13b) ## 架构方案 DB-GPT基于 [FastChat](https://github.com/lm-sys/FastChat) 构建大模型运行环境,并提供 vicuna 作为基础的大语言模型。此外,我们通过LangChain提供私域知识库问答能力。同时我们支持插件模式, 在设计上原生支持Auto-GPT插件。我们的愿景是让围绕数据库和LLM构建应用程序更加简便和便捷。 diff --git a/docker/build_all_images.sh b/docker/build_all_images.sh index ec307f1fe..012d41df1 100755 --- a/docker/build_all_images.sh +++ b/docker/build_all_images.sh @@ -4,10 +4,15 @@ SCRIPT_LOCATION=$0 cd "$(dirname "$SCRIPT_LOCATION")" WORK_DIR=$(pwd) +if [[ " $* " == *" --help "* ]] || [[ " $* " == *" -h "* ]]; then + bash $WORK_DIR/base/build_image.sh "$@" + exit 0 +fi + bash $WORK_DIR/base/build_image.sh "$@" if [ 0 -ne $? ]; then - ehco "Error: build base image failed" + echo "Error: build base image failed" exit 1 fi diff --git a/docs/getting_started/getting_started.md b/docs/getting_started/getting_started.md index a151871b9..0b3df23f3 100644 --- a/docs/getting_started/getting_started.md +++ b/docs/getting_started/getting_started.md @@ -48,6 +48,7 @@ Notice make sure you have install git-lfs ``` ```bash +git clone https://huggingface.co/lmsys/vicuna-13b-v1.5 git clone https://huggingface.co/Tribbiani/vicuna-13b git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 git clone https://huggingface.co/GanymedeNil/text2vec-large-chinese @@ -62,6 +63,8 @@ cp .env.template .env You can configure basic parameters in the .env file, for example setting LLM_MODEL to the model to be used +([Vicuna-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) based on llama-2 has been released, we recommend you set `LLM_MODEL=vicuna-13b-v1.5` to try this model) + ### 3. Run You can refer to this document to obtain the Vicuna weights: [Vicuna](https://github.com/lm-sys/FastChat/blob/main/README.md#model-weights) . 
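Whichever weights you download, note that `LLM_MODEL` must be one of the keys of `LLM_MODEL_CONFIG` in `pilot/configs/model_config.py`, and the directory name under `models/` must match the configured path. A minimal sketch of that lookup (abbreviated mapping, assumed `./models` root; not the actual loading code):

```python
import os

# Abbreviated form of LLM_MODEL_CONFIG from pilot/configs/model_config.py;
# MODEL_PATH is assumed to be the ./models directory the weights were cloned into.
MODEL_PATH = "./models"
LLM_MODEL_CONFIG = {
    "vicuna-13b": os.path.join(MODEL_PATH, "vicuna-13b"),
    "vicuna-13b-v1.5": os.path.join(MODEL_PATH, "vicuna-13b-v1.5"),  # new in this release
}

llm_model = os.getenv("LLM_MODEL", "vicuna-13b-v1.5")  # value taken from .env
print(f"{llm_model} -> {LLM_MODEL_CONFIG[llm_model]}")
```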
@@ -107,6 +110,16 @@ db-gpt-allinone latest e1ffd20b85ac 45 minutes ago 14.5GB db-gpt latest e36fb0cca5d9 3 hours ago 14GB ``` +You can pass some parameters to docker/build_all_images.sh. +```bash +$ bash docker/build_all_images.sh \ +--base-image nvidia/cuda:11.8.0-devel-ubuntu22.04 \ +--pip-index-url https://pypi.tuna.tsinghua.edu.cn/simple \ +--language zh +``` + +You can execute the command `bash docker/build_all_images.sh --help` to see more usage. + #### 4.2. Run all in one docker container **Run with local model** @@ -158,7 +171,7 @@ $ docker run --gpus "device=0" -d -p 3306:3306 \ - `-e LLM_MODEL=proxyllm`, means we use proxy llm(openai interface, fastchat interface...) - `-v /data/models/text2vec-large-chinese:/app/models/text2vec-large-chinese`, means we mount the local text2vec model to the docker container. -#### 4.2. Run with docker compose +#### 4.3. Run with docker compose ```bash $ docker compose up -d @@ -197,6 +210,8 @@ CUDA_VISIBLE_DEVICES=0 python3 pilot/server/dbgpt_server.py CUDA_VISIBLE_DEVICES=3,4,5,6 python3 pilot/server/dbgpt_server.py ```` +You can modify the setting `MAX_GPU_MEMORY=xxGib` in `.env` file to configure the maximum memory used by each GPU. + ### 6. Not Enough Memory DB-GPT supported 8-bit quantization and 4-bit quantization. @@ -205,4 +220,24 @@ You can modify the setting `QUANTIZE_8bit=True` or `QUANTIZE_4bit=True` in `.env Llama-2-70b with 8-bit quantization can run with 80 GB of VRAM, and 4-bit quantization can run with 48 GB of VRAM. -Note: you need to install the latest dependencies according to [requirements.txt](https://github.com/eosphoros-ai/DB-GPT/blob/main/requirements.txt). \ No newline at end of file +Note: you need to install the latest dependencies according to [requirements.txt](https://github.com/eosphoros-ai/DB-GPT/blob/main/requirements.txt). + + +Here are some of the VRAM size usage of the models we tested in some common scenarios. 
+ +| Model | Quantize | VRAM Size | +| --------- | --------- | --------- | +| vicuna-7b-v1.5 | 4-bit | 8 GB | +| vicuna-7b-v1.5 | 8-bit | 12 GB | +| vicuna-13b-v1.5 | 4-bit | 12 GB | +| vicuna-13b-v1.5 | 8-bit | 20 GB | +| llama-2-7b | 4-bit | 8 GB | +| llama-2-7b | 8-bit | 12 GB | +| llama-2-13b | 4-bit | 12 GB | +| llama-2-13b | 8-bit | 20 GB | +| llama-2-70b | 4-bit | 48 GB | +| llama-2-70b | 8-bit | 80 GB | +| baichuan-7b | 4-bit | 8 GB | +| baichuan-7b | 8-bit | 12 GB | +| baichuan-13b | 4-bit | 12 GB | +| baichuan-13b | 8-bit | 20 GB | \ No newline at end of file diff --git a/pilot/configs/config.py b/pilot/configs/config.py index 3bd8335ff..4d695da7d 100644 --- a/pilot/configs/config.py +++ b/pilot/configs/config.py @@ -155,8 +155,8 @@ class Config(metaclass=Singleton): # QLoRA self.QLoRA = os.getenv("QUANTIZE_QLORA", "True") - self.IS_LOAD_8BIT = bool(os.getenv("QUANTIZE_8bit", "True")) - self.IS_LOAD_4BIT = bool(os.getenv("QUANTIZE_4bit", "False")) + self.IS_LOAD_8BIT = os.getenv("QUANTIZE_8bit", "True") == "True" + self.IS_LOAD_4BIT = os.getenv("QUANTIZE_4bit", "False") == "True" if self.IS_LOAD_8BIT and self.IS_LOAD_4BIT: self.IS_LOAD_8BIT = False diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py index 7b901e89a..9ca56824a 100644 --- a/pilot/configs/model_config.py +++ b/pilot/configs/model_config.py @@ -33,6 +33,9 @@ LLM_MODEL_CONFIG = { "flan-t5-base": os.path.join(MODEL_PATH, "flan-t5-base"), "vicuna-13b": os.path.join(MODEL_PATH, "vicuna-13b"), "vicuna-7b": os.path.join(MODEL_PATH, "vicuna-7b"), + # (Llama2 based) see https://huggingface.co/lmsys/vicuna-13b-v1.5 + "vicuna-13b-v1.5": os.path.join(MODEL_PATH, "vicuna-13b-v1.5"), + "vicuna-7b-v1.5": os.path.join(MODEL_PATH, "vicuna-7b-v1.5"), "text2vec": os.path.join(MODEL_PATH, "text2vec-large-chinese"), "sentence-transforms": os.path.join(MODEL_PATH, "all-MiniLM-L6-v2"), "codegen2-1b": os.path.join(MODEL_PATH, "codegen2-1B"), @@ -59,6 +62,8 @@ LLM_MODEL_CONFIG = { "baichuan-13b": os.path.join(MODEL_PATH, "Baichuan-13B-Chat"), # please rename "fireballoon/baichuan-vicuna-chinese-7b" to "baichuan-7b" "baichuan-7b": os.path.join(MODEL_PATH, "baichuan-7b"), + # (Llama2 based) We only support WizardLM-13B-V1.2 for now, which is trained from Llama-2 13b, see https://huggingface.co/WizardLM/WizardLM-13B-V1.2 + "wizardlm-13b": os.path.join(MODEL_PATH, "WizardLM-13B-V1.2"), } # Load model config diff --git a/pilot/model/adapter.py b/pilot/model/adapter.py index 73e2682e2..1c8562e78 100644 --- a/pilot/model/adapter.py +++ b/pilot/model/adapter.py @@ -291,6 +291,11 @@ class BaichuanAdapter(BaseLLMAdaper): return model, tokenizer +class WizardLMAdapter(BaseLLMAdaper): + def match(self, model_path: str): + return "wizardlm" in model_path.lower() + + register_llm_model_adapters(VicunaLLMAdapater) register_llm_model_adapters(ChatGLMAdapater) register_llm_model_adapters(GuanacoAdapter) @@ -299,6 +304,7 @@ register_llm_model_adapters(GorillaAdapter) register_llm_model_adapters(GPT4AllAdapter) register_llm_model_adapters(Llama2Adapter) register_llm_model_adapters(BaichuanAdapter) +register_llm_model_adapters(WizardLMAdapter) # TODO Default support vicuna, other model need to tests and Evaluate # just for test_py, remove this later diff --git a/pilot/model/conversation.py b/pilot/model/conversation.py index 11ca03ed8..5ee54cfc1 100644 --- a/pilot/model/conversation.py +++ b/pilot/model/conversation.py @@ -299,6 +299,21 @@ register_conv_template( ) ) +# Vicuna v1.1 template +register_conv_template( + Conversation( + 
name="vicuna_v1.1", + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep=" ", + sep2="", + ) +) + # llama2 template # reference: https://github.com/facebookresearch/llama/blob/cfc3fc8c1968d390eb830e65c63865e980873a06/llama/generation.py#L212 register_conv_template( diff --git a/pilot/model/loader.py b/pilot/model/loader.py index d56ef6a47..7d33d8c6a 100644 --- a/pilot/model/loader.py +++ b/pilot/model/loader.py @@ -44,6 +44,8 @@ class ModelParams: def _check_multi_gpu_or_4bit_quantization(model_params: ModelParams): + # TODO: vicuna-v1.5 8-bit quantization info is slow + # TODO: support wizardlm quantization, see: https://huggingface.co/WizardLM/WizardLM-13B-V1.2/discussions/5 model_name = model_params.model_name.lower() supported_models = ["llama", "baichuan", "vicuna"] return any(m in model_name for m in supported_models) @@ -89,7 +91,6 @@ class ModelLoader(metaclass=Singleton): # TODO multi gpu support def loader( self, - num_gpus, load_8bit=False, load_4bit=False, debug=False, @@ -100,14 +101,13 @@ class ModelLoader(metaclass=Singleton): device=self.device, model_path=self.model_path, model_name=self.model_name, - num_gpus=num_gpus, max_gpu_memory=max_gpu_memory, cpu_offloading=cpu_offloading, load_8bit=load_8bit, load_4bit=load_4bit, debug=debug, ) - + logger.info(f"model_params:\n{model_params}") llm_adapter = get_llm_model_adapter(model_params.model_path) return huggingface_loader(llm_adapter, model_params) @@ -126,13 +126,14 @@ def huggingface_loader(llm_adapter: BaseLLMAdaper, model_params: ModelParams): } if num_gpus != 1: kwargs["device_map"] = "auto" - kwargs["max_memory"] = max_memory - elif model_params.max_gpu_memory: - logger.info( - f"There has max_gpu_memory from config: {model_params.max_gpu_memory}" - ) - max_memory = {i: model_params.max_gpu_memory for i in range(num_gpus)} - kwargs["max_memory"] = max_memory + if model_params.max_gpu_memory: + logger.info( + f"There has max_gpu_memory from config: {model_params.max_gpu_memory}" + ) + max_memory = {i: model_params.max_gpu_memory for i in range(num_gpus)} + kwargs["max_memory"] = max_memory + else: + kwargs["max_memory"] = max_memory logger.debug(f"max_memory: {max_memory}") elif device == "mps": @@ -282,6 +283,9 @@ def load_huggingface_quantization_model( # Loading the tokenizer if type(model) is LlamaForCausalLM: + logger.info( + f"Current model is type of: LlamaForCausalLM, load tokenizer by LlamaTokenizer" + ) tokenizer = LlamaTokenizer.from_pretrained( model_params.model_path, clean_up_tokenization_spaces=True ) @@ -294,6 +298,9 @@ def load_huggingface_quantization_model( except Exception as e: logger.warn(f"{str(e)}") else: + logger.info( + f"Current model type is not LlamaForCausalLM, load tokenizer by AutoTokenizer" + ) tokenizer = AutoTokenizer.from_pretrained( model_params.model_path, trust_remote_code=model_params.trust_remote_code, diff --git a/pilot/server/chat_adapter.py b/pilot/server/chat_adapter.py index d47bc6cc8..ab3aec94e 100644 --- a/pilot/server/chat_adapter.py +++ b/pilot/server/chat_adapter.py @@ -15,9 +15,11 @@ class BaseChatAdpter: def match(self, model_path: str): return True - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): """Return the generate stream handler func""" - pass + from pilot.model.inference import generate_stream + 
+ return generate_stream def get_conv_template(self, model_path: str) -> Conversation: return None @@ -105,10 +107,21 @@ def get_llm_chat_adapter(model_path: str) -> BaseChatAdpter: class VicunaChatAdapter(BaseChatAdpter): """Model chat Adapter for vicuna""" - def match(self, model_path: str): - return "vicuna" in model_path + def _is_llama2_based(self, model_path: str): + # see https://huggingface.co/lmsys/vicuna-13b-v1.5 + return "v1.5" in model_path.lower() - def get_generate_stream_func(self): + def match(self, model_path: str): + return "vicuna" in model_path.lower() + + def get_conv_template(self, model_path: str) -> Conversation: + if self._is_llama2_based(model_path): + return get_conv_template("vicuna_v1.1") + return None + + def get_generate_stream_func(self, model_path: str): + if self._is_llama2_based(model_path): + return super().get_generate_stream_func(model_path) return generate_stream @@ -118,7 +131,7 @@ class ChatGLMChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "chatglm" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): from pilot.model.llm_out.chatglm_llm import chatglm_generate_stream return chatglm_generate_stream @@ -130,7 +143,7 @@ class CodeT5ChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "codet5" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): # TODO pass @@ -141,7 +154,7 @@ class CodeGenChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "codegen" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): # TODO pass @@ -152,7 +165,7 @@ class GuanacoChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "guanaco" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): from pilot.model.llm_out.guanaco_llm import guanaco_generate_stream return guanaco_generate_stream @@ -164,7 +177,7 @@ class FalconChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "falcon" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): from pilot.model.llm_out.falcon_llm import falcon_generate_output return falcon_generate_output @@ -174,7 +187,7 @@ class ProxyllmChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "proxyllm" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): from pilot.model.llm_out.proxy_llm import proxyllm_generate_stream return proxyllm_generate_stream @@ -184,7 +197,7 @@ class GorillaChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "gorilla" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): from pilot.model.llm_out.gorilla_llm import generate_stream return generate_stream @@ -194,7 +207,7 @@ class GPT4AllChatAdapter(BaseChatAdpter): def match(self, model_path: str): return "gpt4all" in model_path - def get_generate_stream_func(self): + def get_generate_stream_func(self, model_path: str): from pilot.model.llm_out.gpt4all_llm import gpt4all_generate_stream return gpt4all_generate_stream @@ -207,11 +220,6 @@ class Llama2ChatAdapter(BaseChatAdpter): def get_conv_template(self, model_path: str) -> Conversation: return get_conv_template("llama-2") - def get_generate_stream_func(self): - from pilot.model.inference import generate_stream - - return generate_stream - 
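# Illustrative usage sketch (comments only, not part of this module): with the
# refactored API a caller resolves an adapter from the model path and asks it
# for a conversation template and a streaming handler. Paths are examples.
#
#   adapter = get_llm_chat_adapter("/app/models/vicuna-13b-v1.5")    # VicunaChatAdapter
#   conv = adapter.get_conv_template("/app/models/vicuna-13b-v1.5")  # "vicuna_v1.1" template
#   stream = adapter.get_generate_stream_func("/app/models/vicuna-13b-v1.5")
#
# For the older "vicuna-13b" weights, get_conv_template returns None and the
# previous generate_stream handler is used, so existing setups keep working.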
class BaichuanChatAdapter(BaseChatAdpter): def match(self, model_path: str): @@ -222,10 +230,13 @@ class BaichuanChatAdapter(BaseChatAdpter): return get_conv_template("baichuan-chat") return get_conv_template("zero_shot") - def get_generate_stream_func(self): - from pilot.model.inference import generate_stream - return generate_stream +class WizardLMChatAdapter(BaseChatAdpter): + def match(self, model_path: str): + return "wizardlm" in model_path.lower() + + def get_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("vicuna_v1.1") register_llm_model_chat_adapter(VicunaChatAdapter) @@ -236,6 +247,7 @@ register_llm_model_chat_adapter(GorillaChatAdapter) register_llm_model_chat_adapter(GPT4AllChatAdapter) register_llm_model_chat_adapter(Llama2ChatAdapter) register_llm_model_chat_adapter(BaichuanChatAdapter) +register_llm_model_chat_adapter(WizardLMChatAdapter) # Proxy model for test and develop, it's cheap for us now. register_llm_model_chat_adapter(ProxyllmChatAdapter) diff --git a/pilot/server/llmserver.py b/pilot/server/llmserver.py index 6ffdd616f..21789b9c8 100644 --- a/pilot/server/llmserver.py +++ b/pilot/server/llmserver.py @@ -31,15 +31,16 @@ CFG = Config() class ModelWorker: - def __init__(self, model_path, model_name, device, num_gpus=1): + def __init__(self, model_path, model_name, device): if model_path.endswith("/"): model_path = model_path[:-1] self.model_name = model_name or model_path.split("/")[-1] self.device = device print(f"Loading {model_name} LLM ModelServer in {device}! Please Wait......") - self.ml = ModelLoader(model_path=model_path, model_name=self.model_name) + self.ml: ModelLoader = ModelLoader( + model_path=model_path, model_name=self.model_name + ) self.model, self.tokenizer = self.ml.loader( - num_gpus, load_8bit=CFG.IS_LOAD_8BIT, load_4bit=CFG.IS_LOAD_4BIT, debug=ISDEBUG, @@ -60,7 +61,9 @@ class ModelWorker: self.context_len = 2048 self.llm_chat_adapter = get_llm_chat_adapter(model_path) - self.generate_stream_func = self.llm_chat_adapter.get_generate_stream_func() + self.generate_stream_func = self.llm_chat_adapter.get_generate_stream_func( + model_path + ) def start_check(self): print("LLM Model Loading Success!") @@ -111,9 +114,7 @@ class ModelWorker: model_path = LLM_MODEL_CONFIG[CFG.LLM_MODEL] -worker = ModelWorker( - model_path=model_path, model_name=CFG.LLM_MODEL, device=DEVICE, num_gpus=1 -) +worker = ModelWorker(model_path=model_path, model_name=CFG.LLM_MODEL, device=DEVICE) app = FastAPI() # from pilot.openapi.knowledge.knowledge_controller import router
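A note on the change in `pilot/configs/config.py`: `bool()` of any non-empty string is `True`, so the old code enabled 8-bit quantization even when `.env` contained `QUANTIZE_8bit=False`. A minimal standalone sketch of the difference (not project code):

```python
import os

# Simulate a user disabling 8-bit quantization in .env
os.environ["QUANTIZE_8bit"] = "False"

# Old behaviour: bool("False") is True, so quantization stayed on.
old_is_load_8bit = bool(os.getenv("QUANTIZE_8bit", "True"))

# New behaviour: an explicit string comparison honours the setting.
new_is_load_8bit = os.getenv("QUANTIZE_8bit", "True") == "True"

print(old_is_load_8bit, new_is_load_8bit)  # True False
```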
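The reworked branch in `pilot/model/loader.py` now applies `max_gpu_memory` in the multi-GPU path as well, so `MAX_GPU_MEMORY` from `.env` caps every visible device. A rough sketch of how that `max_memory` mapping is built for `from_pretrained` (values and variable names here are illustrative):

```python
import torch

# Rough sketch of the kwargs construction in huggingface_loader
# (pilot/model/loader.py); values are illustrative, not defaults.
max_gpu_memory = "16GiB"  # e.g. MAX_GPU_MEMORY=16GiB in .env; falsy if unset
num_gpus = torch.cuda.device_count()

kwargs = {"torch_dtype": torch.float16}
if num_gpus != 1:
    kwargs["device_map"] = "auto"
    if max_gpu_memory:
        # Cap every visible GPU at the configured amount.
        kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
    # otherwise the loader falls back to an automatically computed per-GPU budget

print(kwargs)
```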