diff --git a/.env.template b/.env.template
index eefae44ac..f1418323d 100644
--- a/.env.template
+++ b/.env.template
@@ -27,6 +27,8 @@ MODEL_SERVER=http://127.0.0.1:8000
 LIMIT_MODEL_CONCURRENCY=5
 MAX_POSITION_EMBEDDINGS=4096
 QUANTIZE_QLORA=True
+QUANTIZE_8bit=True
+# QUANTIZE_4bit=False
 ## SMART_LLM_MODEL - Smart language model (Default: vicuna-13b)
 ## FAST_LLM_MODEL - Fast language model (Default: chatglm-6b)
 # SMART_LLM_MODEL=vicuna-13b
@@ -125,11 +127,15 @@ PROXY_SERVER_URL=https://api.openai.com/v1/chat/completions
 BARD_PROXY_API_KEY={your-bard-token}
 
 #*******************************************************************#
-# ** SUMMARY_CONFIG
+#** SUMMARY_CONFIG **#
 #*******************************************************************#
 SUMMARY_CONFIG=FAST
 
 #*******************************************************************#
-# ** MUlti-GPU
+#** Multi-GPU **#
 #*******************************************************************#
-NUM_GPUS = 1
+## See https://developer.nvidia.com/blog/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/
+## If CUDA_VISIBLE_DEVICES is not configured, all available GPUs will be used.
+# CUDA_VISIBLE_DEVICES=0
+## You can configure the maximum memory used by each GPU.
+# MAX_GPU_MEMORY=16GiB
diff --git a/docs/getting_started/getting_started.md b/docs/getting_started/getting_started.md
index e2a7c307e..a151871b9 100644
--- a/docs/getting_started/getting_started.md
+++ b/docs/getting_started/getting_started.md
@@ -80,26 +80,11 @@ Open http://localhost:5000 with your browser to see the product.
 If you want to access an external LLM service, you need to
 1.set the variables LLM_MODEL=YOUR_MODEL_NAME MODEL_SERVER=YOUR_MODEL_SERVER(eg:http://localhost:5000) in the .env file.
 2.execute dbgpt_server.py in light mode
+If you want to learn about dbgpt-webui, read https://github.com/csunny/DB-GPT/tree/new-page-framework/datacenter
+
 ```bash
 $ python pilot/server/dbgpt_server.py --light
 ```
 
-#### 3.1 Steps for Starting ChatGLM-6B and ChatGLM2-6B with Multiple Cards
-
-Modify the. env.template or pilot/configurations/config.py file NUM_ Number of GPUS (quantity is the actual number of graphics cards required for startup)
-
-At the same time, it is necessary to specify the required gpu card ID before starting the command (note that the number of gpu cards specified is consistent with the number of NUM_GPUS), as shown below:
-
-````shell
-# Specify 1 gpu card
-NUM_GPUS = 1
-CUDA_VISIBLE_DEVICES=0 python3 pilot/server/dbgpt_server.py
-
-# Specify 4 gpus card
-NUM_GPUS = 4
-CUDA_VISIBLE_DEVICES=3,4,5,6 python3 pilot/server/dbgpt_server.py
-````
-
-If you want to learn about dbgpt-webui, read https://github.com/csunny/DB-GPT/tree/new-page-framework/datacenter
 
 ### 4. Docker (Experimental)
@@ -196,3 +181,28 @@ $ docker logs db-gpt-webserver-1 -f
 Open http://localhost:5000 with your browser to see the product.
 
 You can open docker-compose.yml in the project root directory to see more details.
+
+
+### 5. Multiple GPUs
+
+DB-GPT uses all available GPUs by default. You can set `CUDA_VISIBLE_DEVICES=0,1` in the `.env` file to use specific GPU IDs.
+
+Optionally, you can also specify the GPU IDs to use before the start command, as shown below:
+
+````shell
+# Specify 1 GPU
+CUDA_VISIBLE_DEVICES=0 python3 pilot/server/dbgpt_server.py
+
+# Specify 4 GPUs
+CUDA_VISIBLE_DEVICES=3,4,5,6 python3 pilot/server/dbgpt_server.py
+````
+
+### 6. Not Enough Memory
+
+DB-GPT supports 8-bit and 4-bit quantization.
+
+You can set `QUANTIZE_8bit=True` or `QUANTIZE_4bit=True` in the `.env` file to enable quantization (8-bit quantization is enabled by default).
+
+Llama-2-70b can run with 80 GB of VRAM using 8-bit quantization, and with 48 GB of VRAM using 4-bit quantization.
+
+Note: you need to install the latest dependencies according to [requirements.txt](https://github.com/eosphoros-ai/DB-GPT/blob/main/requirements.txt).
\ No newline at end of file
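For reference, a small stand-alone snippet (not part of this patch, and assuming only the CUDA-enabled PyTorch that DB-GPT already depends on) can list the GPUs that `CUDA_VISIBLE_DEVICES` currently exposes and their memory, which helps when choosing the `CUDA_VISIBLE_DEVICES`, `MAX_GPU_MEMORY` and `QUANTIZE_*` settings documented above:

```python
import torch

# Report the GPUs visible under the current CUDA_VISIBLE_DEVICES setting,
# with their total memory, to help decide whether quantization is needed.
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {props.name}, {props.total_memory / 1024**3:.1f} GiB")
else:
    print("No CUDA device visible; the model would be loaded on CPU.")
```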
diff --git a/pilot/configs/config.py b/pilot/configs/config.py
index f48f5e257..3bd8335ff 100644
--- a/pilot/configs/config.py
+++ b/pilot/configs/config.py
@@ -29,7 +29,7 @@ class Config(metaclass=Singleton):
         self.skip_reprompt = False
         self.temperature = float(os.getenv("TEMPERATURE", 0.7))
 
-        self.NUM_GPUS = int(os.getenv("NUM_GPUS", 1))
+        # self.NUM_GPUS = int(os.getenv("NUM_GPUS", 1))
 
         self.execute_local_commands = (
             os.getenv("EXECUTE_LOCAL_COMMANDS", "False") == "True"
@@ -145,7 +145,6 @@ class Config(metaclass=Singleton):
         self.MODEL_SERVER = os.getenv(
             "MODEL_SERVER", "http://127.0.0.1" + ":" + str(self.MODEL_PORT)
         )
-        self.ISLOAD_8BIT = os.getenv("ISLOAD_8BIT", "True") == "True"
 
         ### Vector Store Configuration
         self.VECTOR_STORE_TYPE = os.getenv("VECTOR_STORE_TYPE", "Chroma")
@@ -156,6 +155,10 @@ class Config(metaclass=Singleton):
 
         # QLoRA
         self.QLoRA = os.getenv("QUANTIZE_QLORA", "True")
+        self.IS_LOAD_8BIT = os.getenv("QUANTIZE_8bit", "True") == "True"
+        self.IS_LOAD_4BIT = os.getenv("QUANTIZE_4bit", "False") == "True"
+        if self.IS_LOAD_8BIT and self.IS_LOAD_4BIT:
+            self.IS_LOAD_8BIT = False
 
         ### EMBEDDING Configuration
         self.EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text2vec")
@@ -164,6 +167,8 @@ class Config(metaclass=Singleton):
         ### SUMMARY_CONFIG Configuration
         self.SUMMARY_CONFIG = os.getenv("SUMMARY_CONFIG", "FAST")
+        self.MAX_GPU_MEMORY = os.getenv("MAX_GPU_MEMORY", None)
+
 
     def set_debug_mode(self, value: bool) -> None:
         """Set the debug mode value"""
         self.debug_mode = value
diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py
index c9d8abd6d..7b901e89a 100644
--- a/pilot/configs/model_config.py
+++ b/pilot/configs/model_config.py
@@ -62,7 +62,6 @@ LLM_MODEL_CONFIG = {
 }
 
 # Load model config
-ISLOAD_8BIT = True
 ISDEBUG = False
 
 VECTOR_SEARCH_TOP_K = 10
diff --git a/pilot/model/adapter.py b/pilot/model/adapter.py
index 26cbe9371..73e2682e2 100644
--- a/pilot/model/adapter.py
+++ b/pilot/model/adapter.py
@@ -118,13 +118,7 @@ class ChatGLMAdapater(BaseLLMAdaper):
     def match(self, model_path: str):
         return "chatglm" in model_path
 
-    def loader(
-        self,
-        model_path: str,
-        from_pretrained_kwargs: dict,
-        device_map=None,
-        num_gpus=CFG.NUM_GPUS,
-    ):
+    def loader(self, model_path: str, from_pretrained_kwargs: dict):
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
         if DEVICE != "cuda":
@@ -133,6 +127,8 @@ class ChatGLMAdapater(BaseLLMAdaper):
             ).float()
             return model, tokenizer
         else:
+            device_map = None
+            num_gpus = torch.cuda.device_count()
             model = (
                 AutoModel.from_pretrained(
                     model_path, trust_remote_code=True, **from_pretrained_kwargs
@@ -141,9 +137,6 @@ class ChatGLMAdapater(BaseLLMAdaper):
             )
             from accelerate import dispatch_model
 
-            # model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
-            #     **from_pretrained_kwargs).half()
-
             if device_map is None:
                 device_map = auto_configure_device_map(num_gpus)
 
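The `pilot/configs/config.py` change above reads the new flags from environment strings. A minimal sketch of the intended behaviour (the `env_flag` helper is hypothetical, not part of the patch): `os.getenv` returns strings, so the value is compared against `"True"` rather than passed to `bool()`, and 4-bit takes precedence when both flags are enabled.

```python
import os

# Hypothetical helper: os.getenv() returns strings, and bool("False") is True,
# so the value has to be compared against "True" explicitly.
def env_flag(name: str, default: str) -> bool:
    return os.getenv(name, default) == "True"

load_8bit = env_flag("QUANTIZE_8bit", "True")   # 8-bit enabled by default
load_4bit = env_flag("QUANTIZE_4bit", "False")  # 4-bit disabled by default

# When both are requested, 4-bit quantization takes precedence.
if load_8bit and load_4bit:
    load_8bit = False

print(f"load_8bit={load_8bit}, load_4bit={load_4bit}")
```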
diff --git a/pilot/server/llmserver.py b/pilot/server/llmserver.py
index b02dc8525..6ffdd616f 100644
--- a/pilot/server/llmserver.py
+++ b/pilot/server/llmserver.py
@@ -39,7 +39,11 @@ class ModelWorker:
         print(f"Loading {model_name} LLM ModelServer in {device}! Please Wait......")
         self.ml = ModelLoader(model_path=model_path, model_name=self.model_name)
         self.model, self.tokenizer = self.ml.loader(
-            num_gpus, load_8bit=ISLOAD_8BIT, debug=ISDEBUG
+            num_gpus,
+            load_8bit=CFG.IS_LOAD_8BIT,
+            load_4bit=CFG.IS_LOAD_4BIT,
+            debug=ISDEBUG,
+            max_gpu_memory=CFG.MAX_GPU_MEMORY,
         )
 
         if not isinstance(self.model, str):
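`ModelLoader.loader` itself is not shown in this diff, so, as a sketch only, this is the usual transformers/accelerate/bitsandbytes path that parameters named `load_8bit`, `load_4bit` and `max_gpu_memory` normally drive (the `load_model` function and its defaults are assumptions for illustration, not the project's implementation):

```python
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# Sketch of a quantization-aware loader; requires accelerate, plus bitsandbytes
# for the 8-bit/4-bit paths.
def load_model(
    model_path: str,
    load_8bit: bool = False,
    load_4bit: bool = False,
    max_gpu_memory: Optional[str] = None,
):
    kwargs = {"device_map": "auto"}
    if max_gpu_memory:
        # Apply the same per-GPU cap (e.g. "16GiB") to every visible device.
        kwargs["max_memory"] = {
            i: max_gpu_memory for i in range(torch.cuda.device_count())
        }
    if load_4bit:
        kwargs["load_in_4bit"] = True
    elif load_8bit:
        kwargs["load_in_8bit"] = True

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, trust_remote_code=True, **kwargs
    )
    return model, tokenizer
```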