feat: Modify config for quantization and doc

Author: FangYin Cheng
Date:   2023-08-02 19:29:25 +08:00
Parent: d8a4b776d5
Commit: bceb609cf6
6 changed files with 51 additions and 34 deletions


@@ -39,7 +39,11 @@ class ModelWorker:
print(f"Loading {model_name} LLM ModelServer in {device}! Please Wait......")
self.ml = ModelLoader(model_path=model_path, model_name=self.model_name)
self.model, self.tokenizer = self.ml.loader(
num_gpus, load_8bit=ISLOAD_8BIT, debug=ISDEBUG
num_gpus,
load_8bit=CFG.IS_LOAD_8BIT,
load_4bit=CFG.IS_LOAD_4BIT,
debug=ISDEBUG,
max_gpu_memory=CFG.MAX_GPU_MEMORY,
)
if not isinstance(self.model, str):
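
For context, the new CFG flags control 8-bit/4-bit quantization and a per-GPU memory cap when the model is loaded. The ModelLoader internals are not part of this diff; the following is only a minimal sketch of how such flags are commonly honored with Hugging Face transformers and bitsandbytes, with the function name and structure being illustrative assumptions rather than the repository's actual code.

    # Illustrative sketch only -- not the repository's ModelLoader implementation.
    # Assumes transformers and bitsandbytes are installed.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    def load_quantized(model_path, load_8bit=False, load_4bit=False, max_gpu_memory=None):
        """Load a causal LM, optionally in 8-bit or 4-bit precision."""
        quant_config = None
        if load_4bit:
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
            )
        elif load_8bit:
            quant_config = BitsAndBytesConfig(load_in_8bit=True)

        # max_gpu_memory (e.g. "16GiB") caps usage on each visible GPU when sharding.
        max_memory = None
        if max_gpu_memory:
            max_memory = {i: max_gpu_memory for i in range(torch.cuda.device_count())}

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=quant_config,
            device_map="auto",
            max_memory=max_memory,
        )
        return model, tokenizer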