diff --git a/pilot/model/adapter.py b/pilot/model/adapter.py
index 3bfe608d8..2c1089cec 100644
--- a/pilot/model/adapter.py
+++ b/pilot/model/adapter.py
@@ -108,7 +108,7 @@ class GuanacoAdapter(BaseLLMAdaper):
     def loader(self, model_path: str, from_pretrained_kwargs: dict):
         tokenizer = LlamaTokenizer.from_pretrained(model_path)
         model = AutoModelForCausalLM.from_pretrained(
-            model_path, load_in_4bit=True, device_map={"": 0}, **from_pretrained_kwargs
+            model_path, load_in_4bit=True, **from_pretrained_kwargs
         )
         return model, tokenizer
 
@@ -127,7 +127,6 @@ class FalconAdapater(BaseLLMAdaper):
                 model_path,
                 load_in_4bit=True,  # quantize
                 quantization_config=bnb_config,
-                device_map={"": 0},
                 trust_remote_code=True,
                 **from_pretrained_kwagrs,
             )
@@ -135,7 +134,6 @@ class FalconAdapater(BaseLLMAdaper):
             model = AutoModelForCausalLM.from_pretrained(
                 model_path,
                 trust_remote_code=True,
-                device_map={"": 0},
                 **from_pretrained_kwagrs,
             )
         return model, tokenizer
diff --git a/pilot/model/loader.py b/pilot/model/loader.py
index 6fd6143ff..6acbc9234 100644
--- a/pilot/model/loader.py
+++ b/pilot/model/loader.py
@@ -73,12 +73,12 @@ class ModelLoader(metaclass=Singleton):
         elif self.device == "cuda":
             kwargs = {"torch_dtype": torch.float16}
-            num_gpus = int(num_gpus)
+            num_gpus = torch.cuda.device_count()
 
             if num_gpus != 1:
                 kwargs["device_map"] = "auto"
-                if max_gpu_memory is None:
-                    kwargs["device_map"] = "sequential"
+                # if max_gpu_memory is None:
+                #     kwargs["device_map"] = "sequential"
 
                 available_gpu_memory = get_gpu_memory(num_gpus)
                 kwargs["max_memory"] = {
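
Net effect of the two changes: the adapters stop pinning the model to GPU 0 via device_map={"": 0}, and ModelLoader now counts the visible GPUs itself and passes device_map="auto" together with a max_memory budget, so the weights can be sharded across all cards. Below is a minimal sketch of that load path, assuming a standard transformers/accelerate setup; load_sharded and the "13GiB" per-GPU budget are illustrative placeholders, while get_gpu_memory is the helper already referenced in loader.py.

# Illustrative sketch only -- not the repository's code. Shows how the kwargs
# built in ModelLoader drive a multi-GPU load once the hard-coded
# device_map={"": 0} is removed from the adapters.
import torch
from transformers import AutoModelForCausalLM

def load_sharded(model_path: str):
    kwargs = {"torch_dtype": torch.float16}
    num_gpus = torch.cuda.device_count()  # mirrors the loader.py change
    if num_gpus != 1:
        # Let accelerate place layers across all visible GPUs.
        kwargs["device_map"] = "auto"
        # Assumed flat per-GPU budget; the real loader derives this from
        # get_gpu_memory(num_gpus) instead.
        kwargs["max_memory"] = {i: "13GiB" for i in range(num_gpus)}
    return AutoModelForCausalLM.from_pretrained(model_path, **kwargs)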