From 846406a07a7d16a3d4b7f045c4c03fcb2e53c0cc Mon Sep 17 00:00:00 2001 From: ver217 Date: Sat, 16 Apr 2022 21:29:31 +0800 Subject: [PATCH] [gemini] fix auto tensor placement policy (#775) --- colossalai/gemini/tensor_placement_policy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/colossalai/gemini/tensor_placement_policy.py b/colossalai/gemini/tensor_placement_policy.py index e3f3dff3d..cabd0352f 100644 --- a/colossalai/gemini/tensor_placement_policy.py +++ b/colossalai/gemini/tensor_placement_policy.py @@ -46,7 +46,9 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy): def __init__(self, mem_stats_collector: Optional[MemStatsCollector] = None) -> None: super().__init__(None, mem_stats_collector=mem_stats_collector) # model data will use 1-self._warmup_non_model_data_ratio CUDA memory in warmup phase + # TODO(ver217): make these args configurable self._warmup_non_model_data_ratio: float = 0.8 + self._steady_cuda_cap_ratio: float = 0.8 def evict_tensors(self, hold_cuda_tensor_list: List[StatefulTensor], @@ -63,6 +65,7 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy): else: # max non-model-data cuda memory consumption of this sampling moment and the next sampling moment. max_cuda_non_model_data_per_period = self.mem_stats_collector.next_period_non_model_data_usage('cuda') + cuda_capacity *= self._steady_cuda_cap_ratio total_cuda_model_data = cuda_capacity - max_cuda_non_model_data_per_period avail_cuda_model_data = total_cuda_model_data - used_cuda_model_data if avail_cuda_model_data < cuda_demand: