Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-07-05 03:26:48 +00:00)
[gemini] polish code (#855)

commit f0e654558f
parent 29159d9b5b
@@ -42,7 +42,7 @@ class StatefulTensorMgr(object):
         by mem_stats_collector, which should belongs to a Sharded Model.
         """
         # find stateful tensor in state COMPUTE
-        cuda_demand = 0
+        cuda_demand = StatefulTensor.GST_MGR.state_mem['cpu'][TensorState.COMPUTE]
         move_to_cuda_tensor_list = []
         hold_cuda_tensor_list = []
         for tensor in self._stateful_tensor_list:
@@ -55,7 +55,6 @@ class StatefulTensorMgr(object):
             elif tensor.device.type == 'cpu':
                 if tensor.state == TensorState.COMPUTE:
                     move_to_cuda_tensor_list.append(tensor)
-                    cuda_demand += colo_tensor_mem_usage(tensor.payload)[1]
             else:
                 raise RuntimeError
         self._cpu_gpu_move_volume += self._tensor_placement_policy.evict_tensors(hold_cuda_tensor_list,
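Note: taken together, the two hunks above replace the incremental, per-tensor accumulation of `cuda_demand` with a single lookup in the global stateful-tensor manager. A minimal before/after sketch, assuming `StatefulTensor.GST_MGR.state_mem` is a nested mapping that already aggregates payload bytes by device type and tensor state (only the names appearing in the diff are real; the surrounding loop is reconstructed for illustration):

    # before: walk every managed tensor and sum CPU-resident COMPUTE payloads
    cuda_demand = 0
    for tensor in self._stateful_tensor_list:
        if tensor.device.type == 'cpu' and tensor.state == TensorState.COMPUTE:
            cuda_demand += colo_tensor_mem_usage(tensor.payload)[1]

    # after: the global manager already tracks bytes per (device, state) bucket
    cuda_demand = StatefulTensor.GST_MGR.state_mem['cpu'][TensorState.COMPUTE]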
@@ -66,7 +65,7 @@ class StatefulTensorMgr(object):
         # move COMPUTE tensors to CUDA
         for t in move_to_cuda_tensor_list:
             colo_model_data_tensor_move_inline(t, get_current_device())
-            self._cpu_gpu_move_volume += t.payload.numel() * t.payload.element_size()
+            self._cpu_gpu_move_volume += t.payload_size

     @property
     def cpu_gpu_move_volume(self):
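Note: both volume updates in this commit switch from recomputing `numel() * element_size()` at the call site to reading `t.payload_size`. A hedged sketch of how such a property could look on the stateful tensor class; the real definition lives elsewhere in the repository and is not part of this diff:

    import torch

    class StatefulTensor:
        def __init__(self, payload: torch.Tensor):
            self.payload = payload

        @property
        def payload_size(self) -> int:
            # bytes occupied by the payload; assumed to mirror the expression
            # `payload.numel() * payload.element_size()` that this diff replaces
            return self.payload.numel() * self.payload.element_size()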
@@ -76,7 +76,6 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
         Returns:
             int: the volume of memory that is evicted
         """
-        volume = 0
         cuda_capacity = colo_device_memory_capacity(get_current_device())
         used_cuda_model_data = StatefulTensor.GST_MGR.total_mem['cuda']
         if warmup:
@@ -88,11 +87,12 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
             cuda_capacity *= self._steady_cuda_cap_ratio
         total_cuda_model_data = cuda_capacity - max_cuda_non_model_data_per_period
         avail_cuda_model_data = total_cuda_model_data - used_cuda_model_data
+
+        freed_cuda_model_data = 0
         if avail_cuda_model_data < cuda_demand:
             # Move cuda_demand - avail_cuda_model_data volume of tensors
             # to_free_cuda_model_data = cuda_demand - avail_cuda_model_data
             to_free_cuda_model_data = cuda_demand - avail_cuda_model_data
-            freed_cuda_model_data = 0
             to_free_tensor_list = hold_cuda_tensor_list
             if not warmup:
                 next_compute_idx = {t: len(compute_list) for t in hold_cuda_tensor_list}
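Note: hoisting `freed_cuda_model_data = 0` out of the `if` branch is what lets the method return it unconditionally (see the final hunk); otherwise the no-eviction path would reference an unbound local. A simplified, hypothetical skeleton of the resulting control flow:

    def evict_tensors_sketch(avail_cuda_model_data: int, cuda_demand: int) -> int:
        # hypothetical skeleton of AutoTensorPlacementPolicy.evict_tensors after
        # this commit; only the accounting structure is shown
        freed_cuda_model_data = 0              # defined on every code path
        if avail_cuda_model_data < cuda_demand:
            ...                                # eviction loop adds t.payload_size here
        return freed_cuda_model_data           # 0 when no eviction was needed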
@@ -104,15 +104,14 @@ class AutoTensorPlacementPolicy(TensorPlacementPolicy):
             for t in to_free_tensor_list:
                 if freed_cuda_model_data >= to_free_cuda_model_data:
                     break
-                freed_cuda_model_data += colo_tensor_mem_usage(t)[0]
+                freed_cuda_model_data += t.payload_size
                 colo_model_data_tensor_move_inline(t, torch.device('cpu'))
-                volume += t.payload.numel() * t.payload.element_size()
             if freed_cuda_model_data < to_free_cuda_model_data:
                 raise RuntimeError(
                     f"Adjust layout failed! No enough CUDA memory! Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}"
                 )
-        return volume
+        return freed_cuda_model_data


 class TensorPlacementPolicyFactory:
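Note: with the separate `volume` counter removed, `freed_cuda_model_data` serves both as the loop-termination criterion and as the value returned to the caller. On the StatefulTensorMgr side that return value feeds the same counter as the CPU-to-GPU moves, so `_cpu_gpu_move_volume` covers traffic in both directions. A sketch of the caller-side accounting, reconstructed from the context lines above (the remaining arguments to evict_tensors are truncated in the diff and therefore elided):

    # eviction (CUDA -> CPU): the placement policy reports the bytes it moved out
    self._cpu_gpu_move_volume += self._tensor_placement_policy.evict_tensors(
        hold_cuda_tensor_list,
        ...,  # further arguments not shown in this diff
    )

    # promotion (CPU -> CUDA): bytes of the tensors needed for the next compute step
    for t in move_to_cuda_tensor_list:
        colo_model_data_tensor_move_inline(t, get_current_device())
        self._cpu_gpu_move_volume += t.payload_size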