mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-04 18:40:28 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format
This commit is contained in:
@@ -17,10 +17,9 @@ from .memory_tracer import ChunkMemStatsCollector
|
||||
class PlacementPolicy(ABC):
|
||||
need_mem_stats: bool = False
|
||||
|
||||
def __init__(self,
|
||||
chunk_manager: ChunkManager,
|
||||
mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
|
||||
**kwargs) -> None:
|
||||
def __init__(
|
||||
self, chunk_manager: ChunkManager, mem_stats_collector: Optional[ChunkMemStatsCollector] = None, **kwargs
|
||||
) -> None:
|
||||
self.chunk_manager = chunk_manager
|
||||
self.mem_stats_collector: Optional[ChunkMemStatsCollector] = mem_stats_collector
|
||||
|
||||
@@ -29,23 +28,25 @@ class PlacementPolicy(ABC):
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
|
||||
torch.device]) -> None:
|
||||
def setup_grads_device(
|
||||
self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor, torch.device]
|
||||
) -> None:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class StaticPlacementPolicy(PlacementPolicy):
|
||||
|
||||
def __init__(self,
|
||||
chunk_manager: ChunkManager,
|
||||
mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
|
||||
shard_param_frac: float = 1.0,
|
||||
offload_optim_frac: float = 0.0,
|
||||
offload_param_frac: float = 0.0,
|
||||
**kwargs) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
chunk_manager: ChunkManager,
|
||||
mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
|
||||
shard_param_frac: float = 1.0,
|
||||
offload_optim_frac: float = 0.0,
|
||||
offload_param_frac: float = 0.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
|
||||
if offload_param_frac > 0.0 and (shard_param_frac != 1.0 or offload_optim_frac != 1.0):
|
||||
warnings.warn('offload_param_frac is ignored when shard_param_frac != 1.0 or offload_optim_frac != 1.0')
|
||||
warnings.warn("offload_param_frac is ignored when shard_param_frac != 1.0 or offload_optim_frac != 1.0")
|
||||
offload_param_frac = 0.0
|
||||
self.shard_param_frac = shard_param_frac
|
||||
self.offload_optim_frac = offload_optim_frac
|
||||
@@ -66,13 +67,14 @@ class StaticPlacementPolicy(PlacementPolicy):
|
||||
for chunk in can_evict_chunks:
|
||||
if can_offload_chunk_mem <= self.keep_cuda_chunk_mem:
|
||||
break
|
||||
self.chunk_manager.move_chunk(chunk, torch.device('cpu'))
|
||||
self.chunk_manager.move_chunk(chunk, torch.device("cpu"))
|
||||
# real saved mem is shard_mem, for simplicity we use chunk_mem
|
||||
can_offload_chunk_mem -= chunk.chunk_mem
|
||||
return 0, 0.0
|
||||
|
||||
def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
|
||||
torch.device]) -> None:
|
||||
def setup_grads_device(
|
||||
self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor, torch.device]
|
||||
) -> None:
|
||||
total_chunk_mem = sum(self.chunk_manager.get_chunk(p).chunk_mem for p in params)
|
||||
|
||||
offload_optim_chunk_mem = total_chunk_mem * self.offload_optim_frac
|
||||
@@ -85,7 +87,7 @@ class StaticPlacementPolicy(PlacementPolicy):
|
||||
if chunk.keep_gathered or offloaded_optim_chunk_mem >= offload_optim_chunk_mem:
|
||||
device = get_current_device()
|
||||
else:
|
||||
device = torch.device('cpu')
|
||||
device = torch.device("cpu")
|
||||
# real offloaded mem is chunk.shard_mem, for simplicity we use chunk mem here
|
||||
offloaded_optim_chunk_mem += chunk.chunk_mem
|
||||
for p in params:
|
||||
@@ -97,12 +99,14 @@ class StaticPlacementPolicy(PlacementPolicy):
|
||||
class AutoPlacementPolicy(PlacementPolicy):
|
||||
need_mem_stats: bool = True
|
||||
|
||||
def __init__(self,
|
||||
chunk_manager: ChunkManager,
|
||||
mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
|
||||
warmup_non_model_data_ratio: float = 0.8,
|
||||
steady_cuda_cap_ratio: float = 0.9,
|
||||
**kwargs) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
chunk_manager: ChunkManager,
|
||||
mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
|
||||
warmup_non_model_data_ratio: float = 0.8,
|
||||
steady_cuda_cap_ratio: float = 0.9,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
|
||||
# model data will use 1-_warmup_non_model_data_ratio CUDA memory in warmup phase
|
||||
# you can set them by AutoPlacementPolicy.set_warmup_non_model_data_ratio()
|
||||
@@ -110,13 +114,15 @@ class AutoPlacementPolicy(PlacementPolicy):
|
||||
self._warmup_non_model_data_ratio = warmup_non_model_data_ratio
|
||||
self._steady_cuda_cap_ratio = steady_cuda_cap_ratio
|
||||
|
||||
def evict_tensors(self,
|
||||
can_evict_chunks: List[Chunk],
|
||||
cuda_demand: int = 0,
|
||||
warmup: bool = True,
|
||||
compute_list: Optional[List[Tuple[Chunk, ...]]] = None,
|
||||
compute_idx: int = 0,
|
||||
**kwargs) -> Tuple[int, float]:
|
||||
def evict_tensors(
|
||||
self,
|
||||
can_evict_chunks: List[Chunk],
|
||||
cuda_demand: int = 0,
|
||||
warmup: bool = True,
|
||||
compute_list: Optional[List[Tuple[Chunk, ...]]] = None,
|
||||
compute_idx: int = 0,
|
||||
**kwargs,
|
||||
) -> Tuple[int, float]:
|
||||
"""
|
||||
Evict tensors from CUDA device.
|
||||
|
||||
@@ -135,13 +141,13 @@ class AutoPlacementPolicy(PlacementPolicy):
|
||||
"""
|
||||
start = time()
|
||||
cuda_capacity = colo_device_memory_capacity(get_current_device())
|
||||
used_cuda_model_data = self.chunk_manager.total_mem['cuda']
|
||||
used_cuda_model_data = self.chunk_manager.total_mem["cuda"]
|
||||
if warmup:
|
||||
# We designate a part of CUDA memory for model data in warmup iterations.
|
||||
max_cuda_non_model_data_per_period = cuda_capacity * self._warmup_non_model_data_ratio
|
||||
else:
|
||||
# max non-model-data cuda memory consumption of this sampling moment and the next sampling moment.
|
||||
max_cuda_non_model_data_per_period = self.mem_stats_collector.next_period_non_model_data_usage('cuda')
|
||||
max_cuda_non_model_data_per_period = self.mem_stats_collector.next_period_non_model_data_usage("cuda")
|
||||
cuda_capacity *= self._steady_cuda_cap_ratio
|
||||
total_cuda_model_data = cuda_capacity - max_cuda_non_model_data_per_period
|
||||
avail_cuda_model_data = total_cuda_model_data - used_cuda_model_data
|
||||
@@ -160,11 +166,13 @@ class AutoPlacementPolicy(PlacementPolicy):
|
||||
break
|
||||
|
||||
self.chunk_manager.release_chunk(chunk)
|
||||
self.chunk_manager.move_chunk(chunk, torch.device('cpu'))
|
||||
self.chunk_manager.move_chunk(chunk, torch.device("cpu"))
|
||||
freed_cuda_model_data += chunk.chunk_mem
|
||||
if freed_cuda_model_data < to_free_cuda_model_data:
|
||||
raise RuntimeError(f"Adjust layout failed! No enough CUDA memory! "
|
||||
f"Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}")
|
||||
raise RuntimeError(
|
||||
f"Adjust layout failed! No enough CUDA memory! "
|
||||
f"Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}"
|
||||
)
|
||||
return freed_cuda_model_data, time() - start
|
||||
|
||||
@staticmethod
|
||||
@@ -178,8 +186,9 @@ class AutoPlacementPolicy(PlacementPolicy):
|
||||
next_compute_idx = sorted(next_compute_idx.items(), key=lambda pair: pair[1], reverse=True)
|
||||
return [t for (t, idx) in next_compute_idx]
|
||||
|
||||
def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
|
||||
torch.device]) -> None:
|
||||
def setup_grads_device(
|
||||
self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor, torch.device]
|
||||
) -> None:
|
||||
for p in params:
|
||||
chunk = self.chunk_manager.get_chunk(p)
|
||||
# init offload optim settings
|
||||
@@ -187,13 +196,13 @@ class AutoPlacementPolicy(PlacementPolicy):
|
||||
if chunk.keep_gathered:
|
||||
grads_device_map[p] = get_current_device()
|
||||
else:
|
||||
grads_device_map[p] = torch.device('cpu')
|
||||
grads_device_map[p] = torch.device("cpu")
|
||||
|
||||
|
||||
class PlacementPolicyFactory:
|
||||
policies: Dict[str, Type[PlacementPolicy]] = {
|
||||
'auto': AutoPlacementPolicy,
|
||||
'static': StaticPlacementPolicy,
|
||||
"auto": AutoPlacementPolicy,
|
||||
"static": StaticPlacementPolicy,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
|
Reference in New Issue
Block a user