[gemini] update ddp strict mode (#2518)

* [zero] add strict ddp mode for chunk init

* [gemini] update gpt example
This commit is contained in:
HELSON
2023-01-28 14:35:25 +08:00
committed by GitHub
parent 0af793836c
commit 707b11d4a0
16 changed files with 133 additions and 54 deletions

View File

@@ -2,6 +2,7 @@ import math
from typing import Dict, List, Optional, Tuple
import numpy as np
import torch.distributed as dist
import torch.nn as nn
from colossalai.gemini.memory_tracer import MemStats, OrderedParamGenerator
@@ -13,8 +14,14 @@ def _filter_exlarge_params(model: nn.Module, size_dict: Dict[int, List[int]]) ->
"""
Filter out parameters whose size is extremely large (more than 3 standard deviations from the mean size of the others).
"""
params_size = [p.numel() for p in model.parameters() if not is_ddp_ignored(p)]
params_size_arr = np.array(params_size)
agg_size_list = []
for key in size_dict:
agg_size_list.extend(size_dict[key])
if len(agg_size_list) == 0:
return
params_size_arr = np.array(agg_size_list)
std = np.std(params_size_arr)
mean = np.mean(params_size_arr)
@@ -38,7 +45,15 @@ def _get_unused_byte(size_list: List[int], chunk_size: int) -> int:
return left + acc
def classify_params_by_dp_degree(param_order: OrderedParamGenerator) -> Dict[int, List[ColoParameter]]:
def _tensor_numel(local_param: ColoParameter, strict_ddp_flag: bool):
    """Return the element count to use for ``local_param`` when sizing chunks.

    In strict DDP mode every parameter is treated as fully replicated, so the
    global (un-sharded) element count is used; otherwise the count of the
    locally held elements is returned.
    """
    return local_param.numel_global() if strict_ddp_flag else local_param.numel()
def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
strict_ddp_flag: bool = False) -> Dict[int, List[ColoParameter]]:
"""classify_params_by_dp_degree
Classify the parameters by their dp degree
@@ -56,7 +71,10 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator) -> Dict[int
if is_ddp_ignored(param):
continue
param_key = param.process_group.dp_world_size()
if strict_ddp_flag:
param_key = dist.get_world_size()
else:
param_key = param.process_group.dp_world_size()
if param_key not in params_dict:
params_dict[param_key] = []
@@ -71,14 +89,18 @@ def search_chunk_configuration(
search_interval_byte: int, # hidden size is the best value for the interval
min_chunk_size_mb: float = 32,
filter_exlarge_params: bool = True,
memstas: Optional[MemStats] = None) -> Tuple[Dict, int]:
strict_ddp_flag: bool = False,
memstas: Optional[MemStats] = None) -> Tuple[Dict, int, int]:
"""search_chunk_configuration
Args:
model (nn.Module): torch module
search_range_mb (float): searching range in mega byte.
search_interval_byte (int): searching interval in byte.
min_chunk_size_mb (float, optional): the minimum size of a distributed chunk.
filter_exlarge_params (bool, optional): filter extreme large parameters. Defaults to True.
strict_ddp_flag (bool, optional): whether to enable the strict ddp mode.
all parameters keep replicated in this mode.
Returns:
Tuple[Dict, int, int]: chunk config (a dict of dp_degree -> chunk init args), the total parameter size, and the memory chunk waste in bytes.
@@ -96,17 +118,20 @@ def search_chunk_configuration(
min_chunk_size_byte = round(min_chunk_size_mb * 1024**2)
assert search_range_byte >= 0
params_dict = classify_params_by_dp_degree(param_order)
params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
config_dict: Dict[int, Dict] = dict()
total_param_size = 0
size_dict: Dict[int, List[int]] = dict()
for dp_degree in params_dict:
params_list = params_dict[dp_degree]
size_list = [p.numel() for p in params_list]
size_list = [_tensor_numel(p, strict_ddp_flag) for p in params_list]
group_acc_size = sum(size_list)
total_param_size += group_acc_size
# let small parameters keep gathered in CUDA all the time
total_size = sum(size_list)
if total_size < min_chunk_size_byte:
config_dict[dp_degree] = dict(chunk_size=total_size, keep_gathered=True)
if group_acc_size < min_chunk_size_byte:
config_dict[dp_degree] = dict(chunk_size=group_acc_size, keep_gathered=True)
else:
size_dict[dp_degree] = size_list
@@ -134,4 +159,4 @@ def search_chunk_configuration(
continue
config_dict[dp_degree] = dict(chunk_size=best_chunk_size, keep_gathered=False)
return config_dict, min_chunk_waste
return config_dict, total_param_size, min_chunk_waste

View File

@@ -19,38 +19,24 @@ def safe_div(a, b):
def init_chunk_manager(model: nn.Module,
init_device: Optional[torch.device] = None,
hidden_dim: Optional[int] = None,
search_range_mb: Optional[float] = None,
min_chunk_size_mb: Optional[float] = None,
filter_exlarge_params: Optional[bool] = None) -> ChunkManager:
kwargs_dict = dict()
**kwargs) -> ChunkManager:
if hidden_dim:
search_interval_byte = hidden_dim
else:
search_interval_byte = 1024 # 1kb
kwargs_dict["search_interval_byte"] = search_interval_byte
if search_range_mb:
kwargs_dict["search_range_mb"] = search_range_mb
if min_chunk_size_mb:
kwargs_dict["min_chunk_size_mb"] = min_chunk_size_mb
if filter_exlarge_params:
kwargs_dict["filter_exlarge_params"] = filter_exlarge_params
params_sizes = [p.numel() for p in model.parameters() if not is_ddp_ignored(p)]
total_size = sum(params_sizes) / 1024**2
search_interval_byte = 1024 # defaults to 1kb
kwargs["search_interval_byte"] = search_interval_byte
dist.barrier()
begin = time()
config_dict, wasted_size = search_chunk_configuration(model, **kwargs_dict)
config_dict, total_size, wasted_size = search_chunk_configuration(model, **kwargs)
dist.barrier()
end = time()
span_s = end - begin
wasted_size /= 1024**2
mb_size = 1024**2
total_size /= mb_size
wasted_size /= mb_size
if dist.get_rank() == 0:
print("searching chunk configuration is completed in {:.2f} s.\n".format(span_s),

View File

@@ -234,11 +234,14 @@ class ZeroDDP(ColoDDP):
for p in module.parameters():
param_order.append(p)
ddp_pg = ColoProcessGroup()
for p in param_order.generate():
assert isinstance(p, ColoParameter)
if strict_ddp_mode and not p.is_replicate():
p.set_dist_spec(ReplicaSpec())
if strict_ddp_mode:
if not p.is_replicate():
p.set_dist_spec(ReplicaSpec())
p.set_process_group(pg=ddp_pg)
if is_ddp_ignored(p):
p.data = p.data.to(device=get_current_device(), dtype=torch.float16)

View File

@@ -20,7 +20,7 @@ class GeminiDDP(ZeroDDP):
strict_ddp_mode: bool = False,
search_range_mb: int = 32,
hidden_dim: Optional[int] = None,
min_chunk_size_mb: Optional[float] = None,
min_chunk_size_mb: float = 32,
memstats: Optional[MemStats] = None) -> None:
"""
A torch.Module wrapper using ZeRO-DP and Gemini.
@@ -53,6 +53,7 @@ class GeminiDDP(ZeroDDP):
init_device=device,
hidden_dim=hidden_dim,
search_range_mb=search_range_mb,
min_chunk_size_mb=min_chunk_size_mb)
min_chunk_size_mb=min_chunk_size_mb,
strict_ddp_flag=strict_ddp_mode)
gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
super().__init__(module, gemini_manager, pin_memory, force_outputs_fp32, strict_ddp_mode)

View File

@@ -1,3 +1,4 @@
import math
from copy import copy
from functools import lru_cache
from typing import Callable, Optional, Set
@@ -303,6 +304,11 @@ class ColoTensor(torch.Tensor):
else:
return size_list[args[0]]
def numel_global(self):
    """Return the number of elements the tensor has when fully replicated.

    Computed from ``size_global()`` — presumably the tensor's global shape
    across its process group — so the result is independent of how the
    tensor is currently sharded. ``math.prod`` of an empty shape yields 1,
    matching the numel of a 0-d tensor.
    """
    return math.prod(self.size_global())
# Some API for dist spec check
def is_replicate(self):