mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-07-01 09:42:35 +00:00
[hotfix] fix chunk size can not be divided (#2867)
* [hotfix] fix chunk size can not be divided
* [hotfix] use numpy for python3.8
This commit is contained in:
parent
a4fc125c34
commit
6e4ac08172
@ -72,6 +72,9 @@ class ChunkManager:
|
|||||||
|
|
||||||
if tensor.numel() > chunk_size:
|
if tensor.numel() > chunk_size:
|
||||||
chunk_size = tensor.numel()
|
chunk_size = tensor.numel()
|
||||||
|
dp_size = tensor.process_group.dp_world_size()
|
||||||
|
chunk_size = chunk_size + (-chunk_size % dp_size)
|
||||||
|
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
chunk_size=chunk_size,
|
chunk_size=chunk_size,
|
||||||
process_group=tensor.process_group,
|
process_group=tensor.process_group,
|
||||||
|
@ -119,6 +119,7 @@ def search_chunk_configuration(
|
|||||||
assert search_range_byte >= 0
|
assert search_range_byte >= 0
|
||||||
|
|
||||||
params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
|
params_dict = classify_params_by_dp_degree(param_order, strict_ddp_flag)
|
||||||
|
size_lcm = np.lcm.reduce(list(params_dict.keys()))
|
||||||
config_dict: Dict[int, Dict] = dict()
|
config_dict: Dict[int, Dict] = dict()
|
||||||
total_param_size = 0
|
total_param_size = 0
|
||||||
|
|
||||||
@ -154,6 +155,8 @@ def search_chunk_configuration(
|
|||||||
min_chunk_waste = temp_waste
|
min_chunk_waste = temp_waste
|
||||||
best_chunk_size = chunk_size
|
best_chunk_size = chunk_size
|
||||||
|
|
||||||
|
# the chunk size must be divisible by every group's size, so round it up to a multiple of their LCM
|
||||||
|
best_chunk_size = best_chunk_size + (-best_chunk_size % size_lcm)
|
||||||
for dp_degree in params_dict:
|
for dp_degree in params_dict:
|
||||||
if dp_degree in config_dict:
|
if dp_degree in config_dict:
|
||||||
continue
|
continue
|
||||||
|
Loading…
Reference in New Issue
Block a user