mirror of https://github.com/hpcaitech/ColossalAI.git
[hotfix] fix some bugs caused by size mismatch. (#1011)
* [CLI] add CLI launcher
* Revert "[CLI] add CLI launcher"
This reverts commit df7e6506d4.
* [hotfix] fix some bugs caused by size mismatch.
* add warning logs
* polish
commit d182b0bd47 (parent 9833d814d5)
@@ -23,6 +23,8 @@ def run_benchmark(args: Config) -> None:
     if args.gpus is None:
         click.echo("Error: --num_gpus is not given")
         exit()
+    if args.gpus <= 1:
+        click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
 
     click.echo("=== Benchmarking Parameters ===")
     for k, v in args.items():
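A usage note on this hunk: a missing --gpus value still aborts, while a single device now only triggers a warning and the benchmark continues. A minimal standalone sketch of that behavior, assuming an args-like object exposing a gpus attribute (illustrative only, not the actual CLI code):

import click


def check_gpus(gpus):
    # Mirrors the validation added above: a missing value is an error,
    # while a single device only warns and lets execution continue.
    if gpus is None:
        click.echo("Error: --num_gpus is not given")
        raise SystemExit()
    if gpus <= 1:
        click.echo("Warning: tensor parallel will be activated with at least 2 devices.")


check_gpus(1)   # prints the warning, then returns normally
check_gpus(4)   # passes silently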
@@ -63,6 +65,13 @@ def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_
         colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
         timer = MultiTimer()
 
+        # 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
+        if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
+            click.echo(
+                "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
+            )
+            continue
+
         if hyperparams.model == 'mlp':
             model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
         else:
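The divisibility check in this hunk reflects how 1D tensor parallelism shards a linear layer: every tensor-parallel rank must receive an equally sized slice of the in_features/out_features dimension, so the dimension has to divide evenly by the parallel size. A hedged standalone sketch of that constraint (the helper name is made up for illustration):

def can_shard_1d(dimension: int, size: int) -> bool:
    # Each of the `size` ranks gets dimension // size features;
    # an uneven split is what forces the benchmark to skip the 1D config.
    return dimension % size == 0


print(can_shard_1d(1024, 4))   # True  -> 256 features per rank
print(can_shard_1d(1000, 3))   # False -> this configuration would be skipped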
@@ -48,9 +48,15 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
     """
 
     def _is_square(num):
+        # 2D parallel should be implemented with at least 2 devices.
+        if num <= 1:
+            return False
         return math.floor(math.sqrt(num))**2 == num
 
     def _is_cube(num):
+        # 3D parallel should be implemented with at least 2 devices.
+        if num <= 1:
+            return False
        return math.floor(num**(1. / 3.))**3 == num
 
     config_list = []
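The new guards matter because 1 is, mathematically, both a perfect square and a perfect cube, so a single device could previously be matched to 2D/3D layouts. A quick standalone check of the patched helpers (copied here purely for illustration):

import math


def _is_square(num):
    # 2D parallel should be implemented with at least 2 devices.
    if num <= 1:
        return False
    return math.floor(math.sqrt(num))**2 == num


def _is_cube(num):
    # 3D parallel should be implemented with at least 2 devices.
    if num <= 1:
        return False
    return math.floor(num**(1. / 3.))**3 == num


print(_is_square(1), _is_cube(1))   # False False (both were True before the guard)
print(_is_square(4), _is_cube(8))   # True True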
@@ -63,7 +69,7 @@ def find_all_configs(device_cnt: int) -> List[Dict]:
     config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
     config_list.append(config)
 
-    # add 1D config only if device_cnt is a square
+    # add 2D config only if device_cnt is a square
     if _is_square(device_cnt):
         config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
         config_list.append(config)
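Taken together with this last hunk, a 1D config is always appended in the shown context, while the 2D config is added only when the device count is a perfect square greater than 1. A simplified sketch of that selection logic (the real find_all_configs likely covers further modes; this is an assumption-laden illustration, not the actual function):

import math


def sketch_find_configs(device_cnt):
    # Simplified reimplementation for illustration only.
    def _is_square(num):
        if num <= 1:
            return False
        return math.floor(math.sqrt(num))**2 == num

    configs = [dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))]
    if _is_square(device_cnt):
        configs.append(dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d'))))
    return configs


print(len(sketch_find_configs(1)))   # 1 -> only the 1d config
print(len(sketch_find_configs(4)))   # 2 -> 1d and 2d configs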