Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-10 13:30:19 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
@@ -44,8 +44,8 @@ def shuffle_by_round_robin(tensor_list, num_partitions):
     for partition_id in range(partitions_count):
         partition_tensors = partitions[partition_id]
         for item in partition_tensors:
-            tensor_index_mapping[item['index']] = len(new_tensor_list)
-            new_tensor_list.append(item['tensor'])
+            tensor_index_mapping[item["index"]] = len(new_tensor_list)
+            new_tensor_list.append(item["tensor"])
 
     return new_tensor_list, tensor_index_mapping
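The hunk above only normalizes quote style. For readers skimming the diff, here is a minimal sketch of the round-robin reshuffle this function performs, assuming (as the touched lines suggest) that each partition item is a dict carrying a "tensor" and its original "index"; the helper name is ours, not the file's:

from typing import List, Tuple

import torch


def shuffle_by_round_robin_sketch(
    tensor_list: List[torch.Tensor], num_partitions: int
) -> Tuple[List[torch.Tensor], dict]:
    # Deal tensors into partitions like cards: tensor i goes to partition i % num_partitions.
    partitions = [[] for _ in range(num_partitions)]
    for index, tensor in enumerate(tensor_list):
        partitions[index % num_partitions].append({"tensor": tensor, "index": index})

    # Flatten partition by partition, remembering where each original index landed.
    new_tensor_list, tensor_index_mapping = [], {}
    for partition in partitions:
        for item in partition:
            tensor_index_mapping[item["index"]] = len(new_tensor_list)
            new_tensor_list.append(item["tensor"])
    return new_tensor_list, tensor_index_mapping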
@@ -107,11 +107,13 @@ def split_by_dtype(tensor_list):
     return buckets
 
 
-def reduce_tensor_dp_group(tensor: torch.Tensor,
-                           dtype: Optional[torch.dtype] = None,
-                           dst_local_rank: Optional[int] = None,
-                           dst_global_rank: Optional[int] = None,
-                           group: Optional[dist.ProcessGroup] = None):
+def reduce_tensor_dp_group(
+    tensor: torch.Tensor,
+    dtype: Optional[torch.dtype] = None,
+    dst_local_rank: Optional[int] = None,
+    dst_global_rank: Optional[int] = None,
+    group: Optional[dist.ProcessGroup] = None,
+):
     """
     Reduce the tensor in the data parallel process group
 
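This change is purely mechanical: the formatter explodes the signature onto one parameter per line and adds a trailing comma (the "magic trailing comma"), so future parameter additions touch a single line. The function body sits outside the hunk; the sketch below shows what a reduce helper with this signature typically does. The control flow and helper name are assumptions, not the file's code:

from typing import Optional

import torch
import torch.distributed as dist


def reduce_tensor_dp_group_sketch(
    tensor: torch.Tensor,
    dtype: Optional[torch.dtype] = None,
    dst_local_rank: Optional[int] = None,
    dst_global_rank: Optional[int] = None,
    group: Optional[dist.ProcessGroup] = None,
) -> torch.Tensor:
    # Optionally upcast before communicating (e.g. fp16 grads reduced in fp32).
    buf = tensor.to(dtype) if dtype is not None and tensor.dtype != dtype else tensor
    if dst_local_rank is None:
        # No destination given: every rank in the group receives the sum.
        dist.all_reduce(buf, op=dist.ReduceOp.SUM, group=group)
    else:
        # dist.reduce addresses the destination by its global rank.
        dist.reduce(buf, dst=dst_global_rank, op=dist.ReduceOp.SUM, group=group)
    return buf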
@@ -173,7 +175,7 @@ def has_inf_or_nan(tensor):
             raise
         return True
     else:
-        if tensor_sum == float('inf') or tensor_sum == -float('inf') or tensor_sum != tensor_sum:
+        if tensor_sum == float("inf") or tensor_sum == -float("inf") or tensor_sum != tensor_sum:
             return True
         return False
 
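Same quote normalization, but the guarded expression deserves a note: `tensor_sum != tensor_sum` is the classic NaN test (NaN compares unequal to itself), and summing first is a cheap screen, since any inf/nan element makes the float sum inf/nan. A compact sketch of the check:

import torch


def has_inf_or_nan_sketch(tensor: torch.Tensor) -> bool:
    # One float sum screens the whole tensor: inf/nan elements poison the sum.
    s = float(tensor.float().sum())
    # s != s is true only for NaN.
    return s == float("inf") or s == -float("inf") or s != s

On current PyTorch the same intent reads as `bool((~torch.isfinite(tensor)).any())`, though the sum-based screen avoids materializing a boolean mask.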
@@ -184,8 +186,7 @@ def release_param_grad(tensor_list):
 
 
 def calculate_global_norm_from_list(norm_list):
-    """ Compute total from a list of norms
-    """
+    """Compute total from a list of norms"""
     total_norm = 0.0
     for norm in norm_list:
         total_norm += norm**2.0
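Here the docstring collapses to one line. The accumulation itself rests on a norm identity: for the 2-norm, the norm of a concatenation equals the square root of the sum of squared per-chunk norms, which is why per-group norms can be combined without touching the underlying tensors. Assuming the function closes with a square root, as its name suggests, a compact equivalent:

import math


def calculate_global_norm_from_list_sketch(norm_list):
    # ||concat(x_1, ..., x_k)||_2 == sqrt(sum_i ||x_i||_2 ** 2)
    return math.sqrt(sum(norm**2.0 for norm in norm_list))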
@@ -221,7 +222,7 @@ def compute_norm(gradients: Tensor, dp_group: ProcessGroup, tp_group: ProcessGro
     total_norm = 0.0
     for g in gradients:
         param_norm = g.data.double().norm(2)
-        total_norm += param_norm.item()**2
+        total_norm += param_norm.item() ** 2
 
     # Sum across all model parallel GPUs.
     total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
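Note the asymmetry with `norm**2.0`, left untouched in the previous hunk: the formatter (Black) hugs `**` when both operands are simple names or literals, but inserts spaces once an operand involves a call such as `param_norm.item()`. An illustration, not file code:

total_norm += norm**2.0               # both operands simple: no spaces around **
total_norm += param_norm.item() ** 2  # call operand: spaces added around **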
@@ -230,9 +231,9 @@ def compute_norm(gradients: Tensor, dp_group: ProcessGroup, tp_group: ProcessGro
     if tp_group is not None:
         dist.all_reduce(tensor=total_norm_cuda, op=torch.distributed.ReduceOp.SUM, group=tp_group)
 
-    total_norm = total_norm_cuda[0].item()**(1. / norm_type)
+    total_norm = total_norm_cuda[0].item() ** (1.0 / norm_type)
 
-    if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm:
+    if total_norm == float("inf") or total_norm == -float("inf") or total_norm != total_norm:
         total_norm = -1
 
     return total_norm
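Putting the last two hunks in context: the function accumulates a local sum of squared gradient norms, all-reduces the partial sums across the data-parallel and tensor-parallel groups, then takes the 1/norm_type root, returning -1 as an overflow sentinel. A condensed sketch of that pattern (the surrounding code is not in the diff, so details here are assumptions; `torch.tensor(..., device="cuda")` stands in for the legacy `torch.cuda.FloatTensor`):

import torch
import torch.distributed as dist


def compute_norm_sketch(gradients, dp_group=None, tp_group=None, norm_type=2.0):
    # Local partial: sum of ||g||^norm_type, accumulated in float64 for stability.
    total = sum(g.data.double().norm(norm_type).item() ** norm_type for g in gradients)
    total_cuda = torch.tensor([total], dtype=torch.float32, device="cuda")

    # Combine partial sums across parallel groups; SUM is correct because the
    # root is taken only after all partials are in.
    dist.all_reduce(total_cuda, op=dist.ReduceOp.SUM, group=dp_group)
    if tp_group is not None:
        dist.all_reduce(total_cuda, op=dist.ReduceOp.SUM, group=tp_group)

    total_norm = total_cuda[0].item() ** (1.0 / norm_type)
    # inf/nan (total_norm != total_norm) collapses to the -1 overflow sentinel.
    if total_norm == float("inf") or total_norm == -float("inf") or total_norm != total_norm:
        total_norm = -1
    return total_norm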