mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-09 04:50:17 +00:00
[legacy] clean up legacy code (#4743)
* [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci
This commit is contained in:
49
colossalai/legacy/amp/naive_amp/_utils.py
Normal file
49
colossalai/legacy/amp/naive_amp/_utils.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from typing import List
|
||||
|
||||
from torch import Tensor
|
||||
|
||||
|
||||
def has_inf_or_nan(tensor):
    """Check if tensor has inf or nan values.

    The fp32 sum of the tensor is used as a cheap first screen: any inf or
    nan element forces the sum to be non-finite, so a finite sum proves that
    every element is finite. A non-finite sum, however, may also be produced
    by the overflow of perfectly finite elements, so in that case the
    elements themselves are inspected before reporting a problem.

    Args:
        tensor (:class:`torch.Tensor`): a torch tensor object

    Returns:
        bool: Whether the tensor has inf or nan. True for yes and False for no.

    Raises:
        RuntimeError: any runtime error raised while summing that is not the
            "value cannot be converted" overflow error.
    """
    try:
        # If tensor is half, .float() incurs an additional deep copy, but it is
        # needed so the reduction is accumulated and returned in fp32.
        tensor_sum = float(tensor.float().sum())
    except RuntimeError as instance:
        # Only the overflow-on-conversion error means "tensor overflowed";
        # a RuntimeError coming from anything else must still propagate.
        # Guard against empty / non-string args so the check itself cannot
        # mask the original error with an IndexError or TypeError.
        if not instance.args or "value cannot be converted" not in str(instance.args[0]):
            raise
        return True

    sum_is_finite = (
        tensor_sum == tensor_sum  # nan is the only value not equal to itself
        and tensor_sum != float("inf")
        and tensor_sum != -float("inf")
    )
    if sum_is_finite:
        return False

    # The sum is inf/nan: distinguish a genuine inf/nan element from an
    # overflow of the accumulated sum over finite elements.
    return not bool(tensor.isfinite().all())
|
||||
|
||||
|
||||
def zero_gard_by_list(tensor_list: List[Tensor], set_to_none: bool = True) -> None:
    """Reset the gradient of every tensor in a list.

    Note: copied from torch.optim.optimizer.

    Args:
        tensor_list (List[Tensor]): tensors whose ``.grad`` should be cleared.
            Tensors that currently have no gradient are skipped.
        set_to_none (bool): if True, the gradient is dropped by assigning
            ``None``; otherwise it is cut from any autograd graph and
            zeroed in place.
    """
    for tensor in tensor_list:
        grad = tensor.grad
        if grad is None:
            # Nothing to clear for this tensor.
            continue

        if set_to_none:
            tensor.grad = None
            continue

        # Make sure the in-place zeroing below is not tracked by autograd.
        if grad.grad_fn is None:
            grad.requires_grad_(False)
        else:
            grad.detach_()
        grad.zero_()
|
Reference in New Issue
Block a user