[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci
2025-09-09 04:50:17 +00:00 · 2023-09-18 16:31:06 +08:00
parent 32e7f99416
commit b5f9e37c70
342 changed files with 2919 additions and 4182 deletions
--- a/colossalai/legacy/amp/naive_amp/_utils.py
+++ b/colossalai/legacy/amp/naive_amp/_utils.py
@@ -0,0 +1,49 @@
+from typing import List
+
+from torch import Tensor
+
+
+def has_inf_or_nan(tensor):
+    """Check if tensor has inf or nan values.
+
+    Args:
+        tensor (:class:`torch.Tensor`): a torch tensor object
+
+    Returns:
+        bool: Whether the tensor has inf or nan. True for yes and False for no.
+    """
+    try:
+        # if tensor is half, the .float() incurs an additional deep copy, but it's necessary if
+        # Pytorch's .sum() creates a one-element tensor of the same type as tensor
+        # (which is true for some recent version of pytorch).
+        tensor_sum = float(tensor.float().sum())
+        # More efficient version that can be used if .sum() returns a Python scalar
+        # tensor_sum = float(tensor.sum())
+    except RuntimeError as instance:
+        # We want to check if inst is actually an overflow exception.
+        # RuntimeError could come from a different error.
+        # If so, we still want the exception to propagate.
+        if "value cannot be converted" not in instance.args[0]:
+            raise
+        return True
+    else:
+        if tensor_sum == float('inf') or tensor_sum == -float('inf') or tensor_sum != tensor_sum:
+            return True
+        return False
+
+
+def zero_gard_by_list(tensor_list: List[Tensor], set_to_none: bool = True) -> None:
+    """Clear the gradient of a list of tensors,
+
+    Note: copied from torch.optim.optimizer.
+    """
+    for param in tensor_list:
+        if param.grad is not None:
+            if set_to_none:
+                param.grad = None
+            else:
+                if param.grad.grad_fn is not None:
+                    param.grad.detach_()
+                else:
+                    param.grad.requires_grad_(False)
+                param.grad.zero_()