mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-02 09:38:05 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format
This commit is contained in:
@@ -19,13 +19,11 @@ class NVMeOptimizer(torch.optim.Optimizer):
|
||||
|
||||
Raises:
|
||||
ImportError: Raise if ``tensornvme`` is not installed.
|
||||
"""
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
params,
|
||||
defaults: dict,
|
||||
nvme_offload_fraction: float = 0.0,
|
||||
offload_dir: Optional[str] = None) -> None:
|
||||
def __init__(
|
||||
self, params, defaults: dict, nvme_offload_fraction: float = 0.0, offload_dir: Optional[str] = None
|
||||
) -> None:
|
||||
assert 0.0 <= nvme_offload_fraction <= 1.0
|
||||
super().__init__(params, defaults)
|
||||
self.nvme_offload_fraction = float(nvme_offload_fraction)
|
||||
@@ -34,9 +32,9 @@ class NVMeOptimizer(torch.optim.Optimizer):
|
||||
from tensornvme import DiskOffloader
|
||||
from tensornvme._C import get_backends
|
||||
except ModuleNotFoundError:
|
||||
raise ModuleNotFoundError('Please install tensornvme to use NVMeOptimizer')
|
||||
raise ModuleNotFoundError("Please install tensornvme to use NVMeOptimizer")
|
||||
self.offload_dir = offload_dir or tempfile.mkdtemp()
|
||||
backend = 'uring' if 'uring' in get_backends() else 'aio'
|
||||
backend = "uring" if "uring" in get_backends() else "aio"
|
||||
self.offloader = DiskOffloader(self.offload_dir, 8, backend=backend)
|
||||
else:
|
||||
self.offload_dir = None
|
||||
@@ -53,13 +51,17 @@ class NVMeOptimizer(torch.optim.Optimizer):
|
||||
def _get_numel(self) -> int:
|
||||
numel = 0
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
for p in group["params"]:
|
||||
numel += p.storage().size()
|
||||
return numel
|
||||
|
||||
def _post_state_init(self, param: Parameter) -> None:
|
||||
numel = param.storage().size()
|
||||
if self.offloader is not None and param.device.type == 'cpu' and numel + self.offloaded_numel <= self.can_offload_numel:
|
||||
if (
|
||||
self.offloader is not None
|
||||
and param.device.type == "cpu"
|
||||
and numel + self.offloaded_numel <= self.can_offload_numel
|
||||
):
|
||||
self.is_on_nvme[param] = True
|
||||
self.offloaded_numel += numel
|
||||
else:
|
||||
@@ -70,11 +72,11 @@ class NVMeOptimizer(torch.optim.Optimizer):
|
||||
return
|
||||
assert len(self.prefetch_params) == 0 and len(self.param_to_prefetch_idx) == 0
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
for p in group["params"]:
|
||||
if p.grad is None:
|
||||
continue
|
||||
if len(self.state[p]) > 0 and self.is_on_nvme[p]:
|
||||
assert p.device.type == 'cpu'
|
||||
assert p.device.type == "cpu"
|
||||
self.param_to_prefetch_idx[p] = len(self.prefetch_params)
|
||||
self.prefetch_params.append(p)
|
||||
|
||||
@@ -156,7 +158,7 @@ class NVMeOptimizer(torch.optim.Optimizer):
|
||||
super().load_state_dict(state_dict)
|
||||
|
||||
def __del__(self) -> None:
|
||||
if getattr(self, 'offloader', None) is not None:
|
||||
if getattr(self, "offloader", None) is not None:
|
||||
del self.offloader
|
||||
if os.path.exists(self.offload_dir):
|
||||
try:
|
||||
|
Reference in New Issue
Block a user