mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-02 01:28:31 +00:00
[hotfix] Fix the bug where process groups were not being properly released. (#4940)
* Fix the bug where process groups were not being properly released.
* test
* Revert "test"
This reverts commit 479900c139.
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import gc
|
||||
import itertools
|
||||
from functools import reduce
|
||||
from operator import mul
|
||||
@@ -44,6 +45,24 @@ class ProcessGroupMesh:
|
||||
self._ranks_to_group: Dict[Tuple[int, ...], ProcessGroup] = {}
|
||||
self._group_to_ranks: Dict[ProcessGroup, Tuple[int, ...]] = {}
|
||||
|
||||
def __del__(self):
    r"""
    Destructor method for the ProcessGroupMesh class.

    When the ProcessGroupMesh object is deleted or goes out of scope, this method is called. It is responsible for
    cleaning up any process groups that were created during the lifetime of the object.

    Cleanup is best-effort: a destructor must never raise, so a group that can no longer be destroyed is skipped
    and the remaining groups are still released. Afterwards both registries (``_ranks_to_group`` and
    ``_group_to_ranks``) are emptied so no stale group handle stays referenced by this object.

    Note:
        All process groups in PyTorch are represented as global variables, and they may not be automatically destroyed
        when the ProcessGroupMesh's lifetime ends. This method manually destroys the process groups to release
        system resources.
    """
    # ``__init__`` may have raised before the registries were assigned; in that
    # case there is nothing to release and attribute access must not fail here.
    ranks_to_group = getattr(self, "_ranks_to_group", None)
    group_to_ranks = getattr(self, "_group_to_ranks", None)

    # When the distributed package is unavailable or already de-initialised
    # there is no live group left to destroy, so the destroy calls are skipped.
    if ranks_to_group and dist.is_available() and dist.is_initialized():
        # Iterate over a snapshot so the registry can be cleared safely afterwards.
        for group in list(ranks_to_group.values()):
            try:
                dist.destroy_process_group(group)
            except (AssertionError, RuntimeError, ValueError):
                # The group was already destroyed or is no longer registered
                # (the exception type differs between PyTorch releases).
                # Keep going so one bad handle does not leak all the others.
                continue

    # Drop the now-invalid handles held by this object.
    if ranks_to_group is not None:
        ranks_to_group.clear()
    if group_to_ranks is not None:
        group_to_ranks.clear()

    # Manually clear all process groups to save memory
    gc.collect()
|
||||
|
||||
@property
def shape(self) -> Tuple[int, ...]:
    """Return the value stored in ``self._shape`` (the shape of this mesh)."""
    mesh_shape = self._shape
    return mesh_shape
|
||||
|
Reference in New Issue
Block a user