mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-16 14:41:53 +00:00
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779)
* fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc.
This commit is contained in:
@@ -46,7 +46,7 @@ detector.detect()
|
||||
|
||||
I have made some comments on the right of the output for your understanding.
|
||||
|
||||
Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memery Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.
|
||||
Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memory Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.
|
||||
|
||||
**The order in which tensors are printed is not exactly the order in which they were created, but the two are very close.**
|
||||
|
||||
@@ -61,7 +61,7 @@ Note that the total `Mem` of all the tensors and parameters is not equal to `Tot
|
||||
+ mlp.2.bias cuda:0 (32,) True torch.float32 128 B
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
Detect Location: "test_tensor_detector.py" line 27
|
||||
Totle GPU Memery Allocated on cuda:0 is 4.5 KB
|
||||
Total GPU Memory Allocated on cuda:0 is 4.5 KB
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ Totle GPU Memery Allocated on cuda:0 is 4.5 KB
|
||||
+ Tensor cuda:0 (32,) True torch.float32 128 B # output
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
Detect Location: "test_tensor_detector.py" line 30
|
||||
Totle GPU Memery Allocated on cuda:0 is 5.5 KB
|
||||
Total GPU Memory Allocated on cuda:0 is 5.5 KB
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ Totle GPU Memery Allocated on cuda:0 is 5.5 KB
|
||||
+ Tensor cuda:0 () True torch.float32 4 B # loss
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
Detect Location: "test_tensor_detector.py" line 32
|
||||
Totle GPU Memery Allocated on cuda:0 is 6.0 KB
|
||||
Total GPU Memory Allocated on cuda:0 is 6.0 KB
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@ Totle GPU Memery Allocated on cuda:0 is 6.0 KB
|
||||
- Tensor cuda:0 (8,) True torch.float32 32 B # deleted activation
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
Detect Location: "test_tensor_detector.py" line 34
|
||||
Totle GPU Memery Allocated on cuda:0 is 10.0 KB
|
||||
Total GPU Memory Allocated on cuda:0 is 10.0 KB
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -117,7 +117,7 @@ Totle GPU Memery Allocated on cuda:0 is 10.0 KB
|
||||
+ Tensor cuda:0 (32,) False torch.float32 128 B
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
Detect Location: "test_tensor_detector.py" line 36
|
||||
Totle GPU Memery Allocated on cuda:0 is 14.0 KB
|
||||
Total GPU Memory Allocated on cuda:0 is 14.0 KB
|
||||
------------------------------------------------------------------------------------------------------------
|
||||
```
|
||||
|
||||
|
@@ -55,7 +55,7 @@ class TensorDetector():
|
||||
return self.mem_format(memory_size)
|
||||
|
||||
def mem_format(self, real_memory_size):
|
||||
# format the tensor memory into a reasonal magnitude
|
||||
# format the tensor memory into a reasonable magnitude
|
||||
if real_memory_size >= 2**30:
|
||||
return str(real_memory_size / (2**30)) + ' GB'
|
||||
if real_memory_size >= 2**20:
|
||||
@@ -71,7 +71,7 @@ class TensorDetector():
|
||||
if (not self.include_cpu) and obj.device == torch.device('cpu'):
|
||||
continue
|
||||
self.detected.append(id(obj))
|
||||
# skip paramters we had added in __init__ when module is an instance of nn.Module for the first epoch
|
||||
# skip parameters we had added in __init__ when module is an instance of nn.Module for the first epoch
|
||||
if id(obj) not in self.tensor_info:
|
||||
|
||||
name = type(obj).__name__
|
||||
@@ -84,7 +84,7 @@ class TensorDetector():
|
||||
name = par_name + ' (with grad)'
|
||||
else:
|
||||
# with no grad attached
|
||||
# there will be no new paramters created during running
|
||||
# there will be no new parameters created during running
|
||||
# so it must be in saved_tensor_info
|
||||
continue
|
||||
# we can also mark common tensors as tensor(with grad)
|
||||
@@ -155,7 +155,7 @@ class TensorDetector():
|
||||
if device == torch.device('cpu'):
|
||||
continue
|
||||
gpu_mem_alloc = self.mem_format(torch.cuda.memory_allocated(device))
|
||||
self.info += f"Totle GPU Memery Allocated on {device} is {gpu_mem_alloc}\n"
|
||||
self.info += f"Total GPU Memory Allocated on {device} is {gpu_mem_alloc}\n"
|
||||
self.info += LINE
|
||||
self.info += '\n\n'
|
||||
if self.show_info:
|
||||
|
Reference in New Issue
Block a user