Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-06-19 12:12:46 +00:00)
polished output format for communication profiler and pcie profiler (#404)
fixed typing error
This commit is contained in: parent aaead33cfe, commit dfd0363f68
@@ -6,20 +6,25 @@ from torch.autograd.profiler import profile
 import torch.distributed as dist
 from torch.distributed import ReduceOp
 from colossalai.utils import get_current_device
-from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwith
+from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List, Optional


 def _get_code_location(depth: int):
-    ret = ""
-    length = len(inspect.stack())
-    for i in range(3, min(length, depth + 1)):
+    ret = []
+    length = min(len(inspect.stack()), depth + 1)
+    for i in range(3, length):
         upper_frame = inspect.stack()[i]
         function_name = inspect.stack()[i - 1].function
-        info = upper_frame.filename + "(" + str(upper_frame.lineno) + "): " + function_name + "\n"
-        ret += info
+        ret.append(upper_frame.filename)
+        ret.append('(')
+        ret.append(str(upper_frame.lineno))
+        ret.append('): ')
+        ret.append(function_name)
+        if i != length - 1:
+            ret.append('\n')

-    return ret
+    return ''.join(ret)


 torch_all_reduce = dist.all_reduce
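For illustration only, a minimal standalone sketch of the list-and-join pattern that the reworked _get_code_location relies on; the helper name below is hypothetical and not part of the diff, and joining with '\n' reproduces the no-trailing-newline behaviour of the new `if i != length - 1` check:

    import inspect

    def sketch_code_location(depth: int = 2) -> str:
        # Collect fragments in a list and join once at the end,
        # instead of growing a string with repeated '+=' concatenation.
        parts = []
        stack = inspect.stack()
        length = min(len(stack), depth + 1)
        for i in range(3, length):
            frame = stack[i]
            caller = stack[i - 1].function
            parts.append("{}({}): {}".format(frame.filename, frame.lineno, caller))
        return '\n'.join(parts)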
@@ -100,7 +105,8 @@ class CommProfiler(BaseProfiler):
     def result_list(self, sep: str = "\n"):
         res = []

-        def append(s: str):
-            res.append(s)
+        def append(s: str = None):
+            if s is not None:
+                res.append(s)
             res.append(sep)

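A minimal sketch of the append helper pattern introduced here (illustrative, not part of the diff): with the new default of None, calling append() with no argument emits only the separator, which is how a bare blank line is produced between records later in result_list.

    def make_result(sep: str = "\n"):
        res = []

        def append(s: str = None):
            # Append the payload only when one is given; always append the separator.
            if s is not None:
                res.append(s)
            res.append(sep)

        append("header")
        append()            # separator only -> blank line in the final string
        append("row")
        return ''.join(res)

    # make_result() == "header\n\nrow\n"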
@@ -110,19 +116,26 @@ class CommProfiler(BaseProfiler):

         append("Collective communication profiling result:")
         append("total cuda time: {}".format(_format_time(self.total_cuda_time)))
-        append("average bandwith: {}".format(_format_bandwith(self.total_comm_vol, self.total_cuda_time)))
+        append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time)))
         append("total number of calls: {}".format(self.total_count))
-        append("All events:\n----------------------------------------")
+        append("All events:")
+
+        seperation = '-' * 74
+        row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2
+
+        append(seperation)
+        append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls'))
+        append(seperation)

         show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time)
         for location, event in show_list:
             append(location)
-            append("self cuda time: {}".format(_format_time(event.self_cuda_time)))
-            append("{:.1f}% of total communication time".format(event.self_cuda_time / self.total_cuda_time * 100.0))
-            append("self communication volme: {}".format(_format_memory(event.self_comm_vol)))
-            append("average bandwith: {}".format(_format_bandwith(event.self_comm_vol, event.self_cuda_time)))
-            append("number of calls: {}".format(event.self_count))
-            append("----------------------------------------")
+            append(
+                row_format.format('', _format_time(event.self_cuda_time),
+                                  '{:.1f}%'.format(event.self_cuda_time / self.total_cuda_time * 100.0),
+                                  _format_memory(event.self_comm_vol),
+                                  _format_bandwidth(event.self_comm_vol, event.self_cuda_time), event.self_count))
+            append()

         return ''.join(res)

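To show what the new fixed-width layout looks like, here is a small illustrative snippet using the same row_format; the values are invented placeholders, not real profiler output (a real report also prints the code location on its own line above each row):

    row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2
    seperation = '-' * 74

    print(seperation)
    print(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls'))
    print(seperation)
    # One hypothetical all-reduce record:
    print(row_format.format('', '1.25 ms', '42.0%', '64.0 MB', '6.25 GB/s', 8))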
@@ -1,6 +1,6 @@
 from pathlib import Path
 from torch.autograd.profiler import profile
-from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwith
+from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
 from typing import List


@@ -24,6 +24,7 @@ def _reduce_location(locations: List[str]) -> str:
     for lo in locations:
         ret.append(lo)
         ret.append("\n")
+    ret = ret[:-1]
     return ''.join(ret)


@@ -48,18 +49,23 @@ class PcieProfiler(BaseProfiler):
     TODO: Merge pcie profiler into communication profiler
     """

-    def __init__(self,
-                 dtype: str = "fp32",
-                 depth: int = 1,
-                 total_count: int = 0,
-                 total_pcie_vol: int = 0,
-                 total_cuda_time: int = 0):
+    def __init__(self, dtype: str = "fp32", depth: int = 1):
         super().__init__(profiler_name="Pcie", priority=10)
         self.depth = depth
         self.data_size = _get_size(dtype)
-        self.total_count = total_count
-        self.total_pcie_vol = total_pcie_vol
-        self.total_cuda_time = total_cuda_time
+        self.h2d_count = 0
+        self.h2d_time = 0
+        self.d2h_count = 0
+        self.d2h_time = 0
+
+        self.ops_record = dict()
+        self.profiler = None
+
+    def reset(self):
+        self.h2d_count = 0
+        self.h2d_time = 0
+        self.d2h_count = 0
+        self.d2h_time = 0

         self.ops_record = dict()
         self.profiler = None
@@ -81,17 +87,20 @@ class PcieProfiler(BaseProfiler):
         for event in events:
             if event.name == "aten::copy_":
                 t_shape = event.input_shapes[0]
-                if len(t_shape) == 0 or event.cuda_time_total == 0:
+                if len(t_shape) == 0 or event.cuda_time_total == 0 or len(event.stack) == 0:
                     continue
                 current_comm_event = PcieEvent(1, self.data_size * _get_numel(t_shape), event.cuda_time_total)
-                self.total_count += current_comm_event.count
-                self.total_pcie_vol += current_comm_event.pcie_vol
-                self.total_cuda_time += current_comm_event.cuda_time
                 code_location = _reduce_location(event.stack[:self.depth])
                 if code_location in self.ops_record:
                     self.ops_record[code_location].add(current_comm_event)
                 else:
                     self.ops_record[code_location] = current_comm_event
+            elif 'Memcpy HtoD' in event.name:
+                self.h2d_count += 1
+                self.h2d_time += event.cuda_time_total
+            elif 'Memcpy DtoH' in event.name:
+                self.d2h_count += 1
+                self.d2h_time += event.cuda_time_total

         self.profiler = None

@@ -108,24 +117,32 @@ class PcieProfiler(BaseProfiler):
     def result_list(self, sep: str = "\n"):
         res = []

-        def append(s: str):
-            res.append(s)
+        def append(s: str = None):
+            if s is not None:
+                res.append(s)
             res.append(sep)

         append("Pcie profiling result:")
-        append("total cuda time: {}".format(_format_time(self.total_cuda_time)))
-        append("average bandwith: {}".format(_format_bandwith(self.total_pcie_vol, self.total_cuda_time)))
-        append("total number of calls: {}".format(self.total_count))
-        append("All events:\n----------------------------------------")
+        append("time of data transmission (CPU -> GPU): {}".format(_format_time(self.h2d_time)))
+        append("number of transmission (CPU -> GPU): {}".format(self.h2d_count))
+        append("time of data transmission (GPU -> CPU): {}".format(_format_time(self.d2h_time)))
+        append("number of transmission (GPU -> CPU): {}".format(self.d2h_count))
+
+        append("Possible data transmission events in PCIE:")
+
+        seperation = '-' * 62
+        row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2
+
+        append(seperation)
+        append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls'))
+        append(seperation)

         show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time)
         for location, event in show_list:
             append(location)
-            append("cuda time: {}".format(_format_time(event.cuda_time)))
-            append("{:.1f}% of total pcie time".format(event.cuda_time / self.total_cuda_time * 100.0))
-            append("pcie volme: {}".format(_format_memory(event.pcie_vol)))
-            append("average bandwith: {}".format(_format_bandwith(event.pcie_vol, event.cuda_time)))
-            append("number of calls: {}".format(event.count))
-            append("----------------------------------------")
+            append(
+                row_format.format('', _format_time(event.cuda_time), _format_memory(event.pcie_vol),
+                                  _format_bandwidth(event.pcie_vol, event.cuda_time), event.count))
+            append()

         return ''.join(res)

@@ -32,7 +32,7 @@ def _format_memory(nbytes):
        return str(nbytes) + ' B'


-def _format_bandwith(volme: float or int, time_us: int):
+def _format_bandwidth(volme: float or int, time_us: int):
    sec_div_mb = (1000.0 / 1024.0)**2
    mb_per_sec = volme / time_us * sec_div_mb

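As a sanity check on the conversion factor used here (assuming volume is in bytes and time in microseconds, as elsewhere in the profiler): bytes per microsecond equals 10^6 bytes per second, and dividing by 1024^2 turns that into MB/s, giving the combined factor (1000.0 / 1024.0)**2. A small illustrative example:

    def bandwidth_mb_per_sec(volume_bytes: float, time_us: float) -> float:
        # bytes/us -> MB/s: multiply by 1e6 (us -> s) and divide by 1024**2 (B -> MB)
        sec_div_mb = (1000.0 / 1024.0)**2
        return volume_bytes / time_us * sec_div_mb

    # 1 GiB transferred in 250,000 us (0.25 s) -> exactly 4096.0 MB/s
    print(bandwidth_mb_per_sec(1024 ** 3, 250_000))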