mirror of https://github.com/hpcaitech/ColossalAI.git
synced 2025-08-09 11:58:06 +00:00

[example] update gpt readme with performance (#2206)

This commit is contained in:
parent 1cb532ffec
commit 29868a9ec1
@@ -53,3 +53,36 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the plan
 - ZeRO2 (Colossal-AI)
 - Pytorch DDP
 - Pytorch ZeRO
+
+## Performance
+
+Testbed: a cluster of 8xA100 (80GB) and 1xAMD EPYC 7543 32-Core Processor (512 GB). GPUs are connected via PCI-e.
+
+ColossalAI version 0.1.13.
+
+How does the batch size affect efficiency?
+
+| model    | #GPU | policy | TP | batch | TFLOPS  |
+| -------- | ---- | ------ | -- | ----- | ------- |
+| gpt2_10b | 2    | cpu    | 1  | 32    | 122.046 |
+| gpt2_10b | 2    | cpu    | 1  | 16    | 82.649  |
+| gpt2_10b | 2    | cpu    | 1  | 8     | 61.354  |
+
+How does the placement policy affect efficiency?
+
+| model    | #GPU | policy | TP | batch | TFLOPS |
+| -------- | ---- | ------ | -- | ----- | ------ |
+| gpt2_10b | 4    | auto   | 1  | 8     | 88.657 |
+| gpt2_10b | 4    | cuda   | 1  | 8     | OOM    |
+| gpt2_10b | 4    | cpu    | 1  | 8     | 61.354 |
+| gpt2_10b | 4    | const  | 1  | 8     | 82.137 |
+
+How does the tensor parallel degree affect efficiency?
+
+| model    | #GPU | policy | TP | batch | TFLOPS |
+| -------- | ---- | ------ | -- | ----- | ------ |
+| gpt2_10b | 4    | auto   | 1  | 8     | 88.657 |
+| gpt2_10b | 4    | auto   | 2  | 8     | 56.687 |
+| gpt2_10b | 4    | auto   | 4  | 8     | 29.019 |
+| gpt2_10b | 4    | auto   | 4  | 64    | 50.411 |
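The TFLOPS column is computed from measured step time. As a point of reference, here is a minimal sketch of the standard estimate, assuming the common 8 * parameters * tokens FLOP count for training with activation checkpointing (forward, backward, plus one recompute pass); the exact formula used by `train_gpt_demo.py` may differ, and the step time below is illustrative, not a repo value.

    # Back-of-the-envelope TFLOPS estimate; the 8x factor and the 10.7 s
    # step time are illustrative assumptions, not values from the repo.
    def estimate_tflops(model_numel: float, batch_size: int, seq_len: int,
                        step_time_s: float) -> float:
        # forward (2x) + backward (4x) + checkpoint recompute (2x) per token
        flops_per_step = 8 * model_numel * batch_size * seq_len
        return flops_per_step / step_time_s / 1e12

    # e.g. a 10B-parameter GPT-2, batch 8, sequence length 1024:
    print(estimate_tflops(10e9, 8, 1024, 10.7))  # ~61 TFLOPS, near the cpu row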
@@ -2,9 +2,9 @@
 export DISTPAN="colossalai"
 
 # The following options are only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=4
-export PLACEMENT='cpu'
+export PLACEMENT='auto'
 export USE_SHARD_INIT=False
 
 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
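Each row of the placement table above corresponds to one such launch with a different placement value. A hypothetical Python driver for that sweep (the loop, log-file names, and fixed flags are illustrative, not part of the repo):

    # Illustrative sweep over placement policies; mirrors the torchrun
    # command above. Not a script from the repository.
    import os
    import subprocess

    for placement in ("cpu", "cuda", "auto", "const"):
        cmd = [
            "torchrun", "--standalone", "--nproc_per_node=4",
            "train_gpt_demo.py",
            "--tp_degree=1",
            "--placement", placement,
            "--shardinit", "False",
            "--distplan", "colossalai",
        ]
        with open(f"run_{placement}.log", "w") as log:
            subprocess.run(cmd,
                           env={**os.environ, "OMP_NUM_THREADS": "16"},
                           stdout=log, stderr=subprocess.STDOUT)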
@@ -179,13 +179,17 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
+    from colossalai.gemini import ChunkManager, GeminiManager
     if version.parse(cai_version) > version.parse("0.1.10"):
         from colossalai.nn.parallel import GeminiDDP
         model = GeminiDDP(model,
                           device=get_current_device(),
                           placement_policy=placememt_policy,
                           pin_memory=True,
-                          search_range_mb=32)
+                          hidden_dim=4096,
+                          search_range_mb=64)
+        if placememt_policy == 'const':
+            model.gemini_manager._placement_policy.set_const_memory_boundary(10 * 1024)
     elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
         from colossalai.gemini import ChunkManager, GeminiManager
         chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)
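Pulling the new post-0.1.10 branch out of the diff for readability: a self-contained sketch, assuming ColossalAI 0.1.13 (the version benchmarked above) and using only the GeminiDDP arguments that appear in this diff.

    # Minimal sketch of the >0.1.10 code path above, assuming ColossalAI 0.1.13.
    import torch
    from colossalai.nn.parallel import GeminiDDP
    from colossalai.utils import get_current_device

    def wrap_with_gemini(model: torch.nn.Module, placement_policy: str = "auto"):
        model = GeminiDDP(model,
                          device=get_current_device(),
                          placement_policy=placement_policy,
                          pin_memory=True,
                          hidden_dim=4096,       # hint for chunk-size search
                          search_range_mb=64)    # search window for chunk sizes, in MB
        if placement_policy == 'const':
            # 'const' pins a fixed CUDA memory budget; 10 GiB here, as in the diff
            model.gemini_manager._placement_policy.set_const_memory_boundary(10 * 1024)
        return model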
@@ -206,9 +210,10 @@ def main():
     if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
         raise TypeError(f"{args.distplan} is error")
 
-    BATCH_SIZE = 8
+    BATCH_SIZE = 64
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
 
     NUM_STEPS = 10
 
     disable_existing_loggers()
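For scale: assuming BATCH_SIZE is per process, as is typical for this demo, the new default feeds eight times as many tokens per step as the old one; a one-line check:

    BATCH_SIZE, SEQ_LEN = 64, 1024
    print(BATCH_SIZE * SEQ_LEN)  # 65536 tokens per process per step (vs 8192 at batch 8)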
@@ -227,22 +232,21 @@ def main():
         default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
 
         # build GPT model
-        with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
-            model = gpt2_medium(checkpoint=True)
+        with ColoInitContext(device=get_current_device(), default_dist_spec=default_dist_spec, default_pg=default_pg):
+            model = gpt2_10b(checkpoint=True)
 
         pg = default_pg
         # Tensor Parallelism (TP)
         tensor_parallelize(model, pg)
 
         # Gemini + ZeRO DP, Note it must be used after TP
         model = gemini_zero_dpp(model, pg, args.placement)
 
-        # build optimizer
+        # build a highly optimized CPU optimizer
         optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
-        # optimizer = HybridAdam(model.parameters(), lr=1e-3)
-        # optimizer = ZeroOptimizer(optimizer, model, initial_scale=2**5)
 
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
-        model = gpt2_medium(checkpoint=True).cuda()
+        model = gpt2_10b(checkpoint=True).cuda()
 
     if args.distplan.startswith("torch"):
         model = DDP(model)
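Downstream of this diff, the wrapped model and GeminiAdamOptimizer drive an ordinary training loop, except that the loss is scaled through the optimizer rather than via loss.backward(). A sketch of one step, assuming the demo's criterion and synthetic token ids (details may differ from the actual script):

    # One training step with the Gemini-wrapped model; assumes the
    # model/optimizer built in the diff above.
    def train_step(model, optimizer, criterion, input_ids, attn_mask):
        optimizer.zero_grad()
        outputs = model(input_ids, attn_mask)
        loss = criterion(outputs, input_ids)  # causal LM: labels are the inputs
        optimizer.backward(loss)              # Gemini/ZeRO handles loss scaling
        optimizer.step()
        return loss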