Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-06 11:32:10 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
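The commit updates the repository's pre-commit hooks and then reformats every tracked file in one pass. For orientation, such a setup typically lives in `.pre-commit-config.yaml`; the sketch below is illustrative only — the hook revisions and the CUDA exclude pattern are assumptions inferred from the commit message, not taken from the repository:

```yaml
# Illustrative sketch only -- NOT ColossalAI's actual configuration.
# Revisions and the exclude pattern are assumptions.
repos:
  - repo: https://github.com/PyCQA/isort
    rev: 5.12.0
    hooks:
      - id: isort          # import ordering
  - repo: https://github.com/psf/black
    rev: 23.9.1
    hooks:
      - id: black          # reformats Python, e.g. the multi-line call sites below
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v16.0.6
    hooks:
      - id: clang-format   # C/C++ formatting
        exclude: '.*\.cu$' # "[misc] ignore cuda for clang-format"
```

Running `pre-commit run --all-files` with a setup of this shape rewrites the whole tree, which is why the hunks below are purely mechanical restyling with no behavior change.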
```diff
@@ -1,5 +1,5 @@
 import time
-from typing import Any, Dict, List
+from typing import Any

 import torch
 import torch.fx
@@ -64,8 +64,10 @@ def _benchmark_autochunk_gpt_gm(
     para_mem = float(parameter_size(model)) / 1024**2 * 6
     act_mem = _benchmark_memory(gm, inputs)
     speed = _benchmark_speed(gm, inputs)
-    print("gpt autochunk, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
-          (speed, act_mem, para_mem, act_mem + para_mem))
+    print(
+        "gpt autochunk, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB"
+        % (speed, act_mem, para_mem, act_mem + para_mem)
+    )


 def _benchmark_autochunk_gpt_origin(
```
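The memory report above scales `parameter_size(model)` (raw parameter bytes) into MB and multiplies by 6, presumably to approximate the training-time footprint of weights plus gradients and optimizer states; the exact rationale is not visible in this diff. `parameter_size` itself is not shown either — a minimal sketch of what such a helper usually computes, offered as an assumption:

```python
import torch

def parameter_size(model: torch.nn.Module) -> int:
    """Sketch (not the repository's definition): total parameter bytes."""
    return sum(p.numel() * p.element_size() for p in model.parameters())
```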
```diff
@@ -86,8 +88,10 @@ def _benchmark_autochunk_gpt_origin(
     para_mem = float(parameter_size(model)) / 1024**2 * 6
     act_mem = _benchmark_memory(model, inputs)
     speed = _benchmark_speed(model, inputs)
-    print("gpt origin, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
-          (speed, act_mem, para_mem, act_mem + para_mem))
+    print(
+        "gpt origin, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB"
+        % (speed, act_mem, para_mem, act_mem + para_mem)
+    )
     return act_mem

```
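Only the next hunk header reveals the signature `_benchmark_speed(model, inputs, loop=5)`; neither helper's body appears in this diff. A minimal sketch of how such helpers are commonly written, assuming CUDA peak-memory statistics and synchronized wall-clock timing:

```python
import time
import torch

def _benchmark_memory(model, inputs):
    """Sketch: peak CUDA memory of one forward pass, in MB."""
    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        model(*inputs)
    return torch.cuda.max_memory_allocated() / 1024**2

def _benchmark_speed(model, inputs, loop=5):
    """Sketch: average forward latency over `loop` iterations, in seconds."""
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(loop):
        model(*inputs)
    torch.cuda.synchronize()
    return (time.time() - start) / loop
```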
```diff
@@ -115,6 +119,7 @@ def _benchmark_speed(model, inputs, loop=5):

 def benchmark_autochunk_gpt(batch=1, seq=512, n_embd=768, n_head=12):
     from test_autochunk_gpt import GPT2Config, GPT2Model, get_data
+
     model = GPT2Model
     config = GPT2Config(n_embd=n_embd, n_positions=seq, n_layer=2, n_head=n_head)
     model = model(config=config)
@@ -125,7 +130,7 @@ def benchmark_autochunk_gpt(batch=1, seq=512, n_embd=768, n_head=12):
         try:
             _benchmark_autochunk_gpt_gm(model, get_data(shape), max_mem * ratio)
         except RuntimeError as e:
-            if e.args[0] == 'Search failed. Try a larger memory threshold.':
+            if e.args[0] == "Search failed. Try a larger memory threshold.":
                 break
         except Exception as e:
             raise e
```
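The `break` above only makes sense inside a loop that the hunk does not show: the benchmark evidently tightens the memory budget (`max_mem * ratio`) step by step until the chunk search gives up. A hedged reconstruction of that control flow — the ratio schedule is invented for illustration:

```python
# Sketch of the surrounding loop (not shown in the diff).
for ratio in [0.5, 0.4, 0.3, 0.2]:  # assumed schedule
    try:
        _benchmark_autochunk_gpt_gm(model, get_data(shape), max_mem * ratio)
    except RuntimeError as e:
        # The search raises a plain RuntimeError with a fixed message,
        # so the caller has to match it by string comparison.
        if e.args[0] == "Search failed. Try a larger memory threshold.":
            break  # the budget is too tight; stop shrinking it
    except Exception as e:
        raise e
```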
```diff
@@ -5,6 +5,7 @@ import torch

 try:
     from transformers import GPT2Config, GPT2Model
+
     MODELS = [GPT2Model]
     HAS_REPO = True
 except:
```
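This is the usual optional-dependency guard: `HAS_REPO` records whether `transformers` is importable so the GPT tests can be skipped rather than fail at collection. The `except:` branch is cut off by the hunk, but a typical consumption of the flag looks like this (the skip decorator and test name are assumptions, not shown in the diff):

```python
import pytest

try:
    from transformers import GPT2Config, GPT2Model

    MODELS = [GPT2Model]
    HAS_REPO = True
except ImportError:  # the file's bare `except:` also lands here
    MODELS = []
    HAS_REPO = False

# Hypothetical usage of the flag on a test:
@pytest.mark.skipif(not HAS_REPO, reason="requires transformers")
def test_autochunk_gpt_smoke():
    ...
```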
```diff
@@ -52,13 +53,15 @@ def test_autochunk_gpt(model, shape, max_memory):


 if __name__ == "__main__":
-    run_test(rank=0,
-             data=get_data((BATCH_SIZE, SEQ_LENGTH)),
-             max_memory=None,
-             model=GPT2Model,
-             config=GPT2Config(n_embd=96, n_position=SEQ_LENGTH, n_layer=2, n_head=4),
-             print_code=False,
-             print_est_mem=False,
-             print_mem=False,
-             print_progress=False,
-             eval_mem=False)
+    run_test(
+        rank=0,
+        data=get_data((BATCH_SIZE, SEQ_LENGTH)),
+        max_memory=None,
+        model=GPT2Model,
+        config=GPT2Config(n_embd=96, n_position=SEQ_LENGTH, n_layer=2, n_head=4),
+        print_code=False,
+        print_est_mem=False,
+        print_mem=False,
+        print_progress=False,
+        eval_mem=False,
+    )
```
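`get_data((BATCH_SIZE, SEQ_LENGTH))` is called in both files but never defined in this diff. For a GPT-2 forward pass it plausibly fabricates token ids (and perhaps an attention mask) of the requested shape; the following is a guess at its contract, not the repository's implementation:

```python
import torch

def get_data(shape):
    """Hypothetical stand-in: random GPT-2-style inputs of shape (batch, seq)."""
    input_ids = torch.randint(0, 50257, shape, dtype=torch.long)  # GPT-2 vocab size
    attention_mask = torch.ones(shape, dtype=torch.long)
    return input_ids, attention_mask
```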
```diff
@@ -38,11 +38,9 @@ def assert_codegen_run(
     meta_tensors = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
     meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
     interp.propagate(*meta_tensors)
-    codegen = AutoChunkCodeGen(meta_graph,
-                               max_memory=max_memory,
-                               print_mem=print_est_mem,
-                               print_progress=print_progress,
-                               eval_mem=eval_mem)
+    codegen = AutoChunkCodeGen(
+        meta_graph, max_memory=max_memory, print_mem=print_est_mem, print_progress=print_progress, eval_mem=eval_mem
+    )
     chunks = codegen.chunk_infos

     # trace and recompile
```
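`AutoChunkCodeGen` rides on `torch.fx`'s pluggable code generation: a `CodeGen` object attached to a traced graph controls the Python source that `recompile()` emits, which is how the chunked forward replaces the default one. The "trace and recompile" step is not fully shown here; a minimal sketch of the generic mechanism it relies on, with the tracing details assumed:

```python
import torch
import torch.fx

def chunked_forward(model: torch.nn.Module, codegen, inputs):
    """Sketch: attach a custom CodeGen and regenerate the module's forward."""
    gm = torch.fx.symbolic_trace(model)  # assumes the model is fx-traceable
    gm.graph.set_codegen(codegen)        # e.g. the AutoChunkCodeGen built above
    gm.recompile()                       # re-emits gm.code from the codegen
    return gm(*inputs)                   # runs the chunked version
```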
```diff
@@ -85,9 +83,9 @@ def assert_allclose(out_model: Any, out_gm: Any) -> None:
     assert allclose for out
     """
     if isinstance(out_model, torch.Tensor):
-        assert torch.allclose(out_model, out_gm,
-                              atol=1e-4), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(
-                                  torch.abs(out_model - out_gm))
+        assert torch.allclose(
+            out_model, out_gm, atol=1e-4
+        ), "fx_out doesn't comply with original output, diff is %.2e" % torch.mean(torch.abs(out_model - out_gm))
     elif isinstance(out_model, dict):
         for k in out_model.keys():
             assert_allclose(out_model[k], out_gm[k])
```
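Because HuggingFace models return dict-like outputs, the `dict` branch recurses so every leaf tensor is checked against the same `atol=1e-4` tolerance. A small usage illustration with made-up values:

```python
import torch

out_model = {"last_hidden_state": torch.ones(2, 4)}
out_gm = {"last_hidden_state": torch.ones(2, 4) + 1e-5}  # well within atol=1e-4
assert_allclose(out_model, out_gm)  # recurses into the dict and passes
```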
```diff
@@ -123,19 +121,21 @@ def run_test(
     )

     # build model and input
-    chunks = assert_codegen_run(model,
-                                data=data,
-                                max_memory=max_memory,
-                                print_code=print_code,
-                                print_est_mem=print_est_mem,
-                                print_mem=print_mem,
-                                print_progress=print_progress,
-                                eval_mem=eval_mem)
+    chunks = assert_codegen_run(
+        model,
+        data=data,
+        max_memory=max_memory,
+        print_code=print_code,
+        print_est_mem=print_est_mem,
+        print_mem=print_mem,
+        print_progress=print_progress,
+        eval_mem=eval_mem,
+    )

     if get_chunk_target is not None:
         chunk_found = [i["region"] for i in chunks]
         chunk_target = get_chunk_target()[max_memory]
-        assert (chunk_found == chunk_target), "found regions %s doesn't equal target regions %s" % (
+        assert chunk_found == chunk_target, "found regions %s doesn't equal target regions %s" % (
             str(chunk_found),
             str(chunk_target),
         )
```
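`chunks` is the `codegen.chunk_infos` list returned through `assert_codegen_run`: each entry is a dict whose `"region"` field names the node span selected for chunking, and `get_chunk_target` supplies the expected regions per memory budget. A hedged illustration of that contract — the concrete indices and keys are invented:

```python
def get_chunk_target():
    """Hypothetical expected regions, keyed by the max_memory setting."""
    return {
        None: [[120, 123], [125, 127]],  # invented node-index ranges
        20: [[118, 127]],
    }

# run_test then checks: [i["region"] for i in chunks] == get_chunk_target()[max_memory]
```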