diff --git a/colossalai/shardformer/modeling/chatglm2_6b/__init__.py b/colossalai/shardformer/modeling/chatglm2_6b/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/inference/benchmark.py b/examples/inference/benchmark.py
index 85c5aac59..a20983fd1 100644
--- a/examples/inference/benchmark.py
+++ b/examples/inference/benchmark.py
@@ -95,11 +95,27 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
 
 
 def benchmark_inference(args):
     if args.model == "toy":
-        model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=8))
+        model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=4))
     elif args.model == "7b":
-        model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-7b-hf"))
+        model = transformers.LlamaForCausalLM(
+            transformers.LlamaConfig(
+                hidden_size=4096,
+                intermediate_size=11008,
+                num_attention_heads=32,
+                num_hidden_layers=32,
+                num_key_value_heads=32,
+            )
+        )
     elif args.model == "13b":
-        model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-13b-hf"))
+        model = transformers.LlamaForCausalLM(
+            transformers.LlamaConfig(
+                hidden_size=5120,
+                intermediate_size=13824,
+                num_attention_heads=40,
+                num_hidden_layers=40,
+                num_key_value_heads=40,
+            )
+        )
     else:
         raise NotImplementedError
diff --git a/examples/inference/run_benchmark.sh b/examples/inference/run_benchmark.sh
index be9f399e0..79008c7d0 100644
--- a/examples/inference/run_benchmark.sh
+++ b/examples/inference/run_benchmark.sh
@@ -1,6 +1,16 @@
 script_dir=$(cd "$(dirname "$0")" && pwd)
 cd "${script_dir}"
 
+# toy model, 2tp*2pp 1024, 128
+python ./benchmark.py \
+    --model="toy" \
+    --dtype="fp16" \
+    --batch_size=2 \
+    --seq_len=1024 \
+    --output_len=128 \
+    --mb_size=1 \
+    --pp_size=2 \
+    --tp_size=2
 
 # 7b, fp16, 2 gpu, 1024, 128
 for BATCH_SIZE in 2 4 8 16; do
@@ -9,7 +19,7 @@ for BATCH_SIZE in 2 4 8 16; do
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=1024 \
-        --new_length=128 \
+        --output_len=128 \
         --mb_size=$((${BATCH_SIZE}/2)) \
         --pp_size=2 \
         --tp_size=2
@@ -22,7 +32,7 @@ for BATCH_SIZE in 2 4 8 16 32; do
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=512 \
-        --new_length=512 \
+        --output_len=512 \
         --mb_size=$((${BATCH_SIZE}/2)) \
         --pp_size=2 \
         --tp_size=2
@@ -35,7 +45,7 @@ for BATCH_SIZE in 2 4 8; do
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=1024 \
-        --new_length=128 \
+        --output_len=128 \
         --mb_size=$((${BATCH_SIZE}/2)) \
         --pp_size=2 \
         --tp_size=2
@@ -48,7 +58,7 @@ for BATCH_SIZE in 2 4 8 16; do
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=512 \
-        --new_length=512 \
+        --output_len=512 \
         --mb_size=$((${BATCH_SIZE}/2)) \
         --pp_size=2 \
         --tp_size=2
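
Note on the benchmark.py hunk above: the 7b and 13b branches no longer fetch a config from the decapoda-research Hub repos and instead build a randomly initialized model from an explicit LlamaConfig. The sketch below is not part of the diff; it only illustrates how the hand-written 7b config can be sanity-checked. It assumes a transformers version whose LlamaConfig accepts num_key_value_heads and PyTorch >= 2.0 for the torch.device("meta") context manager; the ~6.7B figure in the comment is an estimate, not a number from the PR.

import torch
import transformers

# Build the 7b-shaped config using the exact values added in benchmark.py.
config = transformers.LlamaConfig(
    hidden_size=4096,
    intermediate_size=11008,
    num_attention_heads=32,
    num_hidden_layers=32,
    num_key_value_heads=32,
)

# Instantiate on the meta device so no real weight memory is allocated;
# we only want to count parameters, not run the model.
with torch.device("meta"):
    model = transformers.LlamaForCausalLM(config)

total = sum(p.numel() for p in model.parameters())
print(f"7b-shaped Llama: {total / 1e9:.2f}B parameters")  # expected to land around ~6.7B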