[Inference] Fused kv copy into rotary calculation (#5383)

* revise rotary embedding

* remove useless print

* adapt

* fix

* add

* fix

* modeling

* fix

* fix

* fix

* fused kv copy

* fused copy

* colossalai/kernel/triton/no_pad_rotary_embedding.py

* del padding llama

* del
Author: Jianghai
Date: 2024-02-21 11:31:48 +08:00
Committed by: GitHub
Parent: b21aac5bae
Commit: 730103819d
8 changed files with 391 additions and 498 deletions
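
The core change this commit describes: instead of applying the rotary position embedding to q/k and then launching a separate kernel to copy the new k/v entries into the KV cache, both steps are fused into one Triton kernel (colossalai/kernel/triton/no_pad_rotary_embedding.py). Below is a minimal PyTorch sketch of that idea; the function and argument names are hypothetical and only illustrate the fusion, not the actual Triton implementation, which writes into the paged KV cache.

# Minimal PyTorch sketch of the fused step (hypothetical names; the real
# implementation is the Triton kernel in
# colossalai/kernel/triton/no_pad_rotary_embedding.py).
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Standard rotary helper: split the head dim into two halves and rotate.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def fused_rotary_and_kv_copy(q, k, v, cos, sin, k_cache, v_cache, slot_ids):
    # q: [tokens, heads, head_dim], k/v: [tokens, kv_heads, head_dim]
    # cos/sin: [tokens, head_dim], slot_ids: [tokens] cache slots to fill.
    q_rot = q * cos.unsqueeze(1) + rotate_half(q) * sin.unsqueeze(1)
    k_rot = k * cos.unsqueeze(1) + rotate_half(k) * sin.unsqueeze(1)
    # The fusion: the rotated k (and v) are written straight into the cache
    # here, instead of by a second, separate copy kernel launched afterwards.
    k_cache[slot_ids] = k_rot
    v_cache[slot_ids] = v
    return q_rot

Fusing the copy saves a separate kernel launch per decoding step and avoids re-reading the freshly rotated k from global memory before caching it.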


@@ -204,7 +204,7 @@ def benchmark_inference(args):
torch.cuda.cudart().cudaProfilerStop()
if args.profile:
ctx.step()
print(f"config:batch_size {args.batch_size}, input_len{ args.seq_len}, output_len {args.output_len}")
print_details_info(model.config, args, whole_end2end, total_token_num)
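
For context on the hunk above: `ctx` is a step-based profiler context created earlier in the benchmark script (not shown in this diff), torch.cuda.cudart().cudaProfilerStop() typically closes an external capture range (e.g. for nsys), and print_details_info presumably derives throughput from whole_end2end and total_token_num; its body is defined elsewhere in the example. A rough sketch of the profiler pattern that ctx.step() advances, assuming torch.profiler is used (schedule values and log directory are assumptions, not taken from the diff):

# Rough sketch of the pattern stepped by ctx.step() above; all concrete
# arguments here are illustrative, not the benchmark script's actual setup.
import torch

profiler_ctx = torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./tb_log"),
)

with profiler_ctx as ctx:
    for _ in range(4):
        # one generation / decoding iteration would run here
        ctx.step()  # advance the wait -> warmup -> active schedule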


@@ -1,7 +1,8 @@
ROOT=$(realpath $(dirname $0))
echo $ROOT
PY_SCRIPT=${ROOT}/benchmark_llama.py
GPU=$(nvidia-smi -L | head -1 | cut -d' ' -f4 | cut -d'-' -f1)
mode=$1
mode="colossalai"
mkdir -p logs
@@ -23,10 +24,10 @@ CUDA_VISIBLE_DEVICES_set_n_least_memory_usage() {
CUDA_VISIBLE_DEVICES_set_n_least_memory_usage 1
# benchmark llama2-7b one single GPU
for input_len in 128 512 1024; do
for input_len in 128 512 1024; do
for output_len in 128 256; do
for bsz in 16 32 64; do
python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${input_len}_${output_len}_${mode}_${GPU}_${bsz}.txt
python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --model_path "/home/caidi/llama_model/" | tee logs/${input_len}_${output_len}_${mode}_${GPU}_${bsz}.txt
done
done
done