Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-01 17:17:05 +00:00)
[Inference] Fused kv copy into rotary calculation (#5383)
* revise rotary embedding
* remove useless print
* adapt
* fix
* add
* fix
* modeling
* fix
* fix
* fix
* fused kv copy
* fused copy
* colossalai/kernel/triton/no_pad_rotary_embedding.py
* del padding llama
* del
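The core of the change lives in the Triton kernel at colossalai/kernel/triton/no_pad_rotary_embedding.py: rather than applying rotary embedding to q/k and then launching a separate kernel to copy k/v into the KV cache, both steps happen in one fused kernel. Below is a minimal sketch of the fused computation in plain PyTorch, assuming a flat token-indexed cache; the function and parameter names are illustrative, not ColossalAI's actual API:

    # Hedged sketch only: the real implementation is a Triton kernel in
    # colossalai/kernel/triton/no_pad_rotary_embedding.py. All names here
    # (rotary_embed_and_cache_copy, positions, ...) are hypothetical.
    import torch

    def rotary_embed_and_cache_copy(q, k, v, cos, sin, k_cache, v_cache, positions):
        # q, k, v:          [num_tokens, num_heads, head_dim]
        # cos, sin:         [num_tokens, head_dim // 2]
        # k_cache, v_cache: [max_tokens, num_heads, head_dim]
        # positions:        [num_tokens] destination slots in the cache
        half = q.shape[-1] // 2
        q1, q2 = q[..., :half], q[..., half:]
        k1, k2 = k[..., :half], k[..., half:]
        c, s = cos.unsqueeze(1), sin.unsqueeze(1)  # broadcast over heads
        q_rot = torch.cat((q1 * c - q2 * s, q2 * c + q1 * s), dim=-1)
        k_rot = torch.cat((k1 * c - k2 * s, k2 * c + k1 * s), dim=-1)
        # The fused part: k/v land in the cache inside the same kernel,
        # instead of a second copy-kernel launch per decoding step.
        k_cache[positions] = k_rot
        v_cache[positions] = v
        return q_rot

Fusing the copy saves one kernel launch and one extra round trip of k through global memory per step, which matters most during decoding, where kernels are small and launch overhead is a visible fraction of step latency.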
@@ -204,7 +204,7 @@ def benchmark_inference(args):
        torch.cuda.cudart().cudaProfilerStop()
    if args.profile:
        ctx.step()

    print(f"config:batch_size {args.batch_size}, input_len{ args.seq_len}, output_len {args.output_len}")
    print_details_info(model.config, args, whole_end2end, total_token_num)
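The hunk above shows the benchmark's profiling hooks: ctx.step() advances a profiler context once per iteration when --profile is set, and torch.cuda.cudart().cudaProfilerStop() closes an Nsight Systems capture window. A minimal sketch of the torch.profiler side of that pattern, assuming a step function that wraps one generation iteration; the wiring is illustrative, not the script's actual code:

    # Hedged sketch of the ctx.step() pattern; run_step is hypothetical.
    import torch
    from torch.profiler import ProfilerActivity, profile, schedule

    def profile_generation(run_step, num_iters=8):
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            schedule=schedule(wait=1, warmup=2, active=3),
        ) as ctx:
            for _ in range(num_iters):
                run_step()
                ctx.step()  # advance the wait/warmup/active schedule
        print(ctx.key_averages().table(sort_by="cuda_time_total", row_limit=10))

For an nsys capture, the equivalent bracketing is torch.cuda.cudart().cudaProfilerStart() before the hot loop and cudaProfilerStop() after it, run under nsys profile --capture-range=cudaProfilerApi.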
@@ -1,7 +1,8 @@
 ROOT=$(realpath $(dirname $0))
+echo $ROOT
 PY_SCRIPT=${ROOT}/benchmark_llama.py
 GPU=$(nvidia-smi -L | head -1 | cut -d' ' -f4 | cut -d'-' -f1)
-mode=$1
+mode="colossalai"

 mkdir -p logs
@@ -23,10 +24,10 @@ CUDA_VISIBLE_DEVICES_set_n_least_memory_usage() {
 CUDA_VISIBLE_DEVICES_set_n_least_memory_usage 1

 # benchmark llama2-7b one single GPU
 for input_len in 128 512 1024; do
   for output_len in 128 256; do
     for bsz in 16 32 64; do
-      python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${input_len}_${output_len}_${mode}_${GPU}_${bsz}.txt
+      python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --model_path "/home/caidi/llama_model/" | tee logs/${input_len}_${output_len}_${mode}_${GPU}_${bsz}.txt
     done
   done
 done
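The sweep runs 3 input lengths × 2 output lengths × 3 batch sizes = 18 configurations, each logged under logs/. The helper named in the hunk header pins the run to the least-loaded card before the loop starts; a rough Python equivalent, assuming nvidia-smi is on PATH (the shell original lives in the script itself; this reimplementation is illustrative only):

    # Hedged sketch of CUDA_VISIBLE_DEVICES_set_n_least_memory_usage.
    import os
    import subprocess

    def set_n_least_used_gpus(n=1):
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used",
             "--format=csv,noheader,nounits"],
            text=True,
        )
        # One number per GPU, in device-index order; sort by used memory.
        used = sorted((int(mem), idx) for idx, mem in enumerate(out.split()))
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for _, i in used[:n])
        print(f"CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")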