Optimized the execution interval between CUDA kernels caused by view and memcopy operations (#5390)

* opt_view_and_memcopy

* fix bugs in ci

* fix ci bugs

* update benchmark scripts

* fix ci bugs
This commit is contained in:
yuehuayingxueluo
2024-02-21 13:23:57 +08:00
committed by GitHub
parent 730103819d
commit 2a718c8be8
8 changed files with 141 additions and 55 deletions

View File

@@ -95,7 +95,7 @@ def benchmark_inference(args):
else:
assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
model = transformers.LlamaForCausalLM.from_pretrained(args.model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
model = model.eval()
@@ -122,6 +122,7 @@ def benchmark_inference(args):
elif args.mode == "vllm":
engine = LLM(
model=args.model_path,
tokenizer="hf-internal-testing/llama-tokenizer",
max_num_seqs=mbsz,
dtype="float16",
enforce_eager=True,