diff --git a/colossalai/inference/README.md b/colossalai/inference/README.md index c4ff2f522..33903f426 100644 --- a/colossalai/inference/README.md +++ b/colossalai/inference/README.md @@ -94,6 +94,7 @@ inference_config = InferenceConfig( max_batch_size=4, max_input_len=1024, max_output_len=512, + use_cuda_kernel=True, use_cuda_graph=False, # Turn on if you want to use CUDA Graph to accelerate inference ) diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index b3d2bc7bd..6b7c99300 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -389,6 +389,7 @@ class InferenceEngine: fd_inter_tensor=batch.fd_inter_tensor, batch_size=batch.current_batch_size, is_prompts=batch.is_prompts, + use_cuda_kernel=self.inference_config.use_cuda_kernel, use_cuda_graph=use_cuda_graph, kv_seq_len=sequence_lengths.max().item(), head_dim=batch.head_dim,