diff --git a/colossalai/inference/modeling/models/nopadding_llama.py b/colossalai/inference/modeling/models/nopadding_llama.py
index 557ca0d12..5b8b43d4e 100644
--- a/colossalai/inference/modeling/models/nopadding_llama.py
+++ b/colossalai/inference/modeling/models/nopadding_llama.py
@@ -270,7 +270,7 @@ def llama_rmsnorm_forward(
     return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output, residual)
 
 
-class NopadLlamaMLP(ParallelModule, LlamaMLP):
+class NopadLlamaMLP(LlamaMLP, ParallelModule):
     def __init__(
         self,
         config: LlamaConfig,
@@ -392,7 +392,7 @@ class NopadLlamaMLP(ParallelModule, LlamaMLP):
         return f"gate_up_proj MergedLinear1D_Col: in_features={self.gate_up_weight.shape[1]}x2, out_features={self.gate_up_weight.shape[2]}, bias=False"
 
 
-class NopadLlamaAttention(ParallelModule, LlamaAttention):
+class NopadLlamaAttention(LlamaAttention, ParallelModule):
     def __init__(
         self,
         config: LlamaConfig,
diff --git a/examples/inference/benchmark_ops/benchmark_context_attn_unpad.py b/examples/inference/benchmark_ops/benchmark_context_attn_unpad.py
index 498282ba3..18fe76cf0 100644
--- a/examples/inference/benchmark_ops/benchmark_context_attn_unpad.py
+++ b/examples/inference/benchmark_ops/benchmark_context_attn_unpad.py
@@ -4,7 +4,7 @@ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from colossalai.inference.modeling.layers.attention import PagedAttention
 from colossalai.kernel.triton import context_attention_unpadded
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import generate_caches_and_block_tables_v2, torch_attn_ref
+from tests.test_infer.test_kernels.triton.kernel_utils import generate_caches_and_block_tables_v2, torch_attn_ref
 
 try:
     import triton  # noqa
diff --git a/examples/inference/benchmark_ops/benchmark_decoding_attn.py b/examples/inference/benchmark_ops/benchmark_decoding_attn.py
index 1a80961a7..4471ddada 100644
--- a/examples/inference/benchmark_ops/benchmark_decoding_attn.py
+++ b/examples/inference/benchmark_ops/benchmark_decoding_attn.py
@@ -2,14 +2,14 @@ import torch
 
 from colossalai.kernel.triton import flash_decoding_attention
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     torch_attn_ref,
 )
-from tests.test_infer.test_ops.triton.test_decoding_attn import prepare_data
+from tests.test_infer.test_kernels.triton.test_decoding_attn import prepare_data
 
 try:
     import triton  # noqa
diff --git a/examples/inference/benchmark_ops/benchmark_flash_decoding_attention.py b/examples/inference/benchmark_ops/benchmark_flash_decoding_attention.py
index 35eae69b6..d90de6664 100644
--- a/examples/inference/benchmark_ops/benchmark_flash_decoding_attention.py
+++ b/examples/inference/benchmark_ops/benchmark_flash_decoding_attention.py
@@ -3,7 +3,7 @@ import torch
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.kernel.triton import flash_decoding_attention
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     generate_caches_and_block_tables_vllm,
diff --git a/examples/inference/benchmark_ops/benchmark_fused_rotary_embdding_unpad.py b/examples/inference/benchmark_ops/benchmark_fused_rotary_embdding_unpad.py
index 6a499ccf2..80939f5a1 100644
--- a/examples/inference/benchmark_ops/benchmark_fused_rotary_embdding_unpad.py
+++ b/examples/inference/benchmark_ops/benchmark_fused_rotary_embdding_unpad.py
@@ -2,7 +2,7 @@ import torch
 
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.kernel.triton import copy_kv_to_blocked_cache, decoding_fused_rotary_embedding, rotary_embedding
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     mock_alloc_block_table_and_kvcache_v2,
     mock_alloc_block_table_and_kvcache_v3,
     mock_alloc_single_token,
diff --git a/examples/inference/benchmark_ops/benchmark_kv_cache_memcopy.py b/examples/inference/benchmark_ops/benchmark_kv_cache_memcopy.py
index 03f797308..0232cb90e 100644
--- a/examples/inference/benchmark_ops/benchmark_kv_cache_memcopy.py
+++ b/examples/inference/benchmark_ops/benchmark_kv_cache_memcopy.py
@@ -4,8 +4,8 @@ import torch
 from colossalai.inference.modeling.layers.attention import copy_to_cache
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.kernel.triton import copy_kv_to_blocked_cache
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.cuda.test_kv_cache_memcpy import prepare_data as prepare_data_new_kcache_layout
-from tests.test_infer.test_ops.triton.test_kvcache_copy import prepare_data
+from tests.test_infer.test_kernels.cuda.test_kv_cache_memcpy import prepare_data as prepare_data_new_kcache_layout
+from tests.test_infer.test_kernels.triton.test_kvcache_copy import prepare_data
 
 try:
     import triton  # noqa
diff --git a/examples/inference/benchmark_ops/benchmark_xine_copy.py b/examples/inference/benchmark_ops/benchmark_xine_copy.py
index b15232b91..633ceb6f1 100644
--- a/examples/inference/benchmark_ops/benchmark_xine_copy.py
+++ b/examples/inference/benchmark_ops/benchmark_xine_copy.py
@@ -1,7 +1,7 @@
 import torch
 
 from colossalai.kernel.triton import get_xine_cache
-from tests.test_infer.test_ops.triton.test_xine_copy import get_cos_sin
+from tests.test_infer.test_kernels.triton.test_xine_copy import get_cos_sin
 
 try:
     import triton  # noqa
diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_struct.py
index 046ee932d..cc0389af9 100755
--- a/tests/test_infer/test_config_and_struct.py
+++ b/tests/test_infer/test_config_and_struct.py
@@ -80,7 +80,7 @@ def check_config_and_inference():
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_config_and_inference()
 
 
diff --git a/tests/test_infer/test_cuda_graph.py b/tests/test_infer/test_cuda_graph.py
index a0a55d3ad..4cdc62fbe 100644
--- a/tests/test_infer/test_cuda_graph.py
+++ b/tests/test_infer/test_cuda_graph.py
@@ -80,7 +80,7 @@ def check_output_consistency(batch_size):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_output_consistency(32)
     check_output_consistency(64)
     check_output_consistency(128)
diff --git a/tests/test_infer/test_inference_engine.py b/tests/test_infer/test_inference_engine.py
index 25413a292..a0ddbbc7b 100644
--- a/tests/test_infer/test_inference_engine.py
+++ b/tests/test_infer/test_inference_engine.py
@@ -157,7 +157,7 @@ def check_spec_dec(num_layers, max_length):
 
 
 def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     if ret:
         ret[rank] = func_to_run(**kwargs)
 
diff --git a/tests/test_infer/test_ops/__init__.py b/tests/test_infer/test_kernels/__init__.py
similarity index 100%
rename from tests/test_infer/test_ops/__init__.py
rename to tests/test_infer/test_kernels/__init__.py
diff --git a/tests/test_infer/test_ops/cuda/__init__.py b/tests/test_infer/test_kernels/cuda/__init__.py
similarity index 100%
rename from tests/test_infer/test_ops/cuda/__init__.py
rename to tests/test_infer/test_kernels/cuda/__init__.py
diff --git a/tests/test_infer/test_ops/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
similarity index 98%
rename from tests/test_infer/test_ops/cuda/test_flash_decoding_attention.py
rename to tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
index b3bd503bb..80a5d067b 100644
--- a/tests/test_infer/test_ops/cuda/test_flash_decoding_attention.py
+++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
@@ -7,11 +7,11 @@ import torch
 from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.test_context_attn_unpad import generate_alibi_mask
+from tests.test_infer.test_kernels.triton.test_context_attn_unpad import generate_alibi_mask
 
 inference_ops = InferenceOpsLoader().load()
 
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
     generate_caches_and_block_tables_v3,
diff --git a/tests/test_infer/test_ops/cuda/test_get_cos_and_sin.py b/tests/test_infer/test_kernels/cuda/test_get_cos_and_sin.py
similarity index 95%
rename from tests/test_infer/test_ops/cuda/test_get_cos_and_sin.py
rename to tests/test_infer/test_kernels/cuda/test_get_cos_and_sin.py
index c632cfe30..b6ba1a01b 100644
--- a/tests/test_infer/test_ops/cuda/test_get_cos_and_sin.py
+++ b/tests/test_infer/test_kernels/cuda/test_get_cos_and_sin.py
@@ -3,7 +3,7 @@ import pytest
 import torch
 
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
-from tests.test_infer.test_ops.triton.test_xine_copy import get_cos_sin
+from tests.test_infer.test_kernels.triton.test_xine_copy import get_cos_sin
 
 inference_ops = InferenceOpsLoader().load()
diff --git a/tests/test_infer/test_ops/cuda/test_kv_cache_memcpy.py b/tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py
similarity index 97%
rename from tests/test_infer/test_ops/cuda/test_kv_cache_memcpy.py
rename to tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py
index e9c99ddc7..d90f64690 100644
--- a/tests/test_infer/test_ops/cuda/test_kv_cache_memcpy.py
+++ b/tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py
@@ -4,7 +4,10 @@ import torch.nn.functional as F
 
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import generate_caches_and_block_tables_v3, mock_alloc_single_token
+from tests.test_infer.test_kernels.triton.kernel_utils import (
+    generate_caches_and_block_tables_v3,
+    mock_alloc_single_token,
+)
 
 inference_ops = InferenceOpsLoader().load()
diff --git a/tests/test_infer/test_ops/cuda/test_rms_layernorm.py b/tests/test_infer/test_kernels/cuda/test_rms_layernorm.py
similarity index 100%
rename from tests/test_infer/test_ops/cuda/test_rms_layernorm.py
rename to tests/test_infer/test_kernels/cuda/test_rms_layernorm.py
diff --git a/tests/test_infer/test_ops/cuda/test_rotary_embdding_unpad.py b/tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py
similarity index 96%
rename from tests/test_infer/test_ops/cuda/test_rotary_embdding_unpad.py
rename to tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py
index 501bf65d8..8237384c0 100644
--- a/tests/test_infer/test_ops/cuda/test_rotary_embdding_unpad.py
+++ b/tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py
@@ -7,8 +7,8 @@ from colossalai.kernel.kernel_loader import InferenceOpsLoader
 
 inference_ops = InferenceOpsLoader().load()
 
-from tests.test_infer.test_ops.triton.kernel_utils import mock_alloc_block_table_and_kvcache_v3
-from tests.test_infer.test_ops.triton.test_rotary_embdding_unpad import torch_rotary_emb
+from tests.test_infer.test_kernels.triton.kernel_utils import mock_alloc_block_table_and_kvcache_v3
+from tests.test_infer.test_kernels.triton.test_rotary_embdding_unpad import torch_rotary_emb
 
 
 def numpy_allclose(x, y, rtol, atol):
diff --git a/tests/test_infer/test_ops/cuda/test_silu_and_mul.py b/tests/test_infer/test_kernels/cuda/test_silu_and_mul.py
similarity index 100%
rename from tests/test_infer/test_ops/cuda/test_silu_and_mul.py
rename to tests/test_infer/test_kernels/cuda/test_silu_and_mul.py
diff --git a/tests/test_infer/test_ops/triton/__init__.py b/tests/test_infer/test_kernels/triton/__init__.py
similarity index 100%
rename from tests/test_infer/test_ops/triton/__init__.py
rename to tests/test_infer/test_kernels/triton/__init__.py
diff --git a/tests/test_infer/test_ops/triton/kernel_utils.py b/tests/test_infer/test_kernels/triton/kernel_utils.py
similarity index 100%
rename from tests/test_infer/test_ops/triton/kernel_utils.py
rename to tests/test_infer/test_kernels/triton/kernel_utils.py
diff --git a/tests/test_infer/test_ops/triton/test_context_attn_unpad.py b/tests/test_infer/test_kernels/triton/test_context_attn_unpad.py
similarity index 99%
rename from tests/test_infer/test_ops/triton/test_context_attn_unpad.py
rename to tests/test_infer/test_kernels/triton/test_context_attn_unpad.py
index 76785d530..e34fada97 100644
--- a/tests/test_infer/test_ops/triton/test_context_attn_unpad.py
+++ b/tests/test_infer/test_kernels/triton/test_context_attn_unpad.py
@@ -5,7 +5,7 @@ from packaging import version
 from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
 from colossalai.kernel.triton import context_attention_unpadded
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     torch_attn_ref,
diff --git a/tests/test_infer/test_ops/triton/test_decoding_attn.py b/tests/test_infer/test_kernels/triton/test_decoding_attn.py
similarity index 97%
rename from tests/test_infer/test_ops/triton/test_decoding_attn.py
rename to tests/test_infer/test_kernels/triton/test_decoding_attn.py
index 616d7868b..24741fecf 100644
--- a/tests/test_infer/test_ops/triton/test_decoding_attn.py
+++ b/tests/test_infer/test_kernels/triton/test_decoding_attn.py
@@ -6,14 +6,14 @@ from packaging import version
 from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
 from colossalai.kernel.triton import flash_decoding_attention
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     convert_kv_unpad_to_padded,
     create_attention_mask,
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     torch_attn_ref,
 )
-from tests.test_infer.test_ops.triton.test_context_attn_unpad import generate_alibi_mask
+from tests.test_infer.test_kernels.triton.test_context_attn_unpad import generate_alibi_mask
 
 try:
     import triton  # noqa
diff --git a/tests/test_infer/test_ops/triton/test_fused_rotary_embedding.py b/tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py
similarity index 100%
rename from tests/test_infer/test_ops/triton/test_fused_rotary_embedding.py
rename to tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py
diff --git a/tests/test_infer/test_ops/triton/test_kvcache_copy.py b/tests/test_infer/test_kernels/triton/test_kvcache_copy.py
similarity index 99%
rename from tests/test_infer/test_ops/triton/test_kvcache_copy.py
rename to tests/test_infer/test_kernels/triton/test_kvcache_copy.py
index 95126c087..336eb256b 100644
--- a/tests/test_infer/test_ops/triton/test_kvcache_copy.py
+++ b/tests/test_infer/test_kernels/triton/test_kvcache_copy.py
@@ -4,7 +4,7 @@ from packaging import version
 
 from colossalai.kernel.triton import copy_k_to_blocked_cache, copy_kv_to_blocked_cache
 from colossalai.utils import get_current_device
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     generate_caches_and_block_tables_v2,
     generate_caches_and_block_tables_v3,
     mock_alloc_single_token,
diff --git a/tests/test_infer/test_ops/triton/test_rmsnorm_triton.py b/tests/test_infer/test_kernels/triton/test_rmsnorm_triton.py
similarity index 100%
rename from tests/test_infer/test_ops/triton/test_rmsnorm_triton.py
rename to tests/test_infer/test_kernels/triton/test_rmsnorm_triton.py
diff --git a/tests/test_infer/test_ops/triton/test_rotary_embdding_unpad.py b/tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py
similarity index 98%
rename from tests/test_infer/test_ops/triton/test_rotary_embdding_unpad.py
rename to tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py
index 87eb38135..570093693 100644
--- a/tests/test_infer/test_ops/triton/test_rotary_embdding_unpad.py
+++ b/tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py
@@ -4,7 +4,7 @@ from packaging import version
 from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
 
 from colossalai.kernel.triton import decoding_fused_rotary_embedding
-from tests.test_infer.test_ops.triton.kernel_utils import (
+from tests.test_infer.test_kernels.triton.kernel_utils import (
     mock_alloc_block_table_and_kvcache_v2,
     mock_alloc_block_table_and_kvcache_v3,
 )
diff --git a/tests/test_infer/test_ops/triton/test_xine_copy.py b/tests/test_infer/test_kernels/triton/test_xine_copy.py
similarity index 100%
rename from tests/test_infer/test_ops/triton/test_xine_copy.py
rename to tests/test_infer/test_kernels/triton/test_xine_copy.py
diff --git a/tests/test_infer/test_kvcache_manager.py b/tests/test_infer/test_kvcache_manager.py
index 321047706..bca9a1a84 100755
--- a/tests/test_infer/test_kvcache_manager.py
+++ b/tests/test_infer/test_kvcache_manager.py
@@ -164,7 +164,7 @@ def check_cache_manager(test_config):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_cache_manager()
 
 
diff --git a/tests/test_infer/test_models/test_baichuan.py b/tests/test_infer/test_models/test_baichuan.py
index 5d6be5cb1..3d6fc3bdb 100644
--- a/tests/test_infer/test_models/test_baichuan.py
+++ b/tests/test_infer/test_models/test_baichuan.py
@@ -14,7 +14,6 @@ from colossalai.inference.core.engine import InferenceEngine
 from colossalai.inference.modeling.policy import NoPaddingBaichuanModelInferPolicy
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
-# BAICHUAN_MODEL_NAME_OR_PATH = "baichuan-inc/Baichuan2-7B-Base"
 BAICHUAN_MODEL_NAME_OR_PATH = "baichuan-inc/Baichuan2-13B-Base"
 
 
@@ -87,7 +86,7 @@ def run_engine(world_size, **kwargs):
 
 
 def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     if ret:
         ret[rank] = func_to_run(**kwargs)
 
@@ -99,7 +98,7 @@ def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
 @parameterize("prompt_template", [None, "baichuan"])
 @parameterize("do_sample", [False])
 @parameterize("use_cuda_kernel", [True])
-def test_tp_engine(prompt_template, do_sample, use_cuda_kernel):
+def check_tp_engine(prompt_template, do_sample, use_cuda_kernel):
     kwargs1 = {
         "use_engine": True,
         "prompt_template": prompt_template,
@@ -132,7 +131,7 @@ def test_tp_engine(prompt_template, do_sample, use_cuda_kernel):
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
 def test_inference_engine():
-    test_tp_engine()
+    check_tp_engine()
 
 
 if __name__ == "__main__":
diff --git a/tests/test_infer/test_request_handler.py b/tests/test_infer/test_request_handler.py
index c7a35ebbe..912fdbf11 100644
--- a/tests/test_infer/test_request_handler.py
+++ b/tests/test_infer/test_request_handler.py
@@ -90,7 +90,7 @@ def check_request_handler():
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_running_list()
     check_request_handler()
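Note on the base-class swap in nopadding_llama.py: listing LlamaMLP (and LlamaAttention) before ParallelModule changes Python's method resolution order, so attribute lookups and cooperative super() calls reach the HuggingFace implementation before ColossalAI's ParallelModule. A minimal sketch of the mechanism, using stand-in classes rather than the real ones:

    # Stand-ins for illustration only; not the actual ColossalAI/HF classes.
    class ParallelModule:
        def extra_repr(self):
            return "parallel"

    class LlamaMLP:
        def extra_repr(self):
            return "llama"

    # Base-class order decides which implementation wins in the MRO.
    class OldOrder(ParallelModule, LlamaMLP):
        pass

    class NewOrder(LlamaMLP, ParallelModule):
        pass

    print(OldOrder().extra_repr())  # -> "parallel"
    print(NewOrder().extra_repr())  # -> "llama"
    print([c.__name__ for c in NewOrder.__mro__])
    # -> ['NewOrder', 'LlamaMLP', 'ParallelModule', 'object']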
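Note on the rename of test_tp_engine to check_tp_engine in test_baichuan.py: pytest collects any module-level function whose name starts with "test_", so a @parameterize-decorated helper named test_tp_engine would also have been invoked directly by pytest, outside the spawned distributed processes. The rename follows the check_* convention already used in these files (check_cache_manager, check_output_consistency, check_request_handler), leaving test_inference_engine as the only collected entry point. A minimal sketch of the pattern, with names taken from the diff:

    # Not collected by pytest (helper prefix "check_"); only invoked
    # through the entry point below.
    def check_tp_engine():
        ...

    # Collected by pytest (name starts with "test_").
    def test_inference_engine():
        check_tp_engine()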