diff --git a/colossalai/cli/launcher/__init__.py b/colossalai/cli/launcher/__init__.py
index 8d9ec147d..808e4e845 100644
--- a/colossalai/cli/launcher/__init__.py
+++ b/colossalai/cli/launcher/__init__.py
@@ -28,7 +28,7 @@ from .run import launch_multi_processes
               type=str,
               default=None,
               help=
-              "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
+              "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
               " only effective when used with --hostfile.")
 @click.option("--num_nodes",
               type=int,
diff --git a/colossalai/cli/launcher/hostinfo.py b/colossalai/cli/launcher/hostinfo.py
index 065cbc371..d1b88b229 100644
--- a/colossalai/cli/launcher/hostinfo.py
+++ b/colossalai/cli/launcher/hostinfo.py
@@ -38,7 +38,7 @@ class HostInfo:
 
         # socket.getfqdn("127.0.0.1") does not return localhost
         # on some users' machines
-        # thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0
+        # thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
         if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
             return True
 
diff --git a/colossalai/cli/launcher/multinode_runner.py b/colossalai/cli/launcher/multinode_runner.py
index a51e1e371..85b241e96 100644
--- a/colossalai/cli/launcher/multinode_runner.py
+++ b/colossalai/cli/launcher/multinode_runner.py
@@ -114,7 +114,7 @@ class MultiNodeRunner:
         Receive messages from all hosts
 
         Returns:
-            msg_from_node (dict): a dictionry which contains messages from each node
+            msg_from_node (dict): a dictionary which contains messages from each node
         """
 
         msg_from_node = dict()
diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py
index 6411b4302..027a10aa8 100644
--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
@@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None:
     # receive the stop status
     msg_from_node = runner.recv_from_all()
 
-    # printe node status
+    # print node status
     click.echo("\n====== Stopping All Nodes =====")
     for hostname, msg in msg_from_node.items():
         click.echo(f"{hostname}: {msg}")
diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py
index f8b20de9b..f4e6cfffb 100644
--- a/colossalai/device/alpha_beta_profiler.py
+++ b/colossalai/device/alpha_beta_profiler.py
@@ -197,7 +197,7 @@ class AlphaBetaProfiler:
             dist.broadcast_object_list(broadcast_list, src=process_group[0])
             alpha_beta_dict[process_group] = tuple(broadcast_list)
 
-        # add symmetry pair to the apha_beta_dict
+        # add symmetry pair to the alpha_beta_dict
         symmetry_ab_dict = {}
         for process_group, alpha_beta_pair in alpha_beta_dict.items():
             symmetry_process_group = (process_group[1], process_group[0])
diff --git a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py
index 85f1553e3..591485fdb 100644
--- a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py
+++ b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py
@@ -51,7 +51,7 @@ class BiasAdditionModule(ABC):
 
         For example:
         The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are
-        considered during module initilizing. However, we need to consider those attributes as kwargs
+        considered during module initializing. However, we need to consider those attributes as kwargs
         in F.conv2d.
         """
         pass
diff --git a/colossalai/fx/tracer/experimental.py b/colossalai/fx/tracer/experimental.py
index 88b65b618..22a67d1ce 100644
--- a/colossalai/fx/tracer/experimental.py
+++ b/colossalai/fx/tracer/experimental.py
@@ -295,7 +295,7 @@ class ColoTracer(Tracer):
 
         @staticmethod
         def forward(ctx, run_function, preserve_rng_state, *args):
-            # signal that the current tracing occurs within activaton checkpoint part
+            # signal that the current tracing occurs within activation checkpoint part
             self.inside_torch_checkpoint_func = True
             out = run_function(*args)
             self.inside_torch_checkpoint_func = False
diff --git a/colossalai/fx/tracer/tracer.py b/colossalai/fx/tracer/tracer.py
index 1ae31f958..28965a1b8 100644
--- a/colossalai/fx/tracer/tracer.py
+++ b/colossalai/fx/tracer/tracer.py
@@ -92,7 +92,7 @@ class ColoTracer(Tracer):
             return proxy
 
         # if graph is traced for auto parallelism module, some extra node will be added during
-        # graph construction to deal with the compatability between bias addition and all reduce.
+        # graph construction to deal with the compatibility between bias addition and all reduce.
         # if no extra manipulation is applied, we just pass the origin arguments to create_proxy function
         # to create node on computation graph
 
@@ -208,7 +208,7 @@
             self.proxy_cls = ColoProxy
             self.tracer_type = TracerType.META
         else:
-            raise ValueError(f"Unrecognised tracer type {tracer_type}")
+            raise ValueError(f"Unrecognized tracer type {tracer_type}")
 
     def _meta_data_computing(self, kind, target, args, kwargs):
 
@@ -445,7 +445,7 @@
 
         @staticmethod
         def forward(ctx, run_function, preserve_rng_state, *args):
-            # signal that the current tracing occurs within activaton checkpoint part
+            # signal that the current tracing occurs within activation checkpoint part
             self.inside_torch_checkpoint_func = True
             out = run_function(*args)
             self.inside_torch_checkpoint_func = False
diff --git a/colossalai/kernel/cuda_native/flash_attention.py b/colossalai/kernel/cuda_native/flash_attention.py
index d793815ed..3db737450 100644
--- a/colossalai/kernel/cuda_native/flash_attention.py
+++ b/colossalai/kernel/cuda_native/flash_attention.py
@@ -138,7 +138,7 @@ if HAS_MEM_EFF_ATTN:
         elif attn_mask_type == AttnMaskType.causal:    # gpt style
             attn_bias = LowerTriangularMask()
 
-        if bias is not None:    # alibi / relative position emebedding
+        if bias is not None:    # alibi / relative position embedding
             assert allow_alibi, "flash attention with bias is not supported in this system."
             assert attn_mask_type == AttnMaskType.causal, \
                 "attention with bias is only supported for causal attention so far."
diff --git a/colossalai/kernel/cuda_native/multihead_attention.py b/colossalai/kernel/cuda_native/multihead_attention.py
index 3b6470cdc..69246f2f3 100644
--- a/colossalai/kernel/cuda_native/multihead_attention.py
+++ b/colossalai/kernel/cuda_native/multihead_attention.py
@@ -43,7 +43,7 @@ class Config:
     attn_prob_dropout_ratio: float    # attention score dropout ratio
     hidden_dropout_ratio: float    # dropout ration before residual
     norm_first: bool    # norm_first
-    fp16: bool    # fp16 presion
+    fp16: bool    # fp16 precision
 
 
 class MultiHeadAttention1DFunc(Function):
diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py
index aa41f5767..e20c08b05 100644
--- a/colossalai/kernel/jit/option.py
+++ b/colossalai/kernel/jit/option.py
@@ -43,7 +43,7 @@ def warmup_jit_fusion(batch_size: int,
                       seq_length: int = 512,
                       vocab_size: int = 32768,
                       dtype: torch.dtype = torch.float32):
-    """ Compilie JIT functions before the main training steps """
+    """ Compile JIT functions before the main training steps """
 
     embed = Embedding(vocab_size, hidden_size).to(get_current_device())
     linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_current_device())
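
Reviewer note on the hostinfo.py hunk: it only rewords a comment, but that comment documents a real portability quirk, namely that socket.getfqdn("127.0.0.1") does not resolve to "localhost" on every machine, so the loopback spellings are matched directly before any lookup. Below is a minimal standalone sketch of that kind of check using only the Python standard library; the function name is_local_host and the address-comparison fallback are illustrative assumptions, not code taken from the repository.

import socket

def is_local_host(hostname: str) -> bool:
    # Match common loopback spellings directly, since socket.getfqdn("127.0.0.1")
    # is not guaranteed to return "localhost" on every machine.
    if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
        return True
    # Illustrative fallback: compare the resolved address with this machine's own address.
    try:
        return socket.gethostbyname(hostname) == socket.gethostbyname(socket.gethostname())
    except socket.gaierror:
        return False

print(is_local_host("127.0.0.1"))    # True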
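
Reviewer note on the bias_addition_module.py hunk: the corrected docstring explains why attributes such as 'padding' or 'groups' need special handling, since they are absorbed when nn.Conv2d is constructed but must be passed explicitly as kwargs to F.conv2d, with the bias addition becoming a separate step. The following is a short, hedged illustration of that equivalence in plain PyTorch; it is not ColossalAI code and is independent of the patch.

import torch
import torch.nn.functional as F
from torch import nn

conv = nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=True)
x = torch.randn(1, 3, 16, 16)

# Attributes fixed at construction time must be repeated as kwargs for F.conv2d,
# and the bias is then added as an explicit follow-up step.
out = F.conv2d(x, conv.weight, bias=None, stride=conv.stride,
               padding=conv.padding, dilation=conv.dilation, groups=conv.groups)
out = out + conv.bias.reshape(1, -1, 1, 1)

assert torch.allclose(conv(x), out, atol=1e-6)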