@@ -18,11 +18,11 @@ from transformers import (
 )

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Prepare Hyperparameters
@@ -59,7 +59,7 @@ def evaluate_model(
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(ignore_chunk=True)

-    accum_loss = torch.zeros(1, device=get_current_device())
+    accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
     for batch in dataloader:
         batch = move_to_cuda(batch)
         labels = batch["labels"]
@@ -89,8 +89,10 @@ def evaluate_model(
             object_list = [None, None]
             dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)

-            metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
-            accum_loss.add_(object_list[1].to(get_current_device()))
+            metric.add_batch(
+                predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
+            )
+            accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))

         else:
             batch = move_to_cuda(batch)
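Note: every hunk in this patch applies the same substitution — device lookups that previously went through colossalai.utils.get_current_device() are routed through the accelerator abstraction instead, so device selection goes through a backend-agnostic entry point. A minimal sketch of the before/after pattern, assuming only the two APIs visible in these hunks:

import torch

from colossalai.accelerator import get_accelerator

# Old style (removed in this patch):
#   from colossalai.utils import get_current_device
#   loss = torch.zeros(1, device=get_current_device())

# New style (added in this patch): ask the accelerator abstraction for the
# current device instead of calling the standalone utility.
loss = torch.zeros(1, device=get_accelerator().get_current_device())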
@@ -7,13 +7,13 @@ from model_zoo import GPTLMLoss, get_gpt2_components
 from torch.utils._pytree import tree_map

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
 from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
 from colossalai.auto_parallel.offload.solver import NOT_NVML
 from colossalai.fx.profiler import parameter_size
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import spawn
-from colossalai.utils import get_current_device


 def parse_args():
@@ -41,7 +41,7 @@ def train_gpt(args):
             64,
             8,
         ),
-        device=get_current_device(),
+        device=get_accelerator().get_current_device(),
     )
     criterion = GPTLMLoss()

@@ -12,12 +12,12 @@ from commons.utils import get_data, get_profile_context, get_tflops, get_time_st
 from packaging import version

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 CAI_VERSION = colossalai.__version__

@@ -141,7 +141,11 @@ def main():
     criterion = GPTLMLoss()
     torch.manual_seed(123)
     if args.distplan.startswith("CAI"):
-        ctx = LazyInitContext(default_device=get_current_device()) if args.distplan == "CAI_Gemini" else nullcontext()
+        ctx = (
+            LazyInitContext(default_device=get_accelerator().get_current_device())
+            if args.distplan == "CAI_Gemini"
+            else nullcontext()
+        )
         # build GPT model
         with ctx:
             model = model_builder(args.model_type)(checkpoint=True)
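Note: the lazy-initialization hunks follow the same idea — the default device handed to LazyInitContext now comes from the accelerator. A condensed sketch of the selection logic these examples use; make_init_ctx and use_gemini are illustrative names, not part of the patch:

from contextlib import nullcontext

from colossalai.accelerator import get_accelerator
from colossalai.lazy import LazyInitContext

def make_init_ctx(use_gemini: bool):
    # Materialize parameters lazily on the accelerator's current device when a
    # Gemini-style plugin is selected; otherwise build the model eagerly.
    return (
        LazyInitContext(default_device=get_accelerator().get_current_device())
        if use_gemini
        else nullcontext()
    )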
@@ -13,11 +13,11 @@ from tqdm import tqdm
 from transformers import AutoConfig, GPT2ForSequenceClassification, get_linear_schedule_with_warmup

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Prepare Hyperparameters
@@ -54,7 +54,7 @@ def evaluate_model(
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()

-    accum_loss = torch.zeros(1, device=get_current_device())
+    accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
     for batch in dataloader:
         batch = move_to_cuda(batch)
         labels = batch["labels"]
@@ -83,8 +83,10 @@ def evaluate_model(
             object_list = [None, None]
             dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)

-            metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
-            accum_loss.add_(object_list[1].to(get_current_device()))
+            metric.add_batch(
+                predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
+            )
+            accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))

         else:
             batch = move_to_cuda(batch)
@@ -5,6 +5,7 @@ from torch import nn as nn
 from torch.nn import functional as F
 from torch.nn.parameter import Parameter

+from colossalai.accelerator import get_accelerator
 from colossalai.legacy.context import ParallelMode, seed
 from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.base_layer import ParallelLayer
@@ -12,7 +13,6 @@ from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_b
 from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
 from colossalai.legacy.nn.layer.utils import divide
 from colossalai.legacy.registry import LAYERS, LOSSES
-from colossalai.utils import get_current_device


 class VocabParallelEmbedding(torch.nn.Module):
@@ -96,7 +96,9 @@ class VocabParallelEmbedding(torch.nn.Module):
         if position_ids is not None:
             position_ids = position_ids.view(-1, input_shape[-1])
         if position_ids is None:
-            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = torch.arange(
+                0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
+            )
             position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
         position_embeddings = self.position_embeddings(position_ids)

@@ -194,7 +196,7 @@ class VocabParallelEmbedding1D(torch.nn.Module):
         self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index

         # Allocate weights and initialize.
-        factory_kwargs = {"device": get_current_device(), "dtype": dtype}
+        factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
         self.weight = Parameter(torch.empty(self.num_embeddings_per_partition, self.embedding_dim, **factory_kwargs))
         init.uniform_(self.weight, -1, 1)

@@ -439,7 +441,9 @@ class HiddenParallelEmbedding(torch.nn.Module):
         if position_ids is not None:
             position_ids = position_ids.view(-1, input_shape[-1])
         if position_ids is None:
-            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = torch.arange(
+                0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
+            )
             position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
         position_embeddings = self.position_embeddings(position_ids)

@@ -532,7 +536,7 @@ class HiddenParallelEmbedding1D(torch.nn.Module):
         self._weight = None

         # Allocate weights and initialize.
-        factory_kwargs = {"device": get_current_device(), "dtype": dtype}
+        factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
         self.weight = Parameter(torch.empty(num_embeddings, embed_dim_per_partition, **factory_kwargs))
         init.uniform_(self.weight, -1, 1)

@@ -13,13 +13,12 @@ from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaForCausalLM

 import colossalai
-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Constants
@@ -177,7 +176,7 @@ def main():
     # Initialize Model and Optimizer
     # ==============================
     init_ctx = (
-        LazyInitContext(default_device=get_current_device())
+        LazyInitContext(default_device=get_accelerator().get_current_device())
         if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
         else nullcontext()
     )
@@ -208,7 +207,9 @@ def main():
     torch.set_default_dtype(torch.bfloat16)
     model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
     torch.set_default_dtype(torch.float)
-    coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
+    )
     coordinator.print_on_master(
         f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
     )
@@ -234,7 +235,7 @@ def main():
         performance_evaluator.on_step_end(**batch)

     performance_evaluator.on_fit_end()
-    coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
+    coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")


 if __name__ == "__main__":
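Note: peak-memory reporting moves from the CUDA-specific device_utils.max_memory_allocated() to the accelerator object, so the same log line can be produced on whichever backend is active. A small sketch of the reporting pattern used above; report_peak_memory is a hypothetical helper, not part of the patch:

import resource

from colossalai.accelerator import get_accelerator

def report_peak_memory(tag: str) -> None:
    # Device-side peak allocation, queried through the accelerator abstraction.
    device_mb = get_accelerator().max_memory_allocated() / 1024**2
    # Host-side peak resident set size (ru_maxrss is reported in KB on Linux).
    host_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
    print(f"{tag}: max device memory {device_mb:.2f} MB, max CPU memory {host_mb:.2f} MB")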
@@ -8,7 +8,7 @@ from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.utils.data import DataLoader, Dataset, DistributedSampler

-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator


 class StatefulDistributedSampler(DistributedSampler):
@@ -108,7 +108,9 @@ class RandomDataset(Dataset):
     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
         self.num_samples = num_samples
         self.max_length = max_length
-        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
         self.attention_mask = torch.ones_like(self.input_ids)

     def __len__(self):
@@ -21,13 +21,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
 from transformers.models.llama.tokenization_llama import LlamaTokenizer

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device


 def get_model_numel(model: nn.Module) -> int:
@@ -191,7 +191,9 @@ def main():
     config = LlamaConfig.from_pretrained(args.model_path)
     # use lazy init when using GeminiPlugin
     init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, GeminiPlugin)
+        else nullcontext()
     )

     with init_ctx:
@@ -5,9 +5,8 @@ import torch
 import torch.distributed as dist
 from torch import Tensor

-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.cluster import DistCoordinator
-from colossalai.utils.device import get_current_device


 def divide(x: float, y: float) -> float:
@@ -22,7 +21,7 @@ def divide(x: float, y: float) -> float:
 def all_reduce_mean(x: float, world_size: int) -> float:
     if world_size == 1:
         return x
-    tensor = torch.tensor([x], device=get_current_device())
+    tensor = torch.tensor([x], device=get_accelerator().get_current_device())
     dist.all_reduce(tensor)
     tensor = tensor / world_size
     return tensor.item()
@@ -86,13 +85,13 @@ class PerformanceEvaluator:
         self.disable = self.ignore_steps > 0 and step < self.ignore_steps
         if self.disable:
             return
-        device_utils.synchronize()
+        get_accelerator().synchronize()
         self.timer.start()

     def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
         if self.disable:
             return
-        device_utils.synchronize()
+        get_accelerator().synchronize()
         self.timer.end()

         batch_size, seq_len = input_ids.shape
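Note: in the PerformanceEvaluator, device_utils.synchronize() becomes get_accelerator().synchronize(). The synchronization is what makes the step timer meaningful, since kernels are launched asynchronously and the device must be drained before reading the clock. A reduced sketch of that timing pattern; time_step and step_fn are illustrative names, not the evaluator's actual attributes:

import time

from colossalai.accelerator import get_accelerator

def time_step(step_fn) -> float:
    # Drain outstanding device work so the timestamps bracket the real computation.
    get_accelerator().synchronize()
    start = time.perf_counter()
    step_fn()
    get_accelerator().synchronize()
    return time.perf_counter() - start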
@@ -20,13 +20,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
 from transformers.models.llama.tokenization_llama import LlamaTokenizer

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 MODEL_CONFIGS = {
     "7b": LlamaConfig(max_position_embeddings=4096),
@@ -227,7 +227,9 @@ def main():
     config = MODEL_CONFIGS[args.config]
     # use lazy init when using GeminiPlugin
     init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, GeminiPlugin)
+        else nullcontext()
     )

     with init_ctx:
@@ -14,6 +14,7 @@ from transformers.models.llama import LlamaConfig
 from utils import PerformanceEvaluator, get_model_numel

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
@@ -21,7 +22,6 @@ from colossalai.moe.layers import apply_load_balance
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device


 def move_to_cuda(batch, device):
@@ -64,13 +64,15 @@ class RandomDataset(Dataset):
                 )
                 self.input_ids.append(encode["input_ids"])
                 self.attention_mask.append(encode["attention_mask"])
-            self.input_ids = torch.cat(self.input_ids, dim=0).to(get_current_device())
-            self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_current_device())
+            self.input_ids = torch.cat(self.input_ids, dim=0).to(get_accelerator().get_current_device())
+            self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_accelerator().get_current_device())
             repeat_times = num_samples // self.input_ids.shape[0] + 1
             self.input_ids = self.input_ids.repeat(repeat_times, 1)[:num_samples]
             self.attention_mask = self.attention_mask.repeat(repeat_times, 1)[:num_samples]
         else:
-            self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+            self.input_ids = torch.randint(
+                0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+            )
             self.attention_mask = torch.ones_like(self.input_ids)

     def __len__(self):
@@ -35,7 +35,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )

-from colossalai.kernel.cuda_native.mha.flash_attn_2 import HAS_FLASH_ATTN
+from colossalai.kernel.extensions.flash_attention import HAS_FLASH_ATTN
 from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
 from colossalai.moe.layers import SparseMLP
 from colossalai.moe.manager import MOE_MANAGER
@@ -15,6 +15,7 @@ from transformers import T5Tokenizer
 from transformers.models.llama import LlamaConfig

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
@@ -22,7 +23,6 @@ from colossalai.moe.layers import apply_load_balance
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device


 def move_to_cuda(batch, device):
@@ -61,7 +61,9 @@ class RandomDataset(Dataset):
     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000, tokenizer=None):
         self.num_samples = num_samples
         self.max_length = max_length
-        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
         self.attention_mask = torch.ones_like(self.input_ids)

     def __len__(self):
@@ -14,12 +14,12 @@ from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper
 from torch.utils.data import DataLoader, Dataset

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn import HybridAdam
-from colossalai.utils import get_current_device

 # constants

@@ -159,7 +159,11 @@ if args.distplan == "colossalai":
     logger.info(f"plugin: {plugin}")
     booster = Booster(plugin=plugin, **booster_kwargs)

-    ctx = LazyInitContext(default_device=get_current_device()) if args.plugin == "gemini" else nullcontext()
+    ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if args.plugin == "gemini"
+        else nullcontext()
+    )

     with ctx:
         model = PaLM(num_tokens=50304, dim=4096, depth=64)