[npu] change device to accelerator api (#5239)

* update accelerator

* fix timer

* fix amp

* update

* fix

* update bug

* add error raise

* fix autocast

* fix set device

* remove doc accelerator

* update doc

* update doc

* update doc

* use nullcontext

* update cpu

* update null context

* change time limit for example

* update

* update

* update

* update

* [npu] polish accelerator code

---------

Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
This commit is contained in:
Hongxin Liu
2024-01-09 10:20:05 +08:00
committed by GitHub
parent dd2c28a323
commit d202cc28c0
128 changed files with 1773 additions and 868 deletions

View File

@@ -16,10 +16,10 @@ from utils.global_vars import get_tensorboard_writer, get_timers, set_global_var
from utils.logger import Logger
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.context import ParallelMode
from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
from colossalai.tensor import ProcessGroup, ShardSpec
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
@@ -53,7 +53,7 @@ def main():
set_global_variables(launch_time, args.tensorboard_path)
world_size = torch.distributed.get_world_size()
get_current_device()
get_accelerator().get_current_device()
# build model, optimizer and criterion
if args.distplan.startswith("CAI"):
@@ -67,7 +67,10 @@ def main():
# build GPT model
with ColoInitContext(
device=get_current_device(), dtype=torch.half, default_dist_spec=default_dist_spec, default_pg=shard_pg
device=get_accelerator().get_current_device(),
dtype=torch.half,
default_dist_spec=default_dist_spec,
default_pg=shard_pg,
):
config, model, numel = get_model(args, logger)
@@ -78,7 +81,7 @@ def main():
elif args.distplan == "CAI_Gemini":
gemini_config = dict(
strict_ddp_mode=args.tp_degree == 1,
device=get_current_device(),
device=get_accelerator().get_current_device(),
placement_policy=args.placement,
pin_memory=True,
hidden_dim=model.config.hidden_size,

View File

@@ -20,11 +20,11 @@ from tqdm.auto import tqdm
from transformers import AutoTokenizer, PretrainedConfig
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
disable_existing_loggers()
logger = get_dist_logger()
@@ -386,7 +386,7 @@ def main(args):
cur_class_images = len(list(class_images_dir.iterdir()))
if cur_class_images < args.num_class_images:
torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
torch_dtype = torch.float16 if get_accelerator().get_current_device() == "cuda" else torch.float32
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
torch_dtype=torch_dtype,
@@ -401,7 +401,7 @@ def main(args):
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
pipeline.to(get_current_device())
pipeline.to(get_accelerator().get_current_device())
for example in tqdm(
sample_dataloader,
@@ -578,8 +578,8 @@ def main(args):
# Move text_encode and vae to gpu.
# For mixed precision training we cast the text_encoder and vae weights to half-precision
# as these models are only used for inference, keeping weights in full precision is not required.
vae.to(get_current_device(), dtype=weight_dtype)
text_encoder.to(get_current_device(), dtype=weight_dtype)
vae.to(get_accelerator().get_current_device(), dtype=weight_dtype)
text_encoder.to(get_accelerator().get_current_device(), dtype=weight_dtype)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(len(train_dataloader))
@@ -613,7 +613,7 @@ def main(args):
torch.cuda.reset_peak_memory_stats()
# Move batch to gpu
for key, value in batch.items():
batch[key] = value.to(get_current_device(), non_blocking=True)
batch[key] = value.to(get_accelerator().get_current_device(), non_blocking=True)
# Convert images to latent space
optimizer.zero_grad()

View File

@@ -21,13 +21,13 @@ from tqdm.auto import tqdm
from transformers import AutoTokenizer, PretrainedConfig
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
disable_existing_loggers()
logger = get_dist_logger()
@@ -385,7 +385,7 @@ def main(args):
cur_class_images = len(list(class_images_dir.iterdir()))
if cur_class_images < args.num_class_images:
torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
torch_dtype = torch.float16 if get_accelerator().get_current_device() == "cuda" else torch.float32
pipeline = DiffusionPipeline.from_pretrained(
args.pretrained_model_name_or_path,
torch_dtype=torch_dtype,
@@ -400,7 +400,7 @@ def main(args):
sample_dataset = PromptDataset(args.class_prompt, num_new_images)
sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
pipeline.to(get_current_device())
pipeline.to(get_accelerator().get_current_device())
for example in tqdm(
sample_dataloader,
@@ -598,8 +598,8 @@ def main(args):
# Move text_encode and vae to gpu.
# For mixed precision training we cast the text_encoder and vae weights to half-precision
# as these models are only used for inference, keeping weights in full precision is not required.
vae.to(get_current_device(), dtype=weight_dtype)
text_encoder.to(get_current_device(), dtype=weight_dtype)
vae.to(get_accelerator().get_current_device(), dtype=weight_dtype)
text_encoder.to(get_accelerator().get_current_device(), dtype=weight_dtype)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(len(train_dataloader))
@@ -633,7 +633,7 @@ def main(args):
torch.cuda.reset_peak_memory_stats()
# Move batch to gpu
for key, value in batch.items():
batch[key] = value.to(get_current_device(), non_blocking=True)
batch[key] = value.to(get_accelerator().get_current_device(), non_blocking=True)
# Convert images to latent space
optimizer.zero_grad()

View File

@@ -13,12 +13,12 @@ from torch.utils.data import DataLoader
from tqdm import tqdm
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
@@ -53,8 +53,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
model.eval()
correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
for images, labels in test_dataloader:
images = images.cuda()
labels = labels.cuda()

View File

@@ -33,9 +33,10 @@ def get_data_batch(batch_size, num_labels, num_channels=3, height=224, width=224
def colo_memory_cap(size_in_GB):
from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
from colossalai.accelerator import get_accelerator
from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction
cuda_capacity = colo_device_memory_capacity(get_current_device())
cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
if size_in_GB * (1024**3) < cuda_capacity:
colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
print(f"Limiting GPU memory usage to {size_in_GB} GB")

View File

@@ -6,10 +6,9 @@ import torch.distributed as dist
import transformers
import colossalai
import colossalai.utils.device as device_utils
from colossalai.accelerator import get_accelerator
from colossalai.inference import InferenceEngine
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
from colossalai.utils.device import get_current_device
GIGABYTE = 1024**3
MEGABYTE = 1024 * 1024
@@ -52,7 +51,7 @@ CONFIG_MAP = {
def data_gen(batch_size: int = 4, seq_len: int = 512):
input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_current_device())
input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_accelerator().get_current_device())
attention_mask = torch.ones_like(input_ids)
data = dict(input_ids=input_ids, attention_mask=attention_mask)
return data
@@ -97,9 +96,9 @@ def print_details_info(outputs, model_config, args, whole_end2end):
msg += f"Flops: {num_parameters * num_bytes / whole_avg_latency / 1e12:.2f} TFLOPS\n"
if torch.cuda.is_available():
msg += f"-------Memory Summary Device:{device_utils.current_device()}-------\n"
msg += f"Max memory allocated: {device_utils.max_memory_allocated() / GIGABYTE:.2f} GB\n"
msg += f"Max memory reserved: {device_utils.max_memory_reserved() / GIGABYTE:.2f} GB\n"
msg += f"-------Memory Summary Device:{get_accelerator().current_device()}-------\n"
msg += f"Max memory allocated: {get_accelerator().max_memory_allocated() / GIGABYTE:.2f} GB\n"
msg += f"Max memory reserved: {get_accelerator().max_memory_reserved() / GIGABYTE:.2f} GB\n"
print(msg)

View File

@@ -5,9 +5,9 @@ import torch.distributed as dist
from transformers import LlamaForCausalLM, LlamaTokenizer
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.inference import InferenceEngine
from colossalai.testing import spawn
from colossalai.utils.device import get_current_device
INPUT_TEXTS = [
"What is the longest river in the world?",
@@ -57,7 +57,7 @@ def run_inference(args):
)
inputs = tokenizer(INPUT_TEXTS, return_tensors="pt", padding="longest", max_length=max_input_len, truncation=True)
inputs = {k: v.to(get_current_device()) for k, v in inputs.items()}
inputs = {k: v.to(get_accelerator().get_current_device()) for k, v in inputs.items()}
outputs = engine.generate(inputs)
if rank == 0:

View File

@@ -18,11 +18,11 @@ from transformers import (
)
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
@@ -59,7 +59,7 @@ def evaluate_model(
use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
accum_loss = torch.zeros(1, device=get_current_device())
accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
for batch in dataloader:
batch = move_to_cuda(batch)
labels = batch["labels"]
@@ -88,8 +88,10 @@ def evaluate_model(
object_list = [None, None]
dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)
metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
accum_loss.add_(object_list[1].to(get_current_device()))
metric.add_batch(
predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
)
accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))
else:
batch = move_to_cuda(batch)

View File

@@ -7,13 +7,13 @@ from model_zoo import GPTLMLoss, get_gpt2_components
from torch.utils._pytree import tree_map
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
from colossalai.auto_parallel.offload.solver import NOT_NVML
from colossalai.fx.profiler import parameter_size
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import spawn
from colossalai.utils import get_current_device
def parse_args():
@@ -41,7 +41,7 @@ def train_gpt(args):
64,
8,
),
device=get_current_device(),
device=get_accelerator().get_current_device(),
)
criterion = GPTLMLoss()

View File

@@ -12,12 +12,12 @@ from commons.utils import get_data, get_profile_context, get_tflops, get_time_st
from packaging import version
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.lazy import LazyInitContext
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
CAI_VERSION = colossalai.__version__
@@ -141,7 +141,11 @@ def main():
criterion = GPTLMLoss()
torch.manual_seed(123)
if args.distplan.startswith("CAI"):
ctx = LazyInitContext(default_device=get_current_device()) if args.distplan == "CAI_Gemini" else nullcontext()
ctx = (
LazyInitContext(default_device=get_accelerator().get_current_device())
if args.distplan == "CAI_Gemini"
else nullcontext()
)
# build GPT model
with ctx:
model = model_builder(args.model_type)(checkpoint=True)

View File

@@ -13,11 +13,11 @@ from tqdm import tqdm
from transformers import AutoConfig, GPT2ForSequenceClassification, get_linear_schedule_with_warmup
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
@@ -54,7 +54,7 @@ def evaluate_model(
use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
accum_loss = torch.zeros(1, device=get_current_device())
accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
for batch in dataloader:
batch = move_to_cuda(batch)
labels = batch["labels"]
@@ -83,8 +83,10 @@ def evaluate_model(
object_list = [None, None]
dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)
metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
accum_loss.add_(object_list[1].to(get_current_device()))
metric.add_batch(
predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
)
accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))
else:
batch = move_to_cuda(batch)

View File

@@ -5,6 +5,7 @@ from torch import nn as nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from colossalai.accelerator import get_accelerator
from colossalai.legacy.context import ParallelMode, seed
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.nn.layer.base_layer import ParallelLayer
@@ -12,7 +13,6 @@ from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_b
from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
from colossalai.legacy.nn.layer.utils import divide
from colossalai.legacy.registry import LAYERS, LOSSES
from colossalai.utils import get_current_device
class VocabParallelEmbedding(torch.nn.Module):
@@ -96,7 +96,9 @@ class VocabParallelEmbedding(torch.nn.Module):
if position_ids is not None:
position_ids = position_ids.view(-1, input_shape[-1])
if position_ids is None:
position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
position_ids = torch.arange(
0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
)
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
position_embeddings = self.position_embeddings(position_ids)
@@ -194,7 +196,7 @@ class VocabParallelEmbedding1D(torch.nn.Module):
self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
# Allocate weights and initialize.
factory_kwargs = {"device": get_current_device(), "dtype": dtype}
factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
self.weight = Parameter(torch.empty(self.num_embeddings_per_partition, self.embedding_dim, **factory_kwargs))
init.uniform_(self.weight, -1, 1)
@@ -439,7 +441,9 @@ class HiddenParallelEmbedding(torch.nn.Module):
if position_ids is not None:
position_ids = position_ids.view(-1, input_shape[-1])
if position_ids is None:
position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
position_ids = torch.arange(
0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
)
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
position_embeddings = self.position_embeddings(position_ids)
@@ -532,7 +536,7 @@ class HiddenParallelEmbedding1D(torch.nn.Module):
self._weight = None
# Allocate weights and initialize.
factory_kwargs = {"device": get_current_device(), "dtype": dtype}
factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
self.weight = Parameter(torch.empty(num_embeddings, embed_dim_per_partition, **factory_kwargs))
init.uniform_(self.weight, -1, 1)

View File

@@ -13,13 +13,12 @@ from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM
import colossalai
import colossalai.utils.device as device_utils
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.lazy import LazyInitContext
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Constants
@@ -166,7 +165,7 @@ def main():
# Initialize Model and Optimizer
# ==============================
init_ctx = (
LazyInitContext(default_device=get_current_device())
LazyInitContext(default_device=get_accelerator().get_current_device())
if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
else nullcontext()
)
@@ -197,7 +196,9 @@ def main():
torch.set_default_dtype(torch.bfloat16)
model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
torch.set_default_dtype(torch.float)
coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
coordinator.print_on_master(
f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
)
coordinator.print_on_master(
f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
)
@@ -223,7 +224,7 @@ def main():
performance_evaluator.on_step_end(**batch)
performance_evaluator.on_fit_end()
coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated()/1024**2:.2f} MB")
coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")
if __name__ == "__main__":

View File

@@ -8,7 +8,7 @@ from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import DataLoader, Dataset, DistributedSampler
from colossalai.utils import get_current_device
from colossalai.accelerator import get_accelerator
class StatefulDistributedSampler(DistributedSampler):
@@ -108,7 +108,9 @@ class RandomDataset(Dataset):
def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
self.num_samples = num_samples
self.max_length = max_length
self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
self.input_ids = torch.randint(
0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
)
self.attention_mask = torch.ones_like(self.input_ids)
def __len__(self):

View File

@@ -21,13 +21,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers.models.llama.tokenization_llama import LlamaTokenizer
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
from colossalai.cluster import DistCoordinator
from colossalai.lazy import LazyInitContext
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
def get_model_numel(model: nn.Module) -> int:
@@ -191,7 +191,9 @@ def main():
config = LlamaConfig.from_pretrained(args.model_path)
# use lazy init when using GeminiPlugin
init_ctx = (
LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
LazyInitContext(default_device=get_accelerator().get_current_device())
if isinstance(plugin, GeminiPlugin)
else nullcontext()
)
with init_ctx:

View File

@@ -5,9 +5,8 @@ import torch
import torch.distributed as dist
from torch import Tensor
import colossalai.utils.device as device_utils
from colossalai.accelerator import get_accelerator
from colossalai.cluster import DistCoordinator
from colossalai.utils.device import get_current_device
def divide(x: float, y: float) -> float:
@@ -22,7 +21,7 @@ def divide(x: float, y: float) -> float:
def all_reduce_mean(x: float, world_size: int) -> float:
if world_size == 1:
return x
tensor = torch.tensor([x], device=get_current_device())
tensor = torch.tensor([x], device=get_accelerator().get_current_device())
dist.all_reduce(tensor)
tensor = tensor / world_size
return tensor.item()
@@ -86,13 +85,13 @@ class PerformanceEvaluator:
self.disable = self.ignore_steps > 0 and step < self.ignore_steps
if self.disable:
return
device_utils.synchronize()
get_accelerator().synchronize()
self.timer.start()
def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
if self.disable:
return
device_utils.synchronize()
get_accelerator().synchronize()
self.timer.end()
batch_size, seq_len = input_ids.shape

View File

@@ -20,13 +20,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers.models.llama.tokenization_llama import LlamaTokenizer
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
from colossalai.cluster import DistCoordinator
from colossalai.lazy import LazyInitContext
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
MODEL_CONFIGS = {
"7b": LlamaConfig(max_position_embeddings=4096),
@@ -227,7 +227,9 @@ def main():
config = MODEL_CONFIGS[args.config]
# use lazy init when using GeminiPlugin
init_ctx = (
LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
LazyInitContext(default_device=get_accelerator().get_current_device())
if isinstance(plugin, GeminiPlugin)
else nullcontext()
)
with init_ctx:

View File

@@ -14,6 +14,7 @@ from transformers.models.llama import LlamaConfig
from utils import PerformanceEvaluator, get_model_numel
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator
@@ -21,7 +22,6 @@ from colossalai.moe.layers import apply_load_balance
from colossalai.moe.manager import MOE_MANAGER
from colossalai.moe.utils import skip_init
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
def move_to_cuda(batch, device):
@@ -64,13 +64,15 @@ class RandomDataset(Dataset):
)
self.input_ids.append(encode["input_ids"])
self.attention_mask.append(encode["attention_mask"])
self.input_ids = torch.cat(self.input_ids, dim=0).to(get_current_device())
self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_current_device())
self.input_ids = torch.cat(self.input_ids, dim=0).to(get_accelerator().get_current_device())
self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_accelerator().get_current_device())
repeat_times = num_samples // self.input_ids.shape[0] + 1
self.input_ids = self.input_ids.repeat(repeat_times, 1)[:num_samples]
self.attention_mask = self.attention_mask.repeat(repeat_times, 1)[:num_samples]
else:
self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
self.input_ids = torch.randint(
0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
)
self.attention_mask = torch.ones_like(self.input_ids)
def __len__(self):

View File

@@ -15,6 +15,7 @@ from transformers import T5Tokenizer
from transformers.models.llama import LlamaConfig
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator
@@ -22,7 +23,6 @@ from colossalai.moe.layers import apply_load_balance
from colossalai.moe.manager import MOE_MANAGER
from colossalai.moe.utils import skip_init
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
def move_to_cuda(batch, device):
@@ -61,7 +61,9 @@ class RandomDataset(Dataset):
def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000, tokenizer=None):
self.num_samples = num_samples
self.max_length = max_length
self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
self.input_ids = torch.randint(
0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
)
self.attention_mask = torch.ones_like(self.input_ids)
def __len__(self):

View File

@@ -14,12 +14,12 @@ from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper
from torch.utils.data import DataLoader, Dataset
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.lazy import LazyInitContext
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import HybridAdam
from colossalai.utils import get_current_device
# constants
@@ -159,7 +159,11 @@ if args.distplan == "colossalai":
logger.info(f"plugin: {plugin}")
booster = Booster(plugin=plugin, **booster_kwargs)
ctx = LazyInitContext(default_device=get_current_device()) if args.plugin == "gemini" else nullcontext()
ctx = (
LazyInitContext(default_device=get_accelerator().get_current_device())
if args.plugin == "gemini"
else nullcontext()
)
with ctx:
model = PaLM(num_tokens=50304, dim=4096, depth=64)

View File

@@ -13,12 +13,12 @@ from torch.utils.data import DataLoader
from tqdm import tqdm
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
@@ -53,8 +53,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
model.eval()
correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
for images, labels in test_dataloader:
images = images.cuda()
labels = labels.cuda()

View File

@@ -13,13 +13,13 @@ from torch.utils.data import DataLoader
from tqdm import tqdm
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
@@ -73,8 +73,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
model.eval()
correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
for images, labels in test_dataloader:
images = images.cuda()
labels = labels.cuda()

View File

@@ -12,11 +12,11 @@ from tqdm import tqdm
from transformers import AutoConfig, BertForSequenceClassification, get_linear_schedule_with_warmup
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
@@ -45,7 +45,7 @@ def evaluate(
model.eval()
def evaluate_subset(dataloader: DataLoader):
accum_loss = torch.zeros(1, device=get_current_device())
accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
for batch in dataloader:
batch = move_to_cuda(batch)
outputs = model(**batch)

View File

@@ -51,13 +51,13 @@ from transformers import (
from transformers.utils.versions import require_version
import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.tensor import ProcessGroup
from colossalai.legacy.utils import get_dataloader
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
from colossalai.zero import GeminiOptimizer
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
@@ -249,9 +249,9 @@ def parse_args():
def colo_memory_cap(size_in_GB):
from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction
cuda_capacity = colo_device_memory_capacity(get_current_device())
cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
if size_in_GB * (1024**3) < cuda_capacity:
colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
print("Using {} GB of GPU memory".format(size_in_GB))
@@ -265,7 +265,9 @@ class DummyDataloader:
self.vocab_size = vocab_size
def generate(self):
input_ids = torch.randint(0, self.vocab_size, (self.batch_size, self.seq_len), device=get_current_device())
input_ids = torch.randint(
0, self.vocab_size, (self.batch_size, self.seq_len), device=get_accelerator().get_current_device()
)
attention_mask = torch.ones_like(input_ids)
return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}
@@ -390,7 +392,7 @@ def main():
if args.init_in_cpu:
init_dev = torch.device("cpu")
else:
init_dev = get_current_device()
init_dev = get_accelerator().get_current_device()
cai_version = colossalai.__version__
logger.info(f"using Colossal-AI version {cai_version}")
@@ -439,7 +441,9 @@ def main():
except ImportError:
# this works for unreleased main branch, and this may be released on 0.2.9
from colossalai.zero import GeminiDDP
model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
model = GeminiDDP(
model, device=get_accelerator().get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True
)
elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
from colossalai.gemini import ChunkManager, GeminiManager