From 46503c35dd9342f943308ee451b62751f36bc961 Mon Sep 17 00:00:00 2001
From: Maruyama_Aya
Date: Thu, 1 Jun 2023 14:30:51 +0800
Subject: [PATCH 01/18] Modify torch version requirement to adapt torch 2.0

---
 colossalai/cli/launcher/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py
index 6411b4302..4bb749f9d 100644
--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
@@ -154,7 +154,7 @@ def get_launch_command(
         extra_launch_args = dict()

     torch_version = version.parse(torch.__version__)
-    assert torch_version.major == 1
+    assert torch_version.major >= 1

     if torch_version.minor < 9:
         cmd = [

From 60ec33bb183e410ace44435d45673d64fea080db Mon Sep 17 00:00:00 2001
From: Maruyama_Aya
Date: Fri, 2 Jun 2023 16:50:51 +0800
Subject: [PATCH 02/18] Add a new example of Dreambooth training using the
 booster API

---
 .../tutorial/new_api/dreambooth/README.md     | 113 +++
 .../tutorial/new_api/dreambooth/colossalai.sh |  17 +
 .../new_api/dreambooth/requirements.txt       |   7 +
 .../tutorial/new_api/dreambooth/test_ci.sh    |  23 +
 .../dreambooth/train_dreambooth_colossalai.py | 690 ++++++++++++++++++
 5 files changed, 850 insertions(+)
 create mode 100644 examples/tutorial/new_api/dreambooth/README.md
 create mode 100755 examples/tutorial/new_api/dreambooth/colossalai.sh
 create mode 100644 examples/tutorial/new_api/dreambooth/requirements.txt
 create mode 100644 examples/tutorial/new_api/dreambooth/test_ci.sh
 create mode 100644 examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py

diff --git a/examples/tutorial/new_api/dreambooth/README.md b/examples/tutorial/new_api/dreambooth/README.md
new file mode 100644
index 000000000..bd7e7707a
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/README.md
@@ -0,0 +1,113 @@
# [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) by [colossalai](https://github.com/hpcaitech/ColossalAI.git)

[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like Stable Diffusion given just a few (3-5) images of a subject.
The `train_dreambooth_colossalai.py` script shows how to implement the training procedure and adapt it for Stable Diffusion.

By keeping model data in both CPU and GPU memory and moving it to the computing device only when necessary, [Gemini](https://www.colossalai.org/docs/advanced_tutorials/meet_gemini), the Heterogeneous Memory Manager of [Colossal-AI](https://github.com/hpcaitech/ColossalAI), can break through the GPU memory wall by using GPU and CPU memory (CPU DRAM or NVMe SSD) at the same time. Moreover, the model scale can be increased further by combining heterogeneous training with other parallel approaches, such as data, tensor, and pipeline parallelism.

## Installation

To begin with, make sure your operating system has a CUDA version suitable for this training example (CUDA 11.6-11.8), and check that the installed module versions are compatible with one another.
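A quick way to verify the environment (a suggested sanity check, not part of the original example) is to print the PyTorch version, the CUDA version it was built against, and whether a GPU is visible:

```python
import torch

# Torch version, the CUDA build it was compiled against, and GPU availability.
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())
```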
Before running the scripts, make sure to install the library's training dependencies:

```bash
pip install -r requirements.txt
```

### Install [colossalai](https://github.com/hpcaitech/ColossalAI.git)

```bash
pip install colossalai
```

**From source**

```bash
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
python setup.py install
```

## Dataset for Teyvat BLIP captions
This dataset was used to train the [Teyvat characters text-to-image model](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion).

It contains BLIP-generated captions for character images from the [genshin-impact fandom wiki](https://genshin-impact.fandom.com/wiki/Character#Playable_Characters) and the [biligame wiki for genshin impact](https://wiki.biligame.com/ys/%E8%A7%92%E8%89%B2).

Each row of the dataset contains `image` and `text` keys. `image` is a PIL PNG of varying size, and `text` is the accompanying text caption. Only a train split is provided.

The `text` field includes the tags `Teyvat`, `Name`, `Element`, `Weapon`, `Region`, `Model type`, and `Description`; the `Description` is captioned with the [pre-trained BLIP model](https://github.com/salesforce/BLIP).

## New API
We have modified our previous implementation of Dreambooth with our new Booster API, which offers a more flexible and efficient way to train your model. The new API is more user-friendly and easy to use. You can find the new API in `train_dreambooth_colossalai.py`.
We also offer a shell script `test_ci.sh` for you to go through all our plugins for the booster.
For more information about the booster API, you can refer to https://colossalai.org/docs/basics/booster_api/.

## Training

We provide the script `colossalai.sh` to run the training task with colossalai. For instance, the training script for the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model can be modified as follows:

```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
export INSTANCE_DIR="path-to-instance-images"
export OUTPUT_DIR="path-to-save-model"

torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --instance_data_dir=$INSTANCE_DIR \
  --output_dir=$OUTPUT_DIR \
  --instance_prompt="a photo of sks dog" \
  --resolution=512 \
  --train_batch_size=1 \
  --learning_rate=5e-6 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --max_train_steps=400 \
  --placement="cuda"
```
- `MODEL_NAME` refers to the model you are training.
- `INSTANCE_DIR` refers to the path to your own instance images; set it before running.
- `OUTPUT_DIR` refers to the local path where the trained model is saved; make sure it points to a disk with enough space.
- `resolution` refers to the input resolution of the target model. Note: change `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.
- `placement` refers to the training strategy supported by Colossal-AI; the default is `"cuda"`, which loads all parameters into CUDA memory. `"cpu"` selects the CPU-offload strategy, while `"auto"` enables Gemini, both featured by Colossal-AI.

### Training with prior-preservation loss

Prior preservation is used to avoid overfitting and language drift. Refer to the paper to learn more about it. For prior preservation, we first generate images using the model with a class prompt and then use those images during training along with our data.
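The training script performs this generation automatically when `class_data_dir` contains fewer than `num_class_images` images. As a simplified sketch of what happens under the hood (condensed from `train_dreambooth_colossalai.py`; the model ID, prompt, count, and output directory are placeholders):

```python
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, safety_checker=None
)
pipeline.to("cuda")
pipeline.set_progress_bar_config(disable=True)

# Generate the class ("prior") images from the class prompt.
for i in range(200):  # e.g. num_class_images minus the images already present
    image = pipeline("a photo of dog").images[0]
    image.save(f"path-to-class-images/{i}.jpg")
```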
According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior preservation; 200-300 images work well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` images are present in `class_data_dir` at training time. The script above can then be modified as follows:

```bash
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
export INSTANCE_DIR="path-to-instance-images"
export CLASS_DIR="path-to-class-images"
export OUTPUT_DIR="path-to-save-model"

torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --instance_data_dir=$INSTANCE_DIR \
  --class_data_dir=$CLASS_DIR \
  --output_dir=$OUTPUT_DIR \
  --with_prior_preservation --prior_loss_weight=1.0 \
  --instance_prompt="a photo of sks dog" \
  --class_prompt="a photo of dog" \
  --resolution=512 \
  --train_batch_size=1 \
  --learning_rate=5e-6 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --max_train_steps=800 \
  --placement="cuda"
```


## Invitation to open-source contribution
Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing power, datasets, or models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!

You may contact us or participate in the following ways:
1. [Leave a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your support. Thanks!
2. Post an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) or submit a PR on GitHub, following the guidelines in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
3. Join the Colossal-AI community on
[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w)
and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
4. Send your official proposal to contact@hpcaitech.com.

Thanks so much to all of our amazing contributors!
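## Inference

`booster.save_model` stores the trained UNet as `diffusion_pytorch_model.bin` (with a copied `config.json`) under `OUTPUT_DIR`. A minimal inference sketch, assuming the checkpoint layout produced by the training script above (the prompt and paths are placeholders):

```python
import torch
from diffusers import DiffusionPipeline, UNet2DConditionModel

# Load the fine-tuned UNet saved by the training script (config.json + diffusion_pytorch_model.bin).
unet = UNet2DConditionModel.from_pretrained("path-to-save-model", torch_dtype=torch.float16)

# Rebuild the pipeline around the original base model, swapping in the trained UNet.
pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", unet=unet, torch_dtype=torch.float16
).to("cuda")

image = pipe("a photo of sks dog in a bucket", num_inference_steps=50, guidance_scale=7.5).images[0]
image.save("sks-dog.png")
```

Make sure to include the identifier used during training (here `sks`) in your prompt.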
diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh
new file mode 100755
index 000000000..7cf8b3a13
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/colossalai.sh
@@ -0,0 +1,17 @@
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
+DIFFUSERS_OFFLINE=1
+
+torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path="Your Pretrained Model Path" \
+  --instance_data_dir="Your Input Pics Path" \
+  --output_dir="path-to-save-model" \
+  --instance_prompt="your_prompt" \
+  --resolution=512 \
+  --plugin="gemini" \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --placement="cuda" \

diff --git a/examples/tutorial/new_api/dreambooth/requirements.txt b/examples/tutorial/new_api/dreambooth/requirements.txt
new file mode 100644
index 000000000..1ec828c63
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/requirements.txt
@@ -0,0 +1,7 @@
+diffusers>=0.5.0
+accelerate
+torchvision
+transformers>=4.21.0
+ftfy
+tensorboard
+modelcards

diff --git a/examples/tutorial/new_api/dreambooth/test_ci.sh b/examples/tutorial/new_api/dreambooth/test_ci.sh
new file mode 100644
index 000000000..68862c46c
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/test_ci.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -xe
+pip install -r requirements.txt
+
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
+DIFFUSERS_OFFLINE=1
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
+  torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+    --pretrained_model_name_or_path="Your Pretrained Model Path" \
+    --instance_data_dir="Your Input Pics Path" \
+    --output_dir="path-to-save-model" \
+    --instance_prompt="your prompt" \
+    --resolution=512 \
+    --plugin=$plugin \
+    --train_batch_size=1 \
+    --learning_rate=5e-6 \
+    --lr_scheduler="constant" \
+    --lr_warmup_steps=0 \
+    --num_class_images=200 \
+    --placement="cuda"
+done

diff --git a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
new file mode 100644
index 000000000..9da7cacb8
--- /dev/null
+++ b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py
@@ -0,0 +1,690 @@
+import argparse
+import hashlib
+import math
+import os
+from pathlib import Path
+from typing import Optional
+import shutil
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.optimization import get_scheduler
+from huggingface_hub import HfFolder, Repository, create_repo, whoami
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import colossalai
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext
+from colossalai.zero.gemini import get_static_torch_model
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+
+disable_existing_loggers()
+logger =
get_dist_logger() + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=args.revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + else: + raise ValueError(f"{model_class} is not supported.") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--externel_unet_path", + type=str, + default=None, + required=False, + help="Path to the externel unet model.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default="a photo of sks dog", + required=False, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=("Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt."), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=("The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution"), + ) + parser.add_argument( + "--placement", + type=str, + default="cpu", + help="Placement Policy for Gemini. Valid when using colossalai as dist plan.", + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=("Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. 
The images will be resized to the resolution first before cropping."), + ) + parser.add_argument("--train_batch_size", + type=int, + default=4, + help="Batch size (per device) for the training dataloader.") + parser.add_argument("--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images.") + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=('The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]'), + ) + parser.add_argument("--lr_warmup_steps", + type=int, + default=500, + help="Number of steps for the warmup in the lr scheduler.") + parser.add_argument("--use_8bit_adam", + action="store_true", + help="Whether or not to use 8-bit Adam from bitsandbytes.") + + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=("[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. 
Use this argument to override the accelerate config."), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + if args.class_data_dir is not None: + logger.warning("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + logger.warning("You need not use --class_prompt without --with_prior_preservation.") + + return args + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. + """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose([ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ]) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + padding="do_not_pad", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids + + if self.class_data_root: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_prompt_ids"] = self.tokenizer( + self.class_prompt, + padding="do_not_pad", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids + + return example + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
+ + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): + if token is None: + token = HfFolder.get_token() + if organization is None: + username = whoami(token)["name"] + return f"{username}/{model_id}" + else: + return f"{organization}/{model_id}" + + +def main(args): + if args.seed is None: + colossalai.launch_from_torch(config={}) + else: + colossalai.launch_from_torch(config={}, seed=args.seed) + + local_rank = gpc.get_local_rank(ParallelMode.DATA) + world_size = gpc.get_world_size(ParallelMode.DATA) + + if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + pipeline.to(get_current_device()) + + for example in tqdm( + sample_dataloader, + desc="Generating class images", + disable=not local_rank == 0, + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + + # Handle the repository creation + if local_rank == 0: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + create_repo(repo_name, exist_ok=True, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer + if args.tokenizer_name: + logger.info(f"Loading tokenizer from {args.tokenizer_name}", ranks=[0]) + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, + revision=args.revision, + use_fast=False, + ) + elif args.pretrained_model_name_or_path: + logger.info("Loading tokenizer from pretrained model", ranks=[0]) + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) + + # Load models and create wrapper for stable diffusion + + logger.info(f"Loading 
text_encoder from {args.pretrained_model_name_or_path}", ranks=[0]) + + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="text_encoder", + revision=args.revision, + ) + + logger.info(f"Loading AutoencoderKL from {args.pretrained_model_name_or_path}", ranks=[0]) + vae = AutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="vae", + revision=args.revision, + ) + + + if args.externel_unet_path is None: + logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) + unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, + subfolder="unet", + revision=args.revision, + low_cpu_mem_usage=False) + else: + logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0]) + unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path, + revision=args.revision, + low_cpu_mem_usage=False) + + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + + if args.scale_lr: + args.learning_rate = args.learning_rate * args.train_batch_size * world_size + + # Use Booster API to use Gemini/Zero with ColossalAI + + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) + + booster = Booster(plugin=plugin, **booster_kwargs) + + # config optimizer for colossalai zero + optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) + + # load noise_scheduler + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + + # prepare dataset + logger.info(f"Prepare dataset from {args.instance_data_dir}", ranks=[0]) + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + ) + + def collate_fn(examples): + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if args.with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = tokenizer.pad( + { + "input_ids": input_ids + }, + padding="max_length", + max_length=tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + batch = { + "input_ids": input_ids, + "pixel_values": pixel_values, + } + return batch + + train_dataloader = torch.utils.data.DataLoader(train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=collate_fn, + num_workers=1) + + # Scheduler and math around the number of training steps. 
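+    # Note: this script performs one optimizer update per batch (no gradient
+    # accumulation), so the number of update steps per epoch is simply
+    # len(train_dataloader); --max_train_steps and --num_train_epochs are then
+    # derived from each other below.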
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader)) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps, + num_training_steps=args.max_train_steps, + ) + weight_dtype = torch.float32 + if args.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif args.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move text_encode and vae to gpu. + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + vae.to(get_current_device(), dtype=weight_dtype) + text_encoder.to(get_current_device(), dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader)) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler) + + # Train! + total_batch_size = args.train_batch_size * world_size + + logger.info("***** Running training *****", ranks=[0]) + logger.info(f" Num examples = {len(train_dataset)}", ranks=[0]) + logger.info(f" Num batches each epoch = {len(train_dataloader)}", ranks=[0]) + logger.info(f" Num Epochs = {args.num_train_epochs}", ranks=[0]) + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}", ranks=[0]) + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0]) + logger.info(f" Total optimization steps = {args.max_train_steps}", ranks=[0]) + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(args.max_train_steps), disable=not local_rank == 0) + progress_bar.set_description("Steps") + global_step = 0 + + torch.cuda.synchronize() + for epoch in range(args.num_train_epochs): + unet.train() + for step, batch in enumerate(train_dataloader): + torch.cuda.reset_peak_memory_stats() + # Move batch to gpu + for key, value in batch.items(): + batch[key] = value.to(get_current_device(), non_blocking=True) + + # Convert images to latent space + optimizer.zero_grad() + + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. + model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean() + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. 
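+                # That is, total loss = instance MSE + prior_loss_weight * class MSE;
+                # the class ("prior") term keeps the fine-tuned model anchored to the
+                # class distribution, mitigating overfitting and language drift.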
+ loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + optimizer.backward(loss) + + optimizer.step() + lr_scheduler.step() + logger.info(f"max GPU_mem cost is {torch.cuda.max_memory_allocated()/2**20} MB", ranks=[0]) + # Checks if the accelerator has performed an optimization step behind the scenes + progress_bar.update(1) + global_step += 1 + logs = { + "loss": loss.detach().item(), + "lr": optimizer.param_groups[0]["lr"], + } # lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step % args.save_steps == 0: + torch.cuda.synchronize() + if local_rank == 0: + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) + if not os.path.exists(os.path.join(save_path, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) + logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) + if global_step >= args.max_train_steps: + break + torch.cuda.synchronize() + + booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin")) + logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}") + if local_rank == 0: + if not os.path.exists(os.path.join(args.output_dir, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir) + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) + +if __name__ == "__main__": + args = parse_args() + main(args) From 42e3232bc045aa7ea2fb690625d8baf588b80ed1 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Fri, 2 Jun 2023 17:00:57 +0800 Subject: [PATCH 03/18] roll back --- colossalai/cli/launcher/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/cli/launcher/run.py b/colossalai/cli/launcher/run.py index 4bb749f9d..6411b4302 100644 --- a/colossalai/cli/launcher/run.py +++ b/colossalai/cli/launcher/run.py @@ -154,7 +154,7 @@ def get_launch_command( extra_launch_args = dict() torch_version = version.parse(torch.__version__) - assert torch_version.major >= 1 + assert torch_version.major == 1 if torch_version.minor < 9: cmd = [ From 25447d44079de7be9083d07834d75b74f5ce8680 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Mon, 5 Jun 2023 11:47:07 +0800 Subject: [PATCH 04/18] modify path --- examples/tutorial/new_api/dreambooth/colossalai.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh index 7cf8b3a13..2745c563a 100755 --- a/examples/tutorial/new_api/dreambooth/colossalai.sh +++ b/examples/tutorial/new_api/dreambooth/colossalai.sh @@ -3,10 +3,10 @@ TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path="Your Pretrained Model Path" \ - --instance_data_dir="Your Input Pics Path" \ - --output_dir="path-to-save-model" \ - --instance_prompt="your_prompt" \ + --pretrained_model_name_or_path="Path_to_your_model" \ + --instance_data_dir="Path_to_your_training_image" \ + --output_dir="Path_to_your_save_dir" \ + --instance_prompt="keli" \ --resolution=512 \ --plugin="gemini" \ --train_batch_size=1 \ From 176010f2898cd4353313fc909bf4d2f5a65860a1 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Tue, 
6 Jun 2023 14:08:22 +0800
Subject: [PATCH 05/18] update performance evaluation

---
 examples/tutorial/new_api/dreambooth/README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/examples/tutorial/new_api/dreambooth/README.md b/examples/tutorial/new_api/dreambooth/README.md
index bd7e7707a..8e1fdbbc8 100644
--- a/examples/tutorial/new_api/dreambooth/README.md
+++ b/examples/tutorial/new_api/dreambooth/README.md
@@ -40,6 +40,9 @@ We have modified our previous implementation of Dreambooth with our new Booster
 We also offer a shell script `test_ci.sh` for you to go through all our plugins for the booster.
 For more information about the booster API, you can refer to https://colossalai.org/docs/basics/booster_api/.
 
+
+
 ## Training
 
 We provide the script `colossalai.sh` to run the training task with colossalai. For instance, the training script for the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model can be modified as follows:
 
@@ -97,7 +100,22 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \
   --placement="cuda"
 ```
 
+## Performance
+
+| Strategy       | #GPU | Batch Size | GPU RAM (GB) | Speedup |
+|:--------------:|:----:|:----------:|:------------:|:-------:|
+| Traditional    | 1    | 16         | OOM          | \       |
+| Traditional    | 1    | 8          | 61.81        | 1       |
+| torch_ddp      | 4    | 16         | OOM          | \       |
+| torch_ddp      | 4    | 8          | 41.97        | 0.97    |
+| gemini         | 4    | 16         | 53.29        | \       |
+| gemini         | 4    | 8          | 29.36        | 2.00    |
+| low_level_zero | 4    | 16         | 52.80        | \       |
+| low_level_zero | 4    | 8          | 28.87        | 2.02    |
+
+The evaluation is performed on 4 NVIDIA A100 GPUs with 80GB memory each, with GPUs 0 & 1 and 2 & 3 connected via NVLink.
+We finetuned the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model at 512x512 resolution on the [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset and compared
+the memory cost and the throughput of the plugins.
 
 ## Invitation to open-source contribution
 Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing power, datasets, or models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models!
From b56c7f428379843a29f690d237b9796747ecf339 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Tue, 6 Jun 2023 14:09:27 +0800 Subject: [PATCH 06/18] update shell file --- examples/tutorial/new_api/dreambooth/colossalai.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh index 2745c563a..77dfb1cbd 100755 --- a/examples/tutorial/new_api/dreambooth/colossalai.sh +++ b/examples/tutorial/new_api/dreambooth/colossalai.sh @@ -6,7 +6,7 @@ torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \ --pretrained_model_name_or_path="Path_to_your_model" \ --instance_data_dir="Path_to_your_training_image" \ --output_dir="Path_to_your_save_dir" \ - --instance_prompt="keli" \ + --instance_prompt="your prompt" \ --resolution=512 \ --plugin="gemini" \ --train_batch_size=1 \ From 1c1f71cbd2718feee7e6dbb472053664e26f1c8e Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Tue, 6 Jun 2023 14:51:11 +0800 Subject: [PATCH 07/18] fixing insecure hash function --- .../tutorial/new_api/dreambooth/train_dreambooth_colossalai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py index 9da7cacb8..5436e7d6b 100644 --- a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py +++ b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py @@ -397,7 +397,7 @@ def main(args): images = pipeline(example["prompt"]).images for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() + hash_image = hashlib.sha256(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) From b29e1f07224298aea35aab7ee83284beac28e0d8 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Tue, 6 Jun 2023 15:50:03 +0800 Subject: [PATCH 08/18] change directory --- examples/images/dreambooth/README.md | 23 + examples/images/dreambooth/colossalai.sh | 21 +- examples/images/dreambooth/test_ci.sh | 23 + .../dreambooth/train_dreambooth_colossalai.py | 93 ++- .../train_dreambooth_colossalai_lora.py | 120 +-- .../tutorial/new_api/dreambooth/README.md | 131 ---- .../tutorial/new_api/dreambooth/colossalai.sh | 17 - .../new_api/dreambooth/requirements.txt | 7 - .../tutorial/new_api/dreambooth/test_ci.sh | 23 - .../dreambooth/train_dreambooth_colossalai.py | 690 ------------------ 10 files changed, 180 insertions(+), 968 deletions(-) delete mode 100644 examples/tutorial/new_api/dreambooth/README.md delete mode 100755 examples/tutorial/new_api/dreambooth/colossalai.sh delete mode 100644 examples/tutorial/new_api/dreambooth/requirements.txt delete mode 100644 examples/tutorial/new_api/dreambooth/test_ci.sh delete mode 100644 examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py diff --git a/examples/images/dreambooth/README.md b/examples/images/dreambooth/README.md index 7c117d841..bfd865a6d 100644 --- a/examples/images/dreambooth/README.md +++ b/examples/images/dreambooth/README.md @@ -92,6 +92,29 @@ torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \ --placement="cuda" ``` +## New API +We have modified our previous implementation of Dreambooth with our new Booster API, which offers a more flexible and efficient way to train your model. The new API is more user-friendly and easy to use. 
You can find the new API in `train_dreambooth_colossalai.py`.
+We also offer a shell script `test_ci.sh` for you to go through all our plugins for the booster.
+For more information about the booster API, you can refer to https://colossalai.org/docs/basics/booster_api/.
+
+## Performance
+
+| Strategy       | #GPU | Batch Size | GPU RAM (GB) | Speedup |
+|:--------------:|:----:|:----------:|:------------:|:-------:|
+| Traditional    | 1    | 16         | OOM          | \       |
+| Traditional    | 1    | 8          | 61.81        | 1       |
+| torch_ddp      | 4    | 16         | OOM          | \       |
+| torch_ddp      | 4    | 8          | 41.97        | 0.97    |
+| gemini         | 4    | 16         | 53.29        | \       |
+| gemini         | 4    | 8          | 29.36        | 2.00    |
+| low_level_zero | 4    | 16         | 52.80        | \       |
+| low_level_zero | 4    | 8          | 28.87        | 2.02    |
+
+The evaluation is performed on 4 NVIDIA A100 GPUs with 80GB memory each, with GPUs 0 & 1 and 2 & 3 connected via NVLink.
+We finetuned the [stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) model at 512x512 resolution on the [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset and compared
+the memory cost and the throughput of the plugins.
+
 
 ## Inference
 Once you have trained a model using the above command, inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier` (e.g. `--instance_prompt="a photo of sks dog"` in the above example) in your prompt.

diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh
index 227d8b8bd..cfb00412a 100755
--- a/examples/images/dreambooth/colossalai.sh
+++ b/examples/images/dreambooth/colossalai.sh
@@ -1,20 +1,15 @@
-export MODEL_NAME=
-export INSTANCE_DIR=
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-HF_DATASETS_OFFLINE=1
-TRANSFORMERS_OFFLINE=1
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
 DIFFUSERS_OFFLINE=1
 
-torchrun --nproc_per_node 2 --master_port=25641 train_dreambooth_colossalai.py \
-  --pretrained_model_name_or_path=$MODEL_NAME \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of a dog" \
+torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+  --pretrained_model_name_or_path="Path_to_your_model" \
+  --instance_data_dir="Path_to_your_training_image" \
+  --output_dir="Path_to_your_save_dir" \
+  --instance_prompt="your prompt" \
   --resolution=512 \
+  --plugin="gemini" \
   --train_batch_size=1 \
-  --gradient_accumulation_steps=1 \
   --learning_rate=5e-6 \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \

diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index e69de29bb..68862c46c 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -xe
+pip install -r requirements.txt
+
+HF_DATASETS_OFFLINE=1
+TRANSFORMERS_OFFLINE=1
+DIFFUSERS_OFFLINE=1
+
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
+  torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \
+    --pretrained_model_name_or_path="Your Pretrained Model Path" \
+    --instance_data_dir="Your Input Pics Path" \
+    --output_dir="path-to-save-model" \
+    --instance_prompt="your prompt" \
+    --resolution=512 \
+    --plugin=$plugin \
+    --train_batch_size=1 \
+    --learning_rate=5e-6 \
+    --lr_scheduler="constant" \
+    --lr_warmup_steps=0 \
+    --num_class_images=200 \
+    --placement="cuda"
+done

diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py
b/examples/images/dreambooth/train_dreambooth_colossalai.py index d07febea0..5436e7d6b 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -4,6 +4,7 @@ import math import os from pathlib import Path from typing import Optional +import shutil import torch import torch.nn.functional as F @@ -21,9 +22,12 @@ import colossalai from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.nn.optimizer import HybridAdam from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext, GeminiAdamOptimizer +from colossalai.zero import ColoInitContext from colossalai.zero.gemini import get_static_torch_model +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin disable_existing_loggers() logger = get_dist_logger() @@ -58,6 +62,13 @@ def parse_args(input_args=None): required=True, help="Path to pretrained model or model identifier from huggingface.co/models.", ) + parser.add_argument( + "--externel_unet_path", + type=str, + default=None, + required=False, + help="Path to the externel unet model.", + ) parser.add_argument( "--revision", type=str, @@ -193,6 +204,12 @@ def parse_args(input_args=None): default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") parser.add_argument( "--logging_dir", type=str, @@ -339,18 +356,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: return f"{organization}/{model_id}" -# Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP - - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=64) - return model - - def main(args): if args.seed is None: colossalai.launch_from_torch(config={}) @@ -392,7 +397,7 @@ def main(args): images = pipeline(example["prompt"]).images for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() + hash_image = hashlib.sha256(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) @@ -452,12 +457,18 @@ def main(args): revision=args.revision, ) - logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) - with ColoInitContext(device=get_current_device()): + + if args.externel_unet_path is None: + logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, - low_cpu_mem_usage=False) + subfolder="unet", + revision=args.revision, + low_cpu_mem_usage=False) + else: + logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0]) + unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path, + revision=args.revision, + low_cpu_mem_usage=False) vae.requires_grad_(False) text_encoder.requires_grad_(False) @@ -468,10 +479,22 @@ def main(args): if args.scale_lr: 
args.learning_rate = args.learning_rate * args.train_batch_size * world_size - unet = gemini_zero_dpp(unet, args.placement) + # Use Booster API to use Gemini/Zero with ColossalAI + + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) + + booster = Booster(plugin=plugin, **booster_kwargs) # config optimizer for colossalai zero - optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) + optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) # load noise_scheduler noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") @@ -554,6 +577,8 @@ def main(args): # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler) + # Train! total_batch_size = args.train_batch_size * world_size @@ -642,36 +667,24 @@ def main(args): if global_step % args.save_steps == 0: torch.cuda.synchronize() - torch_unet = get_static_torch_model(unet) if local_rank == 0: - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=torch_unet, - revision=args.revision, - ) save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - pipeline.save_pretrained(save_path) + booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) + if not os.path.exists(os.path.join(save_path, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) if global_step >= args.max_train_steps: break - torch.cuda.synchronize() - unet = get_static_torch_model(unet) + booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin")) + logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}") if local_rank == 0: - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - unet=unet, - revision=args.revision, - ) - - pipeline.save_pretrained(args.output_dir) - logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0]) - + if not os.path.exists(os.path.join(args.output_dir, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py index 6715b473a..64cdd2a31 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py @@ -4,6 +4,7 @@ import math import os from pathlib import Path from typing import Optional +import shutil import torch import torch.nn.functional as F @@ -23,9 +24,12 @@ import colossalai from colossalai.context.parallel_mode import ParallelMode 
from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger +from colossalai.nn.optimizer import HybridAdam from colossalai.utils import get_current_device from colossalai.zero import ColoInitContext, GeminiAdamOptimizer from colossalai.zero.gemini import get_static_torch_model +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin disable_existing_loggers() logger = get_dist_logger() @@ -60,6 +64,13 @@ def parse_args(input_args=None): required=True, help="Path to pretrained model or model identifier from huggingface.co/models.", ) + parser.add_argument( + "--externel_unet_path", + type=str, + default=None, + required=False, + help="Path to the externel unet model.", + ) parser.add_argument( "--revision", type=str, @@ -195,6 +206,12 @@ def parse_args(input_args=None): default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], + help="plugin to use") parser.add_argument( "--logging_dir", type=str, @@ -341,18 +358,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: return f"{organization}/{model_id}" -# Gemini + ZeRO DDP -def gemini_zero_dpp(model: torch.nn.Module, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP - - model = GeminiDDP(model, - device=get_current_device(), - placement_policy=placement_policy, - pin_memory=True, - search_range_mb=64) - return model - - def main(args): if args.seed is None: colossalai.launch_from_torch(config={}) @@ -394,7 +399,7 @@ def main(args): images = pipeline(example["prompt"]).images for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() + hash_image = hashlib.sha256(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) @@ -454,32 +459,42 @@ def main(args): revision=args.revision, ) - logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) - with ColoInitContext(device=get_current_device()): + + if args.externel_unet_path is None: + logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, - low_cpu_mem_usage=False) - unet.requires_grad_(False) + subfolder="unet", + revision=args.revision, + low_cpu_mem_usage=False) + else: + logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0]) + unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path, + revision=args.revision, + low_cpu_mem_usage=False) + unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, + subfolder="unet", + revision=args.revision, + low_cpu_mem_usage=False) + unet.requires_grad_(False) - # Set correct lora layers - lora_attn_procs = {} - for name in unet.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = unet.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = 
list(reversed(unet.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = unet.config.block_out_channels[block_id] + # Set correct lora layers + lora_attn_procs = {} + for name in unet.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim) - unet.set_attn_processor(lora_attn_procs) - lora_layers = AttnProcsLayers(unet.attn_processors) + unet.set_attn_processor(lora_attn_procs) + lora_layers = AttnProcsLayers(unet.attn_processors) vae.requires_grad_(False) text_encoder.requires_grad_(False) @@ -490,10 +505,22 @@ def main(args): if args.scale_lr: args.learning_rate = args.learning_rate * args.train_batch_size * world_size - unet = gemini_zero_dpp(unet, args.placement) + # Use Booster API to use Gemini/Zero with ColossalAI + + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) + + booster = Booster(plugin=plugin, **booster_kwargs) # config optimizer for colossalai zero - optimizer = GeminiAdamOptimizer(unet, lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) + optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) # load noise_scheduler noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") @@ -576,6 +603,8 @@ def main(args): # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler) + # Train! 
total_batch_size = args.train_batch_size * world_size @@ -664,27 +693,24 @@ def main(args): if global_step % args.save_steps == 0: torch.cuda.synchronize() - torch_unet = get_static_torch_model(unet) if local_rank == 0: save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - torch_unet = torch_unet.to(torch.float32) - torch_unet.save_attn_procs(save_path) + booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) + if not os.path.exists(os.path.join(save_path, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) if global_step >= args.max_train_steps: break - torch.cuda.synchronize() - torch_unet = get_static_torch_model(unet) + booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin")) + logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}") if local_rank == 0: - torch_unet = torch_unet.to(torch.float32) - torch_unet.save_attn_procs(save_path) - logger.info(f"Saving model checkpoint to {args.output_dir}", ranks=[0]) - + if not os.path.exists(os.path.join(args.output_dir, "config.json")): + shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/tutorial/new_api/dreambooth/README.md b/examples/tutorial/new_api/dreambooth/README.md deleted file mode 100644 index 8e1fdbbc8..000000000 --- a/examples/tutorial/new_api/dreambooth/README.md +++ /dev/null @@ -1,131 +0,0 @@ -# [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) by [colossalai](https://github.com/hpcaitech/ColossalAI.git) - -[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject. -The `train_dreambooth_colossalai.py` script shows how to implement the training procedure and adapt it for stable diffusion. - -By accommodating model data in CPU and GPU and moving the data to the computing device when necessary, [Gemini](https://www.colossalai.org/docs/advanced_tutorials/meet_gemini), the Heterogeneous Memory Manager of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) can breakthrough the GPU memory wall by using GPU and CPU memory (composed of CPU DRAM or nvme SSD memory) together at the same time. Moreover, the model scale can be further improved by combining heterogeneous training with the other parallel approaches, such as data parallel, tensor parallel and pipeline parallel. - -## Installation - -To begin with, make sure your operating system has the cuda version suitable for this exciting training session, which is cuda11.6-11.8. Notice that you may want to make sure the module versions suitable for the whole environment. 
Before running the scripts, make sure to install the library's training dependencies: - -```bash -pip install -r requirements.txt -``` - -### Install [colossalai](https://github.com/hpcaitech/ColossalAI.git) - -```bash -pip install colossalai -``` - -**From source** - -```bash -git clone https://github.com/hpcaitech/ColossalAI.git -python setup.py install -``` - -## Dataset for Teyvat BLIP captions -Dataset used to train [Teyvat characters text to image model](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion). - -BLIP generated captions for characters images from [genshin-impact fandom wiki](https://genshin-impact.fandom.com/wiki/Character#Playable_Characters)and [biligame wiki for genshin impact](https://wiki.biligame.com/ys/%E8%A7%92%E8%89%B2). - -For each row the dataset contains `image` and `text` keys. `image` is a varying size PIL png, and `text` is the accompanying text caption. Only a train split is provided. - -The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Model type`, and `Description`, the `Description` is captioned with the [pre-trained BLIP model](https://github.com/salesforce/BLIP). - -## New API -We have modified our previous implementation of Dreambooth with our new Booster API, which offers a more flexible and efficient way to train your model. The new API is more user-friendly and easy to use. You can find the new API in `train_dreambooth_colossalai.py`. -We have also offer a shell script `test_ci.sh` for you to go through all our plugins for the booster. -For more information about the booster API you can refer to https://colossalai.org/docs/basics/booster_api/. - - - - -## Training - -We provide the script `colossalai.sh` to run the training task with colossalai. For instance, the script of training process for [stable-diffusion-v1-4] model can be modified into: - -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" -export OUTPUT_DIR="path-to-save-model" - -torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --instance_prompt="a photo of sks dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=400 \ - --placement="cuda" -``` -- `MODEL_NAME` refers to the model you are training. -- `INSTANCE_DIR` refers to personalized path to instance images, you might need to insert information here. -- `OUTPUT_DIR` refers to local path to save the trained model, you might need to find a path with enough space. -- `resolution` refers to the corresponding resolution number of your target model. Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. -- `placement` refers to the training strategy supported by Colossal AI, default = 'cuda', which refers to loading all the parameters into cuda memory. On the other hand, 'cpu' refers to 'cpu offload' strategy while 'auto' enables 'Gemini', both featured by Colossal AI. - -### Training with prior-preservation loss - -Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data. 
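For reference, the objective that `--with_prior_preservation` turns on reduces to a weighted sum of the instance loss and the prior (class) loss. The sketch below mirrors the loss computation in `train_dreambooth_colossalai.py`; the helper name and standalone framing are ours, not part of the patch.

```python
import torch
import torch.nn.functional as F


def prior_preservation_loss(model_pred: torch.Tensor,
                            target: torch.Tensor,
                            prior_loss_weight: float = 1.0) -> torch.Tensor:
    """Combined DreamBooth loss when prior preservation is enabled.

    The collate function concatenates instance and class examples along the
    batch dimension, so the first half of the batch holds instance data and
    the second half holds class (prior) data.
    """
    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
    target, target_prior = torch.chunk(target, 2, dim=0)

    # Instance loss: per-image MSE, averaged over channel/spatial dims first.
    instance_loss = F.mse_loss(model_pred.float(), target.float(),
                               reduction="none").mean([1, 2, 3]).mean()
    # Prior loss keeps the fine-tuned model close to the original class
    # distribution, mitigating overfitting and language drift.
    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(),
                            reduction="mean")
    return instance_loss + prior_loss_weight * prior_loss
```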
- -According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time. The general script can be then modified as the following. - -```bash -export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" -export CLASS_DIR="path-to-class-images" -export OUTPUT_DIR="path-to-save-model" - -torchrun --nproc_per_node 2 train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --class_data_dir=$CLASS_DIR \ - --output_dir=$OUTPUT_DIR \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=800 \ - --placement="cuda" -``` - -## Performance - -| Strategy | #GPU | Batch Size | GPU RAM(GB) | speedup | -|:--------------:|:----:|:----------:|:-----------:|:-------:| -| Traditional | 1 | 16 | oom | \ | -| Traditional | 1 | 8 | 61.81 | 1 | -| torch_ddp | 4 | 16 | oom | \ | -| torch_ddp | 4 | 8 | 41.97 | 0.97 | -| gemini | 4 | 16 | 53.29 | \ | -| gemini | 4 | 8 | 29.36 | 2.00 | -| low_level_zero | 4 | 16 | 52.80 | \ | -| low_level_zero | 4 | 8 | 28.87 | 2.02 | - -The evaluation is performed on 4 Nvidia A100 GPUs with 80GB memory each, with GPU 0 & 1, 2 & 3 connected with NVLink. -We finetuned the [stable-diffusion-v1-4](https://huggingface.co/stabilityai/stable-diffusion-v1-4) model with 512x512 resolution on the [Teyvat](https://huggingface.co/datasets/Fazzie/Teyvat) dataset and compared -the memory cost and the throughput for the plugins. - -## Invitation to open-source contribution -Referring to the successful attempts of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing powers, datasets, models are welcome to join and build the Colossal-AI community, making efforts towards the era of big AI models! - -You may contact us or participate in the following ways: -1. [Leaving a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your like and support. Thanks! -2. Posting an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose), or submitting a PR on GitHub follow the guideline in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md). -3. Join the Colossal-AI community on -[Slack](https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-z7b26eeb-CBp7jouvu~r0~lcFzX832w), -and [WeChat(微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas. -4. Send your official proposal to email contact@hpcaitech.com - -Thanks so much to all of our amazing contributors! 
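Stepping back from the per-file hunks: the change this series makes everywhere is the move from the ad-hoc `gemini_zero_dpp` wrapper and `GeminiAdamOptimizer` to the Booster API with `HybridAdam`. The sketch below condenses that pattern; the `build_booster` helper and its default arguments are ours, while the branch logic and constructor calls follow the diffs above.

```python
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin


def build_booster(plugin_name: str, placement: str = "auto") -> Booster:
    # Map the --plugin flag onto a Booster plugin, as the patched scripts do.
    booster_kwargs = {}
    if plugin_name == "torch_ddp_fp16":
        booster_kwargs["mixed_precision"] = "fp16"
    if plugin_name.startswith("torch_ddp"):
        plugin = TorchDDPPlugin()
    elif plugin_name == "gemini":
        plugin = GeminiPlugin(placement_policy=placement,
                              strict_ddp_mode=True,
                              initial_scale=2 ** 5)
    elif plugin_name == "low_level_zero":
        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
    else:
        raise ValueError(f"unknown plugin: {plugin_name!r}")
    return Booster(plugin=plugin, **booster_kwargs)


# Usage, following the training scripts: wrap model/optimizer/scheduler once,
# then save through the booster so sharded (Gemini/ZeRO) weights are gathered.
#   booster = build_booster(args.plugin, args.placement)
#   optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate,
#                          initial_scale=2 ** 5, clipping_norm=args.max_grad_norm)
#   unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer,
#                                                       lr_scheduler=lr_scheduler)
#   ...training loop...
#   booster.save_model(unet, "diffusion_pytorch_model.bin")
```

Note that the later "fixed model saving bugs" commit moves `booster.save_model` out of the `if local_rank == 0:` guard, which is consistent with sharded saving being a collective operation that every rank must enter; only the filesystem-side work (copying `config.json`) stays rank-0-only.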
diff --git a/examples/tutorial/new_api/dreambooth/colossalai.sh b/examples/tutorial/new_api/dreambooth/colossalai.sh deleted file mode 100755 index 77dfb1cbd..000000000 --- a/examples/tutorial/new_api/dreambooth/colossalai.sh +++ /dev/null @@ -1,17 +0,0 @@ -HF_DATASETS_OFFLINE=1 -TRANSFORMERS_OFFLINE=1 -DIFFUSERS_OFFLINE=1 - -torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path="Path_to_your_model" \ - --instance_data_dir="Path_to_your_training_image" \ - --output_dir="Path_to_your_save_dir" \ - --instance_prompt="your prompt" \ - --resolution=512 \ - --plugin="gemini" \ - --train_batch_size=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --num_class_images=200 \ - --placement="cuda" \ diff --git a/examples/tutorial/new_api/dreambooth/requirements.txt b/examples/tutorial/new_api/dreambooth/requirements.txt deleted file mode 100644 index 1ec828c63..000000000 --- a/examples/tutorial/new_api/dreambooth/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -diffusers>==0.5.0 -accelerate -torchvision -transformers>=4.21.0 -ftfy -tensorboard -modelcards diff --git a/examples/tutorial/new_api/dreambooth/test_ci.sh b/examples/tutorial/new_api/dreambooth/test_ci.sh deleted file mode 100644 index 68862c46c..000000000 --- a/examples/tutorial/new_api/dreambooth/test_ci.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -set -xe -pip install -r requirements.txt - -HF_DATASETS_OFFLINE=1 -TRANSFORMERS_OFFLINE=1 -DIFFUSERS_OFFLINE=1 - -for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do - torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path="Your Pretrained Model Path" \ - --instance_data_dir="Your Input Pics Path" \ - --output_dir="path-to-save-model" \ - --instance_prompt="your prompt" \ - --resolution=512 \ - --plugin=$plugin \ - --train_batch_size=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --num_class_images=200 \ - --placement="cuda" -done diff --git a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py b/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py deleted file mode 100644 index 5436e7d6b..000000000 --- a/examples/tutorial/new_api/dreambooth/train_dreambooth_colossalai.py +++ /dev/null @@ -1,690 +0,0 @@ -import argparse -import hashlib -import math -import os -from pathlib import Path -from typing import Optional -import shutil - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel -from diffusers.optimization import get_scheduler -from huggingface_hub import HfFolder, Repository, create_repo, whoami -from PIL import Image -from torch.utils.data import Dataset -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -import colossalai -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.optimizer import HybridAdam -from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext -from colossalai.zero.gemini import get_static_torch_model -from colossalai.booster import Booster -from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin - -disable_existing_loggers() 
-logger = get_dist_logger() - - -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): - text_encoder_config = PretrainedConfig.from_pretrained( - pretrained_model_name_or_path, - subfolder="text_encoder", - revision=args.revision, - ) - model_class = text_encoder_config.architectures[0] - - if model_class == "CLIPTextModel": - from transformers import CLIPTextModel - - return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": - from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation - - return RobertaSeriesModelWithTransformation - else: - raise ValueError(f"{model_class} is not supported.") - - -def parse_args(input_args=None): - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--externel_unet_path", - type=str, - default=None, - required=False, - help="Path to the externel unet model.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help="Revision of pretrained model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--instance_data_dir", - type=str, - default=None, - required=True, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default="a photo of sks dog", - required=False, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") - parser.add_argument( - "--num_class_images", - type=int, - default=100, - help=("Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt."), - ) - parser.add_argument( - "--output_dir", - type=str, - default="text-inversion-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=512, - help=("The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), - ) - parser.add_argument( - "--placement", - type=str, - default="cpu", - help="Placement Policy for Gemini. Valid when using colossalai as dist plan.", - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=("Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. 
The images will be resized to the resolution first before cropping."), - ) - parser.add_argument("--train_batch_size", - type=int, - default=4, - help="Batch size (per device) for the training dataloader.") - parser.add_argument("--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images.") - parser.add_argument("--num_train_epochs", type=int, default=1) - parser.add_argument( - "--max_train_steps", - type=int, - default=None, - help="Total number of training steps to perform. If provided, overrides num_train_epochs.", - ) - parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-6, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--scale_lr", - action="store_true", - default=False, - help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=('The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), - ) - parser.add_argument("--lr_warmup_steps", - type=int, - default=500, - help="Number of steps for the warmup in the lr scheduler.") - parser.add_argument("--use_8bit_adam", - action="store_true", - help="Whether or not to use 8-bit Adam from bitsandbytes.") - - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") - parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument('-p', - '--plugin', - type=str, - default='torch_ddp', - choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero'], - help="plugin to use") - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=("[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."), - ) - parser.add_argument( - "--mixed_precision", - type=str, - default=None, - choices=["no", "fp16", "bf16"], - help=( - "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" - " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" - " flag passed with the `accelerate.launch` command. 
Use this argument to override the accelerate config."), - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - - if input_args is not None: - args = parser.parse_args(input_args) - else: - args = parser.parse_args() - - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) - if env_local_rank != -1 and env_local_rank != args.local_rank: - args.local_rank = env_local_rank - - if args.with_prior_preservation: - if args.class_data_dir is None: - raise ValueError("You must specify a data directory for class images.") - if args.class_prompt is None: - raise ValueError("You must specify prompt for class images.") - else: - if args.class_data_dir is not None: - logger.warning("You need not use --class_data_dir without --with_prior_preservation.") - if args.class_prompt is not None: - logger.warning("You need not use --class_prompt without --with_prior_preservation.") - - return args - - -class DreamBoothDataset(Dataset): - """ - A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images and the tokenizes prompts. - """ - - def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - size=512, - center_crop=False, - ): - self.size = size - self.center_crop = center_crop - self.tokenizer = tokenizer - - self.instance_data_root = Path(instance_data_root) - if not self.instance_data_root.exists(): - raise ValueError("Instance images root doesn't exists.") - - self.instance_images_path = list(Path(instance_data_root).iterdir()) - self.num_instance_images = len(self.instance_images_path) - self.instance_prompt = instance_prompt - self._length = self.num_instance_images - - if class_data_root is not None: - self.class_data_root = Path(class_data_root) - self.class_data_root.mkdir(parents=True, exist_ok=True) - self.class_images_path = list(self.class_data_root.iterdir()) - self.num_class_images = len(self.class_images_path) - self._length = max(self.num_class_images, self.num_instance_images) - self.class_prompt = class_prompt - else: - self.class_data_root = None - - self.image_transforms = transforms.Compose([ - transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) - - def __len__(self): - return self._length - - def __getitem__(self, index): - example = {} - instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) - if not instance_image.mode == "RGB": - instance_image = instance_image.convert("RGB") - example["instance_images"] = self.image_transforms(instance_image) - example["instance_prompt_ids"] = self.tokenizer( - self.instance_prompt, - padding="do_not_pad", - truncation=True, - max_length=self.tokenizer.model_max_length, - ).input_ids - - if self.class_data_root: - class_image = Image.open(self.class_images_path[index % self.num_class_images]) - if not class_image.mode == "RGB": - class_image = class_image.convert("RGB") - example["class_images"] = self.image_transforms(class_image) - example["class_prompt_ids"] = self.tokenizer( - self.class_prompt, - padding="do_not_pad", - truncation=True, - max_length=self.tokenizer.model_max_length, - ).input_ids - - return example - - -class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." 
- - def __init__(self, prompt, num_samples): - self.prompt = prompt - self.num_samples = num_samples - - def __len__(self): - return self.num_samples - - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example - - -def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): - if token is None: - token = HfFolder.get_token() - if organization is None: - username = whoami(token)["name"] - return f"{username}/{model_id}" - else: - return f"{organization}/{model_id}" - - -def main(args): - if args.seed is None: - colossalai.launch_from_torch(config={}) - else: - colossalai.launch_from_torch(config={}, seed=args.seed) - - local_rank = gpc.get_local_rank(ParallelMode.DATA) - world_size = gpc.get_world_size(ParallelMode.DATA) - - if args.with_prior_preservation: - class_images_dir = Path(args.class_data_dir) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True) - cur_class_images = len(list(class_images_dir.iterdir())) - - if cur_class_images < args.num_class_images: - torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32 - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - torch_dtype=torch_dtype, - safety_checker=None, - revision=args.revision, - ) - pipeline.set_progress_bar_config(disable=True) - - num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - - sample_dataset = PromptDataset(args.class_prompt, num_new_images) - sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) - - pipeline.to(get_current_device()) - - for example in tqdm( - sample_dataloader, - desc="Generating class images", - disable=not local_rank == 0, - ): - images = pipeline(example["prompt"]).images - - for i, image in enumerate(images): - hash_image = hashlib.sha256(image.tobytes()).hexdigest() - image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - image.save(image_filename) - - del pipeline - - # Handle the repository creation - if local_rank == 0: - if args.push_to_hub: - if args.hub_model_id is None: - repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) - else: - repo_name = args.hub_model_id - create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") - elif args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) - - # Load the tokenizer - if args.tokenizer_name: - logger.info(f"Loading tokenizer from {args.tokenizer_name}", ranks=[0]) - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name, - revision=args.revision, - use_fast=False, - ) - elif args.pretrained_model_name_or_path: - logger.info("Loading tokenizer from pretrained model", ranks=[0]) - tokenizer = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer", - revision=args.revision, - use_fast=False, - ) - # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) - - # Load models and create wrapper for stable diffusion - - logger.info(f"Loading 
text_encoder from {args.pretrained_model_name_or_path}", ranks=[0]) - - text_encoder = text_encoder_cls.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="text_encoder", - revision=args.revision, - ) - - logger.info(f"Loading AutoencoderKL from {args.pretrained_model_name_or_path}", ranks=[0]) - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="vae", - revision=args.revision, - ) - - - if args.externel_unet_path is None: - logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0]) - unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, - low_cpu_mem_usage=False) - else: - logger.info(f"Loading UNet2DConditionModel from {args.externel_unet_path}", ranks=[0]) - unet = UNet2DConditionModel.from_pretrained(args.externel_unet_path, - revision=args.revision, - low_cpu_mem_usage=False) - - vae.requires_grad_(False) - text_encoder.requires_grad_(False) - - if args.gradient_checkpointing: - unet.enable_gradient_checkpointing() - - if args.scale_lr: - args.learning_rate = args.learning_rate * args.train_batch_size * world_size - - # Use Booster API to use Gemini/Zero with ColossalAI - - booster_kwargs = {} - if args.plugin == 'torch_ddp_fp16': - booster_kwargs['mixed_precision'] = 'fp16' - if args.plugin.startswith('torch_ddp'): - plugin = TorchDDPPlugin() - elif args.plugin == 'gemini': - plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5) - elif args.plugin == 'low_level_zero': - plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) - - booster = Booster(plugin=plugin, **booster_kwargs) - - # config optimizer for colossalai zero - optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm) - - # load noise_scheduler - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - - # prepare dataset - logger.info(f"Prepare dataset from {args.instance_data_dir}", ranks=[0]) - train_dataset = DreamBoothDataset( - instance_data_root=args.instance_data_dir, - instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir if args.with_prior_preservation else None, - class_prompt=args.class_prompt, - tokenizer=tokenizer, - size=args.resolution, - center_crop=args.center_crop, - ) - - def collate_fn(examples): - input_ids = [example["instance_prompt_ids"] for example in examples] - pixel_values = [example["instance_images"] for example in examples] - - # Concat class and instance examples for prior preservation. - # We do this to avoid doing two forward passes. - if args.with_prior_preservation: - input_ids += [example["class_prompt_ids"] for example in examples] - pixel_values += [example["class_images"] for example in examples] - - pixel_values = torch.stack(pixel_values) - pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() - - input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, - padding="max_length", - max_length=tokenizer.model_max_length, - return_tensors="pt", - ).input_ids - - batch = { - "input_ids": input_ids, - "pixel_values": pixel_values, - } - return batch - - train_dataloader = torch.utils.data.DataLoader(train_dataset, - batch_size=args.train_batch_size, - shuffle=True, - collate_fn=collate_fn, - num_workers=1) - - # Scheduler and math around the number of training steps. 
- overrode_max_train_steps = False - num_update_steps_per_epoch = math.ceil(len(train_dataloader)) - if args.max_train_steps is None: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - overrode_max_train_steps = True - - lr_scheduler = get_scheduler( - args.lr_scheduler, - optimizer=optimizer, - num_warmup_steps=args.lr_warmup_steps, - num_training_steps=args.max_train_steps, - ) - weight_dtype = torch.float32 - if args.mixed_precision == "fp16": - weight_dtype = torch.float16 - elif args.mixed_precision == "bf16": - weight_dtype = torch.bfloat16 - - # Move text_encode and vae to gpu. - # For mixed precision training we cast the text_encoder and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. - vae.to(get_current_device(), dtype=weight_dtype) - text_encoder.to(get_current_device(), dtype=weight_dtype) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. - num_update_steps_per_epoch = math.ceil(len(train_dataloader)) - if overrode_max_train_steps: - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - - unet, optimizer, _, _, lr_scheduler = booster.boost(unet, optimizer, lr_scheduler=lr_scheduler) - - # Train! - total_batch_size = args.train_batch_size * world_size - - logger.info("***** Running training *****", ranks=[0]) - logger.info(f" Num examples = {len(train_dataset)}", ranks=[0]) - logger.info(f" Num batches each epoch = {len(train_dataloader)}", ranks=[0]) - logger.info(f" Num Epochs = {args.num_train_epochs}", ranks=[0]) - logger.info(f" Instantaneous batch size per device = {args.train_batch_size}", ranks=[0]) - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}", ranks=[0]) - logger.info(f" Total optimization steps = {args.max_train_steps}", ranks=[0]) - - # Only show the progress bar once on each machine. 
- progress_bar = tqdm(range(args.max_train_steps), disable=not local_rank == 0) - progress_bar.set_description("Steps") - global_step = 0 - - torch.cuda.synchronize() - for epoch in range(args.num_train_epochs): - unet.train() - for step, batch in enumerate(train_dataloader): - torch.cuda.reset_peak_memory_stats() - # Move batch to gpu - for key, value in batch.items(): - batch[key] = value.to(get_current_device(), non_blocking=True) - - # Convert images to latent space - optimizer.zero_grad() - - latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() - latents = latents * 0.18215 - - # Sample noise that we'll add to the latents - noise = torch.randn_like(latents) - bsz = latents.shape[0] - # Sample a random timestep for each image - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) - timesteps = timesteps.long() - - # Add noise to the latents according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - # Get the text embedding for conditioning - encoder_hidden_states = text_encoder(batch["input_ids"])[0] - - # Predict the noise residual - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # Get the target for loss depending on the prediction type - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - if args.with_prior_preservation: - # Chunk the noise and model_pred into two parts and compute the loss on each part separately. - model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) - target, target_prior = torch.chunk(target, 2, dim=0) - - # Compute instance loss - loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean() - - # Compute prior loss - prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") - - # Add the prior loss to the instance loss. 
- loss = loss + args.prior_loss_weight * prior_loss - else: - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - - optimizer.backward(loss) - - optimizer.step() - lr_scheduler.step() - logger.info(f"max GPU_mem cost is {torch.cuda.max_memory_allocated()/2**20} MB", ranks=[0]) - # Checks if the accelerator has performed an optimization step behind the scenes - progress_bar.update(1) - global_step += 1 - logs = { - "loss": loss.detach().item(), - "lr": optimizer.param_groups[0]["lr"], - } # lr_scheduler.get_last_lr()[0]} - progress_bar.set_postfix(**logs) - - if global_step % args.save_steps == 0: - torch.cuda.synchronize() - if local_rank == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) - if not os.path.exists(os.path.join(save_path, "config.json")): - shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) - logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) - if global_step >= args.max_train_steps: - break - torch.cuda.synchronize() - - booster.save_model(unet, os.path.join(args.output_dir, "diffusion_pytorch_model.bin")) - logger.info(f"Saving model checkpoint to {args.output_dir} on rank {local_rank}") - if local_rank == 0: - if not os.path.exists(os.path.join(args.output_dir, "config.json")): - shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), args.output_dir) - if args.push_to_hub: - repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) - -if __name__ == "__main__": - args = parse_args() - main(args) From d3379f0be7e30854ee2353924d735642f4909aab Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Tue, 6 Jun 2023 16:07:34 +0800 Subject: [PATCH 09/18] fixed model saving bugs --- examples/images/dreambooth/train_dreambooth_colossalai.py | 4 ++-- .../images/dreambooth/train_dreambooth_colossalai_lora.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py index 5436e7d6b..eae52b5ec 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -667,9 +667,9 @@ def main(args): if global_step % args.save_steps == 0: torch.cuda.synchronize() + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) if local_rank == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) if not os.path.exists(os.path.join(save_path, "config.json")): shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py index 64cdd2a31..dce65ff51 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py @@ -693,9 +693,9 @@ def main(args): if global_step % args.save_steps == 0: torch.cuda.synchronize() + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + booster.save_model(unet, os.path.join(save_path, 
"diffusion_pytorch_model.bin")) if local_rank == 0: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - booster.save_model(unet, os.path.join(save_path, "diffusion_pytorch_model.bin")) if not os.path.exists(os.path.join(save_path, "config.json")): shutil.copy(os.path.join(args.pretrained_model_name_or_path, "unet/config.json"), save_path) logger.info(f"Saving model checkpoint to {save_path}", ranks=[0]) From 79c9f776a9ea42991df54d11e2c4b3ac4a7eeea9 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Tue, 6 Jun 2023 16:20:45 +0800 Subject: [PATCH 10/18] fixed port --- examples/images/dreambooth/test_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index 68862c46c..0209c547a 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -7,7 +7,7 @@ TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do - torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \ + torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ --pretrained_model_name_or_path="Your Pretrained Model Path" \ --instance_data_dir="Your Input Pics Path" \ --output_dir="path-to-save-model" \ From b4437e88c319874269b022c68e177f95d45b607b Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Tue, 6 Jun 2023 16:21:38 +0800 Subject: [PATCH 11/18] fixed port --- examples/images/dreambooth/colossalai.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh index cfb00412a..54ebac39b 100755 --- a/examples/images/dreambooth/colossalai.sh +++ b/examples/images/dreambooth/colossalai.sh @@ -2,7 +2,7 @@ HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 -torchrun --nproc_per_node 4 --master_port=25641 train_dreambooth_colossalai.py \ +torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ --pretrained_model_name_or_path="Path_to_your_model" \ --instance_data_dir="Path_to_your_training_image" \ --output_dir="Path_to_your_save_dir" \ From 4fc8bc68ac707302ad7d47706778f42a4d5031bf Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Wed, 7 Jun 2023 11:02:19 +0800 Subject: [PATCH 12/18] modify file path --- examples/images/dreambooth/colossalai.sh | 8 ++++---- examples/images/dreambooth/dreambooth.sh | 6 +++--- examples/images/dreambooth/test_ci.sh | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh index 54ebac39b..3b15ad887 100755 --- a/examples/images/dreambooth/colossalai.sh +++ b/examples/images/dreambooth/colossalai.sh @@ -3,10 +3,10 @@ TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path="Path_to_your_model" \ - --instance_data_dir="Path_to_your_training_image" \ - --output_dir="Path_to_your_save_dir" \ - --instance_prompt="your prompt" \ + --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ + --instance_data_dir="/data/dreambooth/Teyvat/data" \ + --output_dir="./weight_output" \ + --instance_prompt="a picture of a dog" \ --resolution=512 \ --plugin="gemini" \ --train_batch_size=1 \ diff --git a/examples/images/dreambooth/dreambooth.sh b/examples/images/dreambooth/dreambooth.sh index e063bc827..f6b8f5e1b 100644 --- 
a/examples/images/dreambooth/dreambooth.sh +++ b/examples/images/dreambooth/dreambooth.sh @@ -1,7 +1,7 @@ python train_dreambooth.py \ - --pretrained_model_name_or_path= ## Your Model Path \ - --instance_data_dir= ## Your Training Input Pics Path \ - --output_dir="path-to-save-model" \ + --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ + --instance_data_dir="/data/dreambooth/Teyvat/data" \ + --output_dir="./weight_output" \ --instance_prompt="a photo of a dog" \ --resolution=512 \ --train_batch_size=1 \ diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index 0209c547a..c0b0c2b3d 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -8,10 +8,10 @@ DIFFUSERS_OFFLINE=1 for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path="Your Pretrained Model Path" \ - --instance_data_dir="Your Input Pics Path" \ - --output_dir="path-to-save-model" \ - --instance_prompt="your prompt" \ + --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ + --instance_data_dir="/data/dreambooth/Teyvat/data" \ + --output_dir="./weight_output" \ + --instance_prompt="a picture of a dog" \ --resolution=512 \ --plugin=$plugin \ --train_batch_size=1 \ From c94a33579b7c70d96905ea8b2c3a4baf28451cb0 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Wed, 7 Jun 2023 17:23:01 +0800 Subject: [PATCH 13/18] modify shell for check --- examples/images/dreambooth/test_ci.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index c0b0c2b3d..8ba413a14 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -6,8 +6,9 @@ HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 -for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do - torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ +# "torch_ddp" "torch_ddp_fp16" +for plugin in "low_level_zero" "gemini"; do + torchrun --nproc_per_node 8 --standalone train_dreambooth_colossalai.py \ --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ --instance_data_dir="/data/dreambooth/Teyvat/data" \ --output_dir="./weight_output" \ From cf4792c9757e071217f0b99f4e2bcc85f2d048b7 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Thu, 8 Jun 2023 11:15:10 +0800 Subject: [PATCH 14/18] modify shell for check --- examples/images/dreambooth/test_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index 8ba413a14..8d18e1d4a 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -8,7 +8,7 @@ DIFFUSERS_OFFLINE=1 # "torch_ddp" "torch_ddp_fp16" for plugin in "low_level_zero" "gemini"; do - torchrun --nproc_per_node 8 --standalone train_dreambooth_colossalai.py \ + torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ --instance_data_dir="/data/dreambooth/Teyvat/data" \ --output_dir="./weight_output" \ From 039854b39165ab7f2a4fa7ab3d67e47daa325d1c Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Thu, 8 Jun 2023 13:17:58 +0800 Subject: [PATCH 15/18] modify shell for check --- 
examples/images/dreambooth/test_ci.sh | 6 +++--- examples/images/dreambooth/train_dreambooth_colossalai.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index 8d18e1d4a..35c81b325 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -6,8 +6,8 @@ HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 -# "torch_ddp" "torch_ddp_fp16" -for plugin in "low_level_zero" "gemini"; do +# "torch_ddp" "torch_ddp_fp16" "low_level_zero" +for plugin in "gemini"; do torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ --instance_data_dir="/data/dreambooth/Teyvat/data" \ @@ -20,5 +20,5 @@ for plugin in "low_level_zero" "gemini"; do --lr_scheduler="constant" \ --lr_warmup_steps=0 \ --num_class_images=200 \ - --placement="cuda" + --placement="cpu" # "cuda" done diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py index eae52b5ec..44bde9226 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -487,7 +487,7 @@ def main(args): if args.plugin.startswith('torch_ddp'): plugin = TorchDDPPlugin() elif args.plugin == 'gemini': - plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5) + plugin = GeminiPlugin(placement_policy=args.placement, strict_ddp_mode=True, initial_scale=2 ** 5) elif args.plugin == 'low_level_zero': plugin = LowLevelZeroPlugin(initial_scale=2 ** 5) From 49567d56d161dba7889496abd4e74e19ed8d1195 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Thu, 8 Jun 2023 13:36:05 +0800 Subject: [PATCH 16/18] modify shell for check --- examples/images/dreambooth/test_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index 35c81b325..0e3f6efa4 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -20,5 +20,5 @@ for plugin in "gemini"; do --lr_scheduler="constant" \ --lr_warmup_steps=0 \ --num_class_images=200 \ - --placement="cpu" # "cuda" + --placement="auto" # "cuda" done From 730a092ba2dd98464bd18789b7f78d2ec2d3a165 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Thu, 8 Jun 2023 13:38:18 +0800 Subject: [PATCH 17/18] modify shell for check --- examples/images/dreambooth/colossalai.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/images/dreambooth/colossalai.sh b/examples/images/dreambooth/colossalai.sh index 3b15ad887..b2a544928 100755 --- a/examples/images/dreambooth/colossalai.sh +++ b/examples/images/dreambooth/colossalai.sh @@ -14,4 +14,4 @@ torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ --num_class_images=200 \ - --placement="cuda" \ + --placement="auto" \ From 9b5e7ce21feb51977d11da4e6a0ed35f502dbfb5 Mon Sep 17 00:00:00 2001 From: Maruyama_Aya Date: Thu, 8 Jun 2023 14:56:56 +0800 Subject: [PATCH 18/18] modify shell for check --- examples/images/dreambooth/colossalai.sh | 1 + examples/images/dreambooth/test_ci.sh | 1 + examples/images/dreambooth/train_dreambooth_colossalai.py | 5 +++++ 3 files changed, 7 insertions(+) diff --git a/examples/images/dreambooth/colossalai.sh 
b/examples/images/dreambooth/colossalai.sh
index b2a544928..db4562dbc 100755
--- a/examples/images/dreambooth/colossalai.sh
+++ b/examples/images/dreambooth/colossalai.sh
@@ -14,4 +14,5 @@ torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
   --num_class_images=200 \
+  --test_run \
   --placement="auto" \
diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh
index 0e3f6efa4..21f45adae 100644
--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -19,6 +19,7 @@ for plugin in "gemini"; do
     --learning_rate=5e-6 \
     --lr_scheduler="constant" \
     --lr_warmup_steps=0 \
+    --test_run \
     --num_class_images=200 \
     --placement="auto" # "cuda"
 done
diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py
index 44bde9226..888b28de8 100644
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -198,6 +198,7 @@ def parse_args(input_args=None):
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
     parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument("--test_run", action="store_true", help="Whether to use a smaller dataset for a test run.")
     parser.add_argument(
         "--hub_model_id",
         type=str,
         default=None,
         help="The name of the repository to keep in sync with the local `output_dir`.",
@@ -267,6 +268,7 @@ class DreamBoothDataset(Dataset):
         class_prompt=None,
         size=512,
         center_crop=False,
+        test=False,
     ):
         self.size = size
         self.center_crop = center_crop
@@ -277,6 +279,8 @@ class DreamBoothDataset(Dataset):
             raise ValueError("Instance images root doesn't exists.")
 
         self.instance_images_path = list(Path(instance_data_root).iterdir())
+        if test:
+            self.instance_images_path = self.instance_images_path[:10]
         self.num_instance_images = len(self.instance_images_path)
         self.instance_prompt = instance_prompt
         self._length = self.num_instance_images
@@ -509,6 +513,7 @@ def main(args):
         tokenizer=tokenizer,
         size=args.resolution,
         center_crop=args.center_crop,
+        test=args.test_run
     )
 
     def collate_fn(examples):
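
A quick standalone check of the boolean-flag handling used by `--test_run`; this snippet is illustrative and not part of the patch. Declaring the flag with `action="store_true"` makes it a bare switch and sidesteps a classic argparse pitfall: an untyped `default=False` argument given `--test_run=True` on the command line receives the *string* `"True"`, and even `"False"` would evaluate as truthy.

```python
import argparse

parser = argparse.ArgumentParser()
# A bare switch: False when the flag is absent, True when it is present.
parser.add_argument("--test_run", action="store_true",
                    help="Whether to use a smaller dataset for a test run.")

assert parser.parse_args([]).test_run is False
assert parser.parse_args(["--test_run"]).test_run is True
```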