mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-18 16:00:49 +00:00
support stable diffusion v2
This commit is contained in:
@@ -0,0 +1,68 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-4
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
parameterization: "v"
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: "jpg"
|
||||
cond_stage_key: "txt"
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False # we set this to false because this is an inference only config
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
use_fp16: True
|
||||
image_size: 32 # unused
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
freeze: True
|
||||
layer: "penultimate"
|
@@ -0,0 +1,67 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-4
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: "jpg"
|
||||
cond_stage_key: "txt"
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False # we set this to false because this is an inference only config
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
use_fp16: True
|
||||
image_size: 32 # unused
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
freeze: True
|
||||
layer: "penultimate"
|
@@ -0,0 +1,158 @@
|
||||
model:
|
||||
base_learning_rate: 5.0e-05
|
||||
target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: "jpg"
|
||||
cond_stage_key: "txt"
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: hybrid
|
||||
scale_factor: 0.18215
|
||||
monitor: val/loss_simple_ema
|
||||
finetune_keys: null
|
||||
use_ema: False
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
image_size: 32 # unused
|
||||
in_channels: 9
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [ ]
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
freeze: True
|
||||
layer: "penultimate"
|
||||
|
||||
|
||||
data:
|
||||
target: ldm.data.laion.WebDataModuleFromConfig
|
||||
params:
|
||||
tar_base: null # for concat as in LAION-A
|
||||
p_unsafe_threshold: 0.1
|
||||
filter_word_list: "data/filters.yaml"
|
||||
max_pwatermark: 0.45
|
||||
batch_size: 8
|
||||
num_workers: 6
|
||||
multinode: True
|
||||
min_size: 512
|
||||
train:
|
||||
shards:
|
||||
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
|
||||
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
|
||||
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
|
||||
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
|
||||
- "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
|
||||
shuffle: 10000
|
||||
image_key: jpg
|
||||
image_transforms:
|
||||
- target: torchvision.transforms.Resize
|
||||
params:
|
||||
size: 512
|
||||
interpolation: 3
|
||||
- target: torchvision.transforms.RandomCrop
|
||||
params:
|
||||
size: 512
|
||||
postprocess:
|
||||
target: ldm.data.laion.AddMask
|
||||
params:
|
||||
mode: "512train-large"
|
||||
p_drop: 0.25
|
||||
# NOTE use enough shards to avoid empty validation loops in workers
|
||||
validation:
|
||||
shards:
|
||||
- "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
|
||||
shuffle: 0
|
||||
image_key: jpg
|
||||
image_transforms:
|
||||
- target: torchvision.transforms.Resize
|
||||
params:
|
||||
size: 512
|
||||
interpolation: 3
|
||||
- target: torchvision.transforms.CenterCrop
|
||||
params:
|
||||
size: 512
|
||||
postprocess:
|
||||
target: ldm.data.laion.AddMask
|
||||
params:
|
||||
mode: "512train-large"
|
||||
p_drop: 0.25
|
||||
|
||||
lightning:
|
||||
find_unused_parameters: True
|
||||
modelcheckpoint:
|
||||
params:
|
||||
every_n_train_steps: 5000
|
||||
|
||||
callbacks:
|
||||
metrics_over_trainsteps_checkpoint:
|
||||
params:
|
||||
every_n_train_steps: 10000
|
||||
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
enable_autocast: False
|
||||
disabled: False
|
||||
batch_frequency: 1000
|
||||
max_images: 4
|
||||
increase_log_steps: False
|
||||
log_first_step: False
|
||||
log_images_kwargs:
|
||||
use_ema_scope: False
|
||||
inpaint: False
|
||||
plot_progressive_rows: False
|
||||
plot_diffusion_rows: False
|
||||
N: 4
|
||||
unconditional_guidance_scale: 5.0
|
||||
unconditional_guidance_label: [""]
|
||||
ddim_steps: 50 # todo check these out for depth2img,
|
||||
ddim_eta: 0.0 # todo check these out for depth2img,
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
val_check_interval: 5000000
|
||||
num_sanity_val_steps: 0
|
||||
accumulate_grad_batches: 1
|
@@ -0,0 +1,72 @@
|
||||
model:
|
||||
base_learning_rate: 5.0e-07
|
||||
target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: "jpg"
|
||||
cond_stage_key: "txt"
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: hybrid
|
||||
scale_factor: 0.18215
|
||||
monitor: val/loss_simple_ema
|
||||
finetune_keys: null
|
||||
use_ema: False
|
||||
|
||||
depth_stage_config:
|
||||
target: ldm.modules.midas.api.MiDaSInference
|
||||
params:
|
||||
model_type: "dpt_hybrid"
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
image_size: 32 # unused
|
||||
in_channels: 5
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [ ]
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
freeze: True
|
||||
layer: "penultimate"
|
@@ -0,0 +1,75 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-04
|
||||
target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
|
||||
params:
|
||||
parameterization: "v"
|
||||
low_scale_key: "lr"
|
||||
linear_start: 0.0001
|
||||
linear_end: 0.02
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: "jpg"
|
||||
cond_stage_key: "txt"
|
||||
image_size: 128
|
||||
channels: 4
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: "hybrid-adm"
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.08333
|
||||
use_ema: False
|
||||
|
||||
low_scale_config:
|
||||
target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation
|
||||
params:
|
||||
noise_schedule_config: # image space
|
||||
linear_start: 0.0001
|
||||
linear_end: 0.02
|
||||
max_noise_level: 350
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
num_classes: 1000 # timesteps for noise conditioning (here constant, just need one)
|
||||
image_size: 128
|
||||
in_channels: 7
|
||||
out_channels: 4
|
||||
model_channels: 256
|
||||
attention_resolutions: [ 2,4,8]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 2, 4]
|
||||
disable_self_attentions: [True, True, True, False]
|
||||
disable_middle_self_attn: False
|
||||
num_heads: 8
|
||||
use_spatial_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
use_linear_in_transformer: True
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
ddconfig:
|
||||
# attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)
|
||||
double_z: True
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: [ ]
|
||||
dropout: 0.0
|
||||
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
freeze: True
|
||||
layer: "penultimate"
|
25
examples/images/diffusion/configs/Teyvat/README.md
Normal file
25
examples/images/diffusion/configs/Teyvat/README.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Dataset Card for Teyvat BLIP captions
|
||||
Dataset used to train [Teyvat characters text to image model](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion).
|
||||
|
||||
BLIP generated captions for characters images from [genshin-impact fandom wiki](https://genshin-impact.fandom.com/wiki/Character#Playable_Characters)and [biligame wiki for genshin impact](https://wiki.biligame.com/ys/%E8%A7%92%E8%89%B2).
|
||||
|
||||
For each row the dataset contains `image` and `text` keys. `image` is a varying size PIL png, and `text` is the accompanying text caption. Only a train split is provided.
|
||||
|
||||
The `text` include the tag `Teyvat`, `Name`,`Element`, `Weapon`, `Region`, `Model type`, and `Description`, the `Description` is captioned with the [pre-trained BLIP model](https://github.com/salesforce/BLIP).
|
||||
## Examples
|
||||
|
||||
<img src = "https://huggingface.co/datasets/Fazzie/Teyvat/resolve/main/data/Ganyu_001.png" title = "Ganyu_001.png" style="max-width: 20%;" >
|
||||
|
||||
> Teyvat, Name:Ganyu, Element:Cryo, Weapon:Bow, Region:Liyue, Model type:Medium Female, Description:an anime character with blue hair and blue eyes
|
||||
|
||||
<img src = "https://huggingface.co/datasets/Fazzie/Teyvat/resolve/main/data/Ganyu_002.png" title = "Ganyu_002.png" style="max-width: 20%;" >
|
||||
|
||||
> Teyvat, Name:Ganyu, Element:Cryo, Weapon:Bow, Region:Liyue, Model type:Medium Female, Description:an anime character with blue hair and blue eyes
|
||||
|
||||
<img src = "https://huggingface.co/datasets/Fazzie/Teyvat/resolve/main/data/Keqing_003.png" title = "Keqing_003.png" style="max-width: 20%;" >
|
||||
|
||||
> Teyvat, Name:Keqing, Element:Electro, Weapon:Sword, Region:Liyue, Model type:Medium Female, Description:a anime girl with long white hair and blue eyes
|
||||
|
||||
<img src = "https://huggingface.co/datasets/Fazzie/Teyvat/resolve/main/data/Keqing_004.png" title = "Keqing_004.png" style="max-width: 20%;" >
|
||||
|
||||
> Teyvat, Name:Keqing, Element:Electro, Weapon:Sword, Region:Liyue, Model type:Medium Female, Description:an anime character wearing a purple dress and cat ears
|
@@ -1,7 +1,8 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-04
|
||||
base_learning_rate: 1.0e-4
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
parameterization: "v"
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
@@ -11,11 +12,11 @@ model:
|
||||
cond_stage_key: txt
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false # Note: different from the one we trained before
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
use_ema: False # we set this to false because this is an inference only config
|
||||
|
||||
scheduler_config: # 10000 warmup steps
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
@@ -26,31 +27,33 @@ model:
|
||||
f_max: [ 1.e-4 ]
|
||||
f_min: [ 1.e-10 ]
|
||||
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
use_fp16: True
|
||||
image_size: 32 # unused
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: False
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
@@ -69,9 +72,10 @@ model:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
use_fp16: True
|
||||
freeze: True
|
||||
layer: "penultimate"
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
@@ -86,37 +90,37 @@ data:
|
||||
- target: torchvision.transforms.Resize
|
||||
params:
|
||||
size: 512
|
||||
# - target: torchvision.transforms.RandomCrop
|
||||
# params:
|
||||
# size: 256
|
||||
# - target: torchvision.transforms.RandomHorizontalFlip
|
||||
- target: torchvision.transforms.RandomCrop
|
||||
params:
|
||||
size: 512
|
||||
- target: torchvision.transforms.RandomHorizontalFlip
|
||||
|
||||
lightning:
|
||||
trainer:
|
||||
accelerator: 'gpu'
|
||||
accelerator: 'gpu'
|
||||
devices: 2
|
||||
log_gpu_memory: all
|
||||
max_epochs: 10
|
||||
max_epochs: 2
|
||||
precision: 16
|
||||
auto_select_gpus: False
|
||||
strategy:
|
||||
target: lightning.pytorch.strategies.ColossalAIStrategy
|
||||
target: strategies.ColossalAIStrategy
|
||||
params:
|
||||
use_chunk: False
|
||||
enable_distributed_storage: True,
|
||||
placement_policy: cuda
|
||||
force_outputs_fp32: False
|
||||
use_chunk: True
|
||||
enable_distributed_storage: True
|
||||
placement_policy: auto
|
||||
force_outputs_fp32: true
|
||||
|
||||
log_every_n_steps: 2
|
||||
logger: True
|
||||
default_root_dir: "/tmp/diff_log/"
|
||||
profiler: pytorch
|
||||
# profiler: pytorch
|
||||
|
||||
logger_config:
|
||||
wandb:
|
||||
target: lightning.pytorch.loggers.WandbLogger
|
||||
target: loggers.WandbLogger
|
||||
params:
|
||||
name: nowname
|
||||
save_dir: "/tmp/diff_log/"
|
||||
offline: opt.debug
|
||||
id: nowname
|
||||
id: nowname
|
@@ -1,21 +1,22 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-04
|
||||
base_learning_rate: 1.0e-4
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
parameterization: "v"
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
cond_stage_key: txt
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false # Note: different from the one we trained before
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
use_ema: False # we set this to false because this is an inference only config
|
||||
|
||||
scheduler_config: # 10000 warmup steps
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
@@ -26,31 +27,33 @@ model:
|
||||
f_max: [ 1.e-4 ]
|
||||
f_min: [ 1.e-10 ]
|
||||
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
use_fp16: True
|
||||
image_size: 32 # unused
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: False
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
@@ -69,9 +72,10 @@ model:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
use_fp16: True
|
||||
freeze: True
|
||||
layer: "penultimate"
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
@@ -87,30 +91,30 @@ data:
|
||||
|
||||
lightning:
|
||||
trainer:
|
||||
accelerator: 'gpu'
|
||||
devices: 4
|
||||
accelerator: 'gpu'
|
||||
devices: 1
|
||||
log_gpu_memory: all
|
||||
max_epochs: 2
|
||||
precision: 16
|
||||
auto_select_gpus: False
|
||||
strategy:
|
||||
target: lightning.pytorch.strategies.ColossalAIStrategy
|
||||
target: strategies.ColossalAIStrategy
|
||||
params:
|
||||
use_chunk: False
|
||||
enable_distributed_storage: True,
|
||||
placement_policy: cuda
|
||||
force_outputs_fp32: False
|
||||
use_chunk: True
|
||||
enable_distributed_storage: True
|
||||
placement_policy: auto
|
||||
force_outputs_fp32: true
|
||||
|
||||
log_every_n_steps: 2
|
||||
logger: True
|
||||
default_root_dir: "/tmp/diff_log/"
|
||||
profiler: pytorch
|
||||
# profiler: pytorch
|
||||
|
||||
logger_config:
|
||||
wandb:
|
||||
target: lightning.pytorch.loggers.WandbLogger
|
||||
target: loggers.WandbLogger
|
||||
params:
|
||||
name: nowname
|
||||
save_dir: "/tmp/diff_log/"
|
||||
offline: opt.debug
|
||||
id: nowname
|
||||
id: nowname
|
||||
|
@@ -1,7 +1,8 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-04
|
||||
base_learning_rate: 1.0e-4
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
parameterization: "v"
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
@@ -11,11 +12,11 @@ model:
|
||||
cond_stage_key: txt
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false # Note: different from the one we trained before
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
use_ema: False # we set this to false because this is an inference only config
|
||||
|
||||
scheduler_config: # 10000 warmup steps
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
@@ -26,31 +27,33 @@ model:
|
||||
f_max: [ 1.e-4 ]
|
||||
f_min: [ 1.e-10 ]
|
||||
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
use_fp16: True
|
||||
image_size: 32 # unused
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: False
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
@@ -69,9 +72,10 @@ model:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
use_fp16: True
|
||||
freeze: True
|
||||
layer: "penultimate"
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
@@ -94,30 +98,30 @@ data:
|
||||
|
||||
lightning:
|
||||
trainer:
|
||||
accelerator: 'gpu'
|
||||
devices: 2
|
||||
accelerator: 'gpu'
|
||||
devices: 1
|
||||
log_gpu_memory: all
|
||||
max_epochs: 2
|
||||
precision: 16
|
||||
auto_select_gpus: False
|
||||
strategy:
|
||||
target: lightning.pytorch.strategies.ColossalAIStrategy
|
||||
target: strategies.ColossalAIStrategy
|
||||
params:
|
||||
use_chunk: False
|
||||
enable_distributed_storage: True,
|
||||
placement_policy: cuda
|
||||
force_outputs_fp32: False
|
||||
use_chunk: True
|
||||
enable_distributed_storage: True
|
||||
placement_policy: auto
|
||||
force_outputs_fp32: true
|
||||
|
||||
log_every_n_steps: 2
|
||||
logger: True
|
||||
default_root_dir: "/tmp/diff_log/"
|
||||
profiler: pytorch
|
||||
# profiler: pytorch
|
||||
|
||||
logger_config:
|
||||
wandb:
|
||||
target: lightning.pytorch.loggers.WandbLogger
|
||||
target: loggers.WandbLogger
|
||||
params:
|
||||
name: nowname
|
||||
save_dir: "/tmp/diff_log/"
|
||||
offline: opt.debug
|
||||
id: nowname
|
||||
id: nowname
|
||||
|
@@ -1,56 +1,59 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-04
|
||||
base_learning_rate: 1.0e-4
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
parameterization: "v"
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
image_size: 32
|
||||
cond_stage_key: txt
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false # Note: different from the one we trained before
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
use_ema: False # we set this to false because this is an inference only config
|
||||
|
||||
scheduler_config: # 10000 warmup steps
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
params:
|
||||
warm_up_steps: [ 100 ]
|
||||
warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
|
||||
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
||||
f_start: [ 1.e-6 ]
|
||||
f_max: [ 1.e-4 ]
|
||||
f_min: [ 1.e-10 ]
|
||||
f_min: [ 1.e-10 ]
|
||||
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
use_fp16: True
|
||||
image_size: 32 # unused
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: False
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
@@ -69,32 +72,39 @@ model:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
use_fp16: True
|
||||
freeze: True
|
||||
layer: "penultimate"
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 64
|
||||
wrap: False
|
||||
batch_size: 16
|
||||
num_workers: 4
|
||||
train:
|
||||
target: ldm.data.base.Txt2ImgIterableBaseDataset
|
||||
target: ldm.data.teyvat.hf_dataset
|
||||
params:
|
||||
file_path: "/data/scratch/diffuser/laion_part0/"
|
||||
world_size: 1
|
||||
rank: 0
|
||||
path: Fazzie/Teyvat
|
||||
image_transforms:
|
||||
- target: torchvision.transforms.Resize
|
||||
params:
|
||||
size: 512
|
||||
- target: torchvision.transforms.RandomCrop
|
||||
params:
|
||||
size: 512
|
||||
- target: torchvision.transforms.RandomHorizontalFlip
|
||||
|
||||
lightning:
|
||||
trainer:
|
||||
accelerator: 'gpu'
|
||||
devices: 4
|
||||
devices: 2
|
||||
log_gpu_memory: all
|
||||
max_epochs: 2
|
||||
precision: 16
|
||||
auto_select_gpus: False
|
||||
strategy:
|
||||
target: lightning.pytorch.strategies.DDPStrategy
|
||||
target: strategies.DDPStrategy
|
||||
params:
|
||||
find_unused_parameters: False
|
||||
log_every_n_steps: 2
|
||||
@@ -105,9 +115,9 @@ lightning:
|
||||
|
||||
logger_config:
|
||||
wandb:
|
||||
target: lightning.pytorch.loggers.WandbLogger
|
||||
target: loggers.WandbLogger
|
||||
params:
|
||||
name: nowname
|
||||
save_dir: "/tmp/diff_log/"
|
||||
save_dir: "/data2/tmp/diff_log/"
|
||||
offline: opt.debug
|
||||
id: nowname
|
||||
id: nowname
|
||||
|
@@ -1,57 +1,59 @@
|
||||
model:
|
||||
base_learning_rate: 1.0e-04
|
||||
base_learning_rate: 1.0e-4
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
parameterization: "v"
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 200
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
image_size: 32
|
||||
cond_stage_key: txt
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: false # Note: different from the one we trained before
|
||||
cond_stage_trainable: false
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
check_nan_inf: False
|
||||
use_ema: False # we set this to false because this is an inference only config
|
||||
|
||||
scheduler_config: # 10000 warmup steps
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
params:
|
||||
warm_up_steps: [ 10000 ]
|
||||
warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch
|
||||
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
||||
f_start: [ 1.e-6 ]
|
||||
f_max: [ 1.e-4 ]
|
||||
f_min: [ 1.e-10 ]
|
||||
f_min: [ 1.e-10 ]
|
||||
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
use_checkpoint: True
|
||||
use_fp16: True
|
||||
image_size: 32 # unused
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/unet/diffusion_pytorch_model.bin'
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
num_head_channels: 64 # need to fix for flash-attn
|
||||
use_spatial_transformer: True
|
||||
use_linear_in_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: False
|
||||
context_dim: 1024
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
from_pretrained: '/data/scratch/diffuser/stable-diffusion-v1-4/vae/diffusion_pytorch_model.bin'
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
#attn_type: "vanilla-xformers"
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 256
|
||||
@@ -70,9 +72,10 @@ model:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||
params:
|
||||
use_fp16: True
|
||||
freeze: True
|
||||
layer: "penultimate"
|
||||
|
||||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
@@ -88,34 +91,30 @@ data:
|
||||
|
||||
lightning:
|
||||
trainer:
|
||||
accelerator: 'gpu'
|
||||
devices: 4
|
||||
accelerator: 'gpu'
|
||||
devices: 1
|
||||
log_gpu_memory: all
|
||||
max_epochs: 2
|
||||
precision: 16
|
||||
auto_select_gpus: False
|
||||
strategy:
|
||||
target: lightning.pytorch.strategies.ColossalAIStrategy
|
||||
target: strategies.ColossalAIStrategy
|
||||
params:
|
||||
use_chunk: False
|
||||
enable_distributed_storage: True,
|
||||
placement_policy: cuda
|
||||
force_outputs_fp32: False
|
||||
initial_scale: 65536
|
||||
min_scale: 1
|
||||
max_scale: 65536
|
||||
# max_scale: 4294967296
|
||||
use_chunk: True
|
||||
enable_distributed_storage: True
|
||||
placement_policy: auto
|
||||
force_outputs_fp32: true
|
||||
|
||||
log_every_n_steps: 2
|
||||
logger: True
|
||||
default_root_dir: "/tmp/diff_log/"
|
||||
profiler: pytorch
|
||||
# profiler: pytorch
|
||||
|
||||
logger_config:
|
||||
wandb:
|
||||
target: lightning.pytorch.loggers.WandbLogger
|
||||
target: loggers.WandbLogger
|
||||
params:
|
||||
name: nowname
|
||||
save_dir: "/tmp/diff_log/"
|
||||
offline: opt.debug
|
||||
id: nowname
|
||||
id: nowname
|
||||
|
Reference in New Issue
Block a user