diff --git a/examples/images/diffusion/configs/Inference/v2-inference-v.yaml b/examples/images/diffusion/configs/Inference/v2-inference-v.yaml index 8ec8dfbfe..b05955d3f 100644 --- a/examples/images/diffusion/configs/Inference/v2-inference-v.yaml +++ b/examples/images/diffusion/configs/Inference/v2-inference-v.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,50 +18,42 @@ model: use_ema: False # we set this to false because this is an inference only config unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Inference/v2-inference.yaml b/examples/images/diffusion/configs/Inference/v2-inference.yaml index 152c4f3c2..5d8d583d0 100644 --- a/examples/images/diffusion/configs/Inference/v2-inference.yaml +++ b/examples/images/diffusion/configs/Inference/v2-inference.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: linear_start: 0.00085 linear_end: 0.0120 @@ -18,50 +17,42 @@ model: use_ema: False # we set this to false because this is an inference only config unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for 
flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml b/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml index 32a9471d7..ffaa5e8da 100644 --- a/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml +++ b/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml @@ -19,106 +19,97 @@ model: use_ema: False unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - image_size: 32 # unused - in_channels: 9 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + image_size: 32 # unused + in_channels: 9 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: ldm.data.laion.WebDataModuleFromConfig - params: - tar_base: null # for concat as in LAION-A - p_unsafe_threshold: 0.1 - filter_word_list: "data/filters.yaml" - max_pwatermark: 0.45 - batch_size: 8 - num_workers: 6 - multinode: True - min_size: 512 - train: - shards: - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -" - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -" - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -" - - 
"pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -" - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar" - shuffle: 10000 - image_key: jpg - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - interpolation: 3 - - target: torchvision.transforms.RandomCrop - params: - size: 512 - postprocess: - target: ldm.data.laion.AddMask - params: - mode: "512train-large" - p_drop: 0.25 - # NOTE use enough shards to avoid empty validation loops in workers - validation: - shards: - - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - " - shuffle: 0 - image_key: jpg - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - interpolation: 3 - - target: torchvision.transforms.CenterCrop - params: - size: 512 - postprocess: - target: ldm.data.laion.AddMask - params: - mode: "512train-large" - p_drop: 0.25 + tar_base: null # for concat as in LAION-A + p_unsafe_threshold: 0.1 + filter_word_list: "data/filters.yaml" + max_pwatermark: 0.45 + batch_size: 8 + num_workers: 6 + multinode: True + min_size: 512 + train: + shards: + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar" + shuffle: 10000 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 + # NOTE use enough shards to avoid empty validation loops in workers + validation: + shards: + - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - " + shuffle: 0 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.CenterCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 lightning: find_unused_parameters: True @@ -132,8 +123,6 @@ lightning: every_n_train_steps: 10000 image_logger: - target: main.ImageLogger - params: enable_autocast: False disabled: False batch_frequency: 1000 diff --git a/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml b/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml index 531199de4..01d3729f1 100644 --- a/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml +++ b/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml @@ -19,54 +19,45 @@ model: use_ema: False depth_stage_config: - target: ldm.modules.midas.api.MiDaSInference - params: - model_type: "dpt_hybrid" + model_type: "dpt_hybrid" unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - image_size: 32 # unused - in_channels: 5 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - 
context_dim: 1024 - legacy: False + use_checkpoint: True + image_size: 32 # unused + in_channels: 5 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Inference/x4-upscaling.yaml b/examples/images/diffusion/configs/Inference/x4-upscaling.yaml index 45ecbf9ad..426d387ca 100644 --- a/examples/images/diffusion/configs/Inference/x4-upscaling.yaml +++ b/examples/images/diffusion/configs/Inference/x4-upscaling.yaml @@ -20,56 +20,47 @@ model: use_ema: False low_scale_config: - target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation - params: - noise_schedule_config: # image space - linear_start: 0.0001 - linear_end: 0.02 - max_noise_level: 350 + noise_schedule_config: # image space + linear_start: 0.0001 + linear_end: 0.02 + max_noise_level: 350 unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - num_classes: 1000 # timesteps for noise conditioning (here constant, just need one) - image_size: 128 - in_channels: 7 - out_channels: 4 - model_channels: 256 - attention_resolutions: [ 2,4,8] - num_res_blocks: 2 - channel_mult: [ 1, 2, 2, 4] - disable_self_attentions: [True, True, True, False] - disable_middle_self_attn: False - num_heads: 8 - use_spatial_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False - use_linear_in_transformer: True + use_checkpoint: True + num_classes: 1000 # timesteps for noise conditioning (here constant, just need one) + image_size: 128 + in_channels: 7 + out_channels: 4 + model_channels: 256 + attention_resolutions: [ 2,4,8] + num_res_blocks: 2 + channel_mult: [ 1, 2, 2, 4] + disable_self_attentions: [True, True, True, False] + disable_middle_self_attn: False + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + use_linear_in_transformer: True first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - ddconfig: - # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though) - double_z: True - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 + embed_dim: 4 + ddconfig: + # attn_type: "vanilla-xformers" this model needs efficient attention 
to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though) + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: - lossconfig: - target: torch.nn.Identity cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml index ff0f4c5a0..9e760124c 100644 --- a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml +++ b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -20,81 +19,70 @@ model: use_ema: False scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 16 - num_workers: 4 - train: - target: ldm.data.teyvat.hf_dataset - params: - path: Fazzie/Teyvat - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - - 
target: torchvision.transforms.RandomCrop - params: - size: 512 - - target: torchvision.transforms.RandomHorizontalFlip + batch_size: 16 + num_workers: 4 + train: + target: ldm.data.teyvat.hf_dataset + params: + path: Fazzie/Teyvat + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + - target: torchvision.transforms.RandomHorizontalFlip lightning: trainer: @@ -105,13 +93,11 @@ lightning: precision: 16 auto_select_gpus: False strategy: - target: strategies.ColossalAIStrategy - params: - use_chunk: True - enable_distributed_storage: True - placement_policy: cuda - force_outputs_fp32: true - min_chunk_size: 64 + use_chunk: True + enable_distributed_storage: True + placement_policy: cuda + force_outputs_fp32: true + min_chunk_size: 64 log_every_n_steps: 2 logger: True @@ -120,9 +106,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/configs/train_colossalai.yaml b/examples/images/diffusion/configs/train_colossalai.yaml index 88432e978..5f745286a 100644 --- a/examples/images/diffusion/configs/train_colossalai.yaml +++ b/examples/images/diffusion/configs/train_colossalai.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,95 +18,83 @@ model: use_ema: False # we set this to false because this is an inference only config scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. 
use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 128 - wrap: False - # num_workwers should be 2 * batch_size, and total num less than 1024 - # e.g. if use 8 devices, no more than 128 - num_workers: 128 - train: - target: ldm.data.base.Txt2ImgIterableBaseDataset - params: - file_path: # YOUR DATASET_PATH - world_size: 1 - rank: 0 + batch_size: 128 + wrap: False + # num_workwers should be 2 * batch_size, and total num less than 1024 + # e.g. 
if use 8 devices, no more than 128 + num_workers: 128 + train: + target: ldm.data.base.Txt2ImgIterableBaseDataset + params: + file_path: # YOUR DATASET_PATH + world_size: 1 + rank: 0 lightning: trainer: accelerator: 'gpu' - devices: 8 + devices: 2 log_gpu_memory: all max_epochs: 2 precision: 16 auto_select_gpus: False strategy: - target: strategies.ColossalAIStrategy - params: - use_chunk: True - enable_distributed_storage: True - placement_policy: cuda - force_outputs_fp32: true - min_chunk_size: 64 + use_chunk: True + enable_distributed_storage: True + placement_policy: cuda + force_outputs_fp32: true + min_chunk_size: 64 log_every_n_steps: 2 logger: True @@ -116,9 +103,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/configs/train_colossalai_cifar10.yaml b/examples/images/diffusion/configs/train_colossalai_cifar10.yaml index 0ba06f832..0d0f18542 100644 --- a/examples/images/diffusion/configs/train_colossalai_cifar10.yaml +++ b/examples/images/diffusion/configs/train_colossalai_cifar10.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,82 +18,71 @@ model: use_ema: False # we set this to false because this is an inference only config scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. 
use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 4 - num_workers: 4 - train: - target: ldm.data.cifar10.hf_dataset - params: - name: cifar10 - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - interpolation: 3 - - target: torchvision.transforms.RandomCrop - params: - size: 512 - - target: torchvision.transforms.RandomHorizontalFlip + batch_size: 4 + num_workers: 4 + train: + target: ldm.data.cifar10.hf_dataset + params: + name: cifar10 + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + - target: torchvision.transforms.RandomHorizontalFlip lightning: trainer: @@ -105,13 +93,11 @@ lightning: precision: 16 auto_select_gpus: False strategy: - target: strategies.ColossalAIStrategy - params: - use_chunk: True - enable_distributed_storage: True - placement_policy: cuda - force_outputs_fp32: true - min_chunk_size: 64 + use_chunk: True + enable_distributed_storage: True + placement_policy: cuda + force_outputs_fp32: true + min_chunk_size: 64 log_every_n_steps: 2 logger: True @@ -120,9 +106,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/configs/train_ddp.yaml b/examples/images/diffusion/configs/train_ddp.yaml index a63df887e..f3ae3ddb5 100644 --- a/examples/images/diffusion/configs/train_ddp.yaml +++ b/examples/images/diffusion/configs/train_ddp.yaml @@ -1,6 +1,5 
@@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,77 +18,65 @@ model: use_ema: False # we set this to false because this is an inference only config scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 128 - # num_workwers should be 2 * batch_size, and the total num less than 1024 - # e.g. if use 8 devices, no more than 128 - num_workers: 128 - train: - target: ldm.data.base.Txt2ImgIterableBaseDataset - params: - file_path: # YOUR DATAPATH - world_size: 1 - rank: 0 + batch_size: 128 + # num_workwers should be 2 * batch_size, and the total num less than 1024 + # e.g. 
if use 8 devices, no more than 128 + num_workers: 128 + train: + target: ldm.data.base.Txt2ImgIterableBaseDataset + params: + file_path: # YOUR DATAPATH + world_size: 1 + rank: 0 lightning: trainer: @@ -100,9 +87,7 @@ lightning: precision: 16 auto_select_gpus: False strategy: - target: strategies.DDPStrategy - params: - find_unused_parameters: False + find_unused_parameters: False log_every_n_steps: 2 # max_steps: 6o logger: True @@ -111,9 +96,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/data2/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/data2/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/ldm/models/autoencoder.py b/examples/images/diffusion/ldm/models/autoencoder.py index b1bd83778..f0a69fe63 100644 --- a/examples/images/diffusion/ldm/models/autoencoder.py +++ b/examples/images/diffusion/ldm/models/autoencoder.py @@ -1,16 +1,13 @@ import torch -try: - import lightning.pytorch as pl -except: - import pytorch_lightning as pl +import lightning.pytorch as pl -import torch.nn.functional as F +from torch import nn +from torch.nn import functional as F +from torch.nn import Identity from contextlib import contextmanager from ldm.modules.diffusionmodules.model import Encoder, Decoder from ldm.modules.distributions.distributions import DiagonalGaussianDistribution - -from ldm.util import instantiate_from_config from ldm.modules.ema import LitEma @@ -32,7 +29,7 @@ class AutoencoderKL(pl.LightningModule): self.image_key = image_key self.encoder = Encoder(**ddconfig) self.decoder = Decoder(**ddconfig) - self.loss = instantiate_from_config(lossconfig) + self.loss = Identity() assert ddconfig["double_z"] self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1) self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1) diff --git a/examples/images/diffusion/ldm/models/diffusion/classifier.py b/examples/images/diffusion/ldm/models/diffusion/classifier.py index 612a8371b..3cf12f093 100644 --- a/examples/images/diffusion/ldm/models/diffusion/classifier.py +++ b/examples/images/diffusion/ldm/models/diffusion/classifier.py @@ -9,9 +9,10 @@ from copy import deepcopy from einops import rearrange from glob import glob from natsort import natsorted - +from ldm.models.diffusion.ddpm import LatentDiffusion +from ldm.lr_scheduler import LambdaLinearScheduler from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel -from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config +from ldm.util import log_txt_as_img, default, ismap __models__ = { 'class_label': EncoderUNetModel, @@ -86,7 +87,7 @@ class NoisyLatentImageClassifier(pl.LightningModule): print(f"Unexpected Keys: {unexpected}") def load_diffusion(self): - model = instantiate_from_config(self.diffusion_config) + model = LatentDiffusion(**self.diffusion_config.get('params',dict())) self.diffusion_model = model.eval() self.diffusion_model.train = disabled_train for param in self.diffusion_model.parameters(): @@ -221,7 +222,7 @@ class NoisyLatentImageClassifier(pl.LightningModule): optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) if self.use_scheduler: - scheduler = instantiate_from_config(self.scheduler_config) + scheduler = LambdaLinearScheduler(**self.scheduler_config.get('params',dict())) print("Setting up LambdaLR scheduler...") scheduler = [ diff --git 
a/examples/images/diffusion/ldm/models/diffusion/ddpm.py b/examples/images/diffusion/ldm/models/diffusion/ddpm.py index b7315b048..842ec1371 100644 --- a/examples/images/diffusion/ldm/models/diffusion/ddpm.py +++ b/examples/images/diffusion/ldm/models/diffusion/ddpm.py @@ -22,19 +22,22 @@ from contextlib import contextmanager, nullcontext from functools import partial from einops import rearrange, repeat +from ldm.lr_scheduler import LambdaLinearScheduler from ldm.models.autoencoder import * from ldm.models.autoencoder import AutoencoderKL, IdentityFirstStage from ldm.models.diffusion.ddim import * from ldm.models.diffusion.ddim import DDIMSampler +from ldm.modules.midas.api import MiDaSInference from ldm.modules.diffusionmodules.model import * from ldm.modules.diffusionmodules.model import Decoder, Encoder, Model from ldm.modules.diffusionmodules.openaimodel import * -from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d +from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d, UNetModel from ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule, noise_like from ldm.modules.distributions.distributions import DiagonalGaussianDistribution, normal_kl +from ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation from ldm.modules.ema import LitEma from ldm.modules.encoders.modules import * -from ldm.util import count_params, default, exists, instantiate_from_config, isimage, ismap, log_txt_as_img, mean_flat +from ldm.util import count_params, default, exists, isimage, ismap, log_txt_as_img, mean_flat from omegaconf import ListConfig from torch.optim.lr_scheduler import LambdaLR from torchvision.utils import make_grid @@ -690,7 +693,7 @@ class LatentDiffusion(DDPM): self.make_cond_schedule() def instantiate_first_stage(self, config): - model = instantiate_from_config(config) + model = AutoencoderKL(**config) self.first_stage_model = model.eval() self.first_stage_model.train = disabled_train for param in self.first_stage_model.parameters(): @@ -706,15 +709,13 @@ class LatentDiffusion(DDPM): self.cond_stage_model = None # self.be_unconditional = True else: - model = instantiate_from_config(config) + model = FrozenOpenCLIPEmbedder(**config) self.cond_stage_model = model.eval() self.cond_stage_model.train = disabled_train for param in self.cond_stage_model.parameters(): param.requires_grad = False else: - assert config != '__is_first_stage__' - assert config != '__is_unconditional__' - model = instantiate_from_config(config) + model = FrozenOpenCLIPEmbedder(**config) self.cond_stage_model = model def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False): @@ -1479,8 +1480,7 @@ class LatentDiffusion(DDPM): # opt = torch.optim.AdamW(params, lr=lr) if self.use_scheduler: - assert 'target' in self.scheduler_config - scheduler = instantiate_from_config(self.scheduler_config) + scheduler = LambdaLinearScheduler(**self.scheduler_config) rank_zero_info("Setting up LambdaLR scheduler...") scheduler = [{'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule), 'interval': 'step', 'frequency': 1}] @@ -1502,7 +1502,7 @@ class DiffusionWrapper(pl.LightningModule): def __init__(self, diff_model_config, conditioning_key): super().__init__() self.sequential_cross_attn = diff_model_config.pop("sequential_crossattn", False) - self.diffusion_model = instantiate_from_config(diff_model_config) + self.diffusion_model = UNetModel(**diff_model_config) self.conditioning_key = conditioning_key assert 
self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm'] @@ -1551,7 +1551,7 @@ class LatentUpscaleDiffusion(LatentDiffusion): self.noise_level_key = noise_level_key def instantiate_low_stage(self, config): - model = instantiate_from_config(config) + model = ImageConcatWithNoiseAugmentation(**config) self.low_scale_model = model.eval() self.low_scale_model.train = disabled_train for param in self.low_scale_model.parameters(): @@ -1933,7 +1933,7 @@ class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion): def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwargs): super().__init__(concat_keys=concat_keys, *args, **kwargs) - self.depth_model = instantiate_from_config(depth_stage_config) + self.depth_model = MiDaSInference(**depth_stage_config) self.depth_stage_key = concat_keys[0] @torch.no_grad() @@ -2006,7 +2006,7 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion): self.low_scale_key = low_scale_key def instantiate_low_stage(self, config): - model = instantiate_from_config(config) + model = ImageConcatWithNoiseAugmentation(**config) self.low_scale_model = model.eval() self.low_scale_model.train = disabled_train for param in self.low_scale_model.parameters(): diff --git a/examples/images/diffusion/main.py b/examples/images/diffusion/main.py index 91b809d5a..e31d75e08 100644 --- a/examples/images/diffusion/main.py +++ b/examples/images/diffusion/main.py @@ -10,11 +10,8 @@ import time import numpy as np import torch import torchvision +import lightning.pytorch as pl -try: - import lightning.pytorch as pl -except: - import pytorch_lightning as pl from functools import partial @@ -23,19 +20,15 @@ from packaging import version from PIL import Image from prefetch_generator import BackgroundGenerator from torch.utils.data import DataLoader, Dataset, Subset, random_split +from ldm.models.diffusion.ddpm import LatentDiffusion -try: - from lightning.pytorch import seed_everything - from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint - from lightning.pytorch.trainer import Trainer - from lightning.pytorch.utilities import rank_zero_info, rank_zero_only - LIGHTNING_PACK_NAME = "lightning.pytorch." -except: - from pytorch_lightning import seed_everything - from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint - from pytorch_lightning.trainer import Trainer - from pytorch_lightning.utilities import rank_zero_info, rank_zero_only - LIGHTNING_PACK_NAME = "pytorch_lightning." +from lightning.pytorch import seed_everything +from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.utilities import rank_zero_info, rank_zero_only +from lightning.pytorch.loggers import WandbLogger, TensorBoardLogger +from lightning.pytorch.strategies import ColossalAIStrategy,DDPStrategy +LIGHTNING_PACK_NAME = "lightning.pytorch." from ldm.data.base import Txt2ImgIterableBaseDataset from ldm.util import instantiate_from_config @@ -687,153 +680,114 @@ if __name__ == "__main__": config.model["params"].update({"ckpt": ckpt}) rank_zero_info("Using ckpt_path = {}".format(config.model["params"]["ckpt"])) - model = instantiate_from_config(config.model) + model = LatentDiffusion(**config.model.get("params", dict())) # trainer and callbacks trainer_kwargs = dict() # config the logger # Default logger configs to log training metrics during the training process. 
- # These loggers are specified as targets in the dictionary, along with the configuration settings specific to each logger. default_logger_cfgs = { "wandb": { - "target": LIGHTNING_PACK_NAME + "loggers.WandbLogger", - "params": { "name": nowname, "save_dir": logdir, "offline": opt.debug, "id": nowname, } - }, + , "tensorboard": { - "target": LIGHTNING_PACK_NAME + "loggers.TensorBoardLogger", - "params": { "save_dir": logdir, "name": "diff_tb", "log_graph": True } - } } # Set up the logger for TensorBoard default_logger_cfg = default_logger_cfgs["tensorboard"] if "logger" in lightning_config: logger_cfg = lightning_config.logger + trainer_kwargs["logger"] = WandbLogger(**logger_cfg) else: logger_cfg = default_logger_cfg - logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg) - trainer_kwargs["logger"] = instantiate_from_config(logger_cfg) + trainer_kwargs["logger"] = TensorBoardLogger(**logger_cfg) # config the strategy, defualt is ddp if "strategy" in trainer_config: strategy_cfg = trainer_config["strategy"] - strategy_cfg["target"] = LIGHTNING_PACK_NAME + strategy_cfg["target"] + trainer_kwargs["strategy"] = ColossalAIStrategy(**strategy_cfg) else: - strategy_cfg = { - "target": LIGHTNING_PACK_NAME + "strategies.DDPStrategy", - "params": { - "find_unused_parameters": False - } - } - - trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg) + strategy_cfg = {"find_unused_parameters": False} + trainer_kwargs["strategy"] = DDPStrategy(**strategy_cfg) # Set up ModelCheckpoint callback to save best models # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to # specify which metric is used to determine best models default_modelckpt_cfg = { - "target": LIGHTNING_PACK_NAME + "callbacks.ModelCheckpoint", - "params": { "dirpath": ckptdir, "filename": "{epoch:06}", "verbose": True, "save_last": True, } - } if hasattr(model, "monitor"): - default_modelckpt_cfg["params"]["monitor"] = model.monitor - default_modelckpt_cfg["params"]["save_top_k"] = 3 + default_modelckpt_cfg["monitor"] = model.monitor + default_modelckpt_cfg["save_top_k"] = 3 if "modelcheckpoint" in lightning_config: - modelckpt_cfg = lightning_config.modelcheckpoint + modelckpt_cfg = lightning_config.modelcheckpoint["params"] else: modelckpt_cfg = OmegaConf.create() modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg) if version.parse(pl.__version__) < version.parse('1.4.0'): - trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg) + trainer_kwargs["checkpoint_callback"] = ModelCheckpoint(**modelckpt_cfg) - # Set up various callbacks, including logging, learning rate monitoring, and CUDA management - # add callback which sets up log directory - default_callbacks_cfg = { - "setup_callback": { # callback to set up the training - "target": "main.SetupCallback", - "params": { - "resume": opt.resume, # resume training if applicable - "now": now, - "logdir": logdir, # directory to save the log file - "ckptdir": ckptdir, # directory to save the checkpoint file - "cfgdir": cfgdir, # directory to save the configuration file - "config": config, # configuration dictionary - "lightning_config": lightning_config, # LightningModule configuration - } - }, - "image_logger": { # callback to log image data - "target": "main.ImageLogger", - "params": { - "batch_frequency": 750, # how frequently to log images - "max_images": 4, # maximum number of images to log - "clamp": True # whether to clamp pixel values to [0,1] - } - }, - "learning_rate_logger": { # callback to log learning 
rate - "target": "main.LearningRateMonitor", - "params": { - "logging_interval": "step", # logging frequency (either 'step' or 'epoch') - # "log_momentum": True # whether to log momentum (currently commented out) - } - }, - "cuda_callback": { # callback to handle CUDA-related operations - "target": "main.CUDACallback" - }, - } + #Create an empty OmegaConf configuration object - # If the LightningModule configuration has specified callbacks, use those - # Otherwise, create an empty OmegaConf configuration object - if "callbacks" in lightning_config: - callbacks_cfg = lightning_config.callbacks - else: - callbacks_cfg = OmegaConf.create() - - # If the 'metrics_over_trainsteps_checkpoint' callback is specified in the - # LightningModule configuration, update the default callbacks configuration - if 'metrics_over_trainsteps_checkpoint' in callbacks_cfg: - print( - 'Caution: Saving checkpoints every n train steps without deleting. This might require some free space.') - default_metrics_over_trainsteps_ckpt_dict = { - 'metrics_over_trainsteps_checkpoint': { - "target": LIGHTNING_PACK_NAME + 'callbacks.ModelCheckpoint', - 'params': { - "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'), - "filename": "{epoch:06}-{step:09}", - "verbose": True, - 'save_top_k': -1, - 'every_n_train_steps': 10000, - 'save_weights_only': True - } - } + callbacks_cfg = OmegaConf.create() + + #Instantiate items according to the configs + trainer_kwargs.setdefault("callbacks", []) + setup_callback_config = { + "resume": opt.resume, # resume training if applicable + "now": now, + "logdir": logdir, # directory to save the log file + "ckptdir": ckptdir, # directory to save the checkpoint file + "cfgdir": cfgdir, # directory to save the configuration file + "config": config, # configuration dictionary + "lightning_config": lightning_config, # LightningModule configuration } - default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict) + trainer_kwargs["callbacks"].append(SetupCallback(**setup_callback_config)) - # Merge the default callbacks configuration with the specified callbacks configuration, and instantiate the callbacks - callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg) - - trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg] + image_logger_config = { + + "batch_frequency": 750, # how frequently to log images + "max_images": 4, # maximum number of images to log + "clamp": True # whether to clamp pixel values to [0,1] + } + trainer_kwargs["callbacks"].append(ImageLogger(**image_logger_config)) + + learning_rate_logger_config = { + "logging_interval": "step", # logging frequency (either 'step' or 'epoch') + # "log_momentum": True # whether to log momentum (currently commented out) + } + trainer_kwargs["callbacks"].append(LearningRateMonitor(**learning_rate_logger_config)) + + metrics_over_trainsteps_checkpoint_config= { + "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'), + "filename": "{epoch:06}-{step:09}", + "verbose": True, + 'save_top_k': -1, + 'every_n_train_steps': 10000, + 'save_weights_only': True + } + trainer_kwargs["callbacks"].append(ModelCheckpoint(**metrics_over_trainsteps_checkpoint_config)) + trainer_kwargs["callbacks"].append(CUDACallback()) # Create a Trainer object with the specified command-line arguments and keyword arguments, and set the log directory trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs) trainer.logdir = logdir # Create a data module based on the configuration file - data = 
instantiate_from_config(config.data) + data = DataModuleFromConfig(**config.data) + # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html # calling these ourselves should not be necessary but it is. # lightning still takes care of proper multiprocessing though @@ -846,7 +800,7 @@ if __name__ == "__main__": # Configure learning rate based on the batch size, base learning rate and number of GPUs # If scale_lr is true, calculate the learning rate based on additional factors - bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate + bs, base_lr = config.data.batch_size, config.model.base_learning_rate if not cpu: ngpu = trainer_config["devices"] else: diff --git a/examples/images/diffusion/scripts/tests/test_checkpoint.py b/examples/images/diffusion/scripts/tests/test_checkpoint.py index a32e66d44..13622c498 100644 --- a/examples/images/diffusion/scripts/tests/test_checkpoint.py +++ b/examples/images/diffusion/scripts/tests/test_checkpoint.py @@ -7,8 +7,9 @@ from datetime import datetime from diffusers import StableDiffusionPipeline import torch -from ldm.util import instantiate_from_config + from main import get_parser +from ldm.modules.diffusionmodules.openaimodel import UNetModel if __name__ == "__main__": with torch.no_grad(): @@ -17,7 +18,7 @@ if __name__ == "__main__": config = f.read() base_config = yaml.load(config, Loader=yaml.FullLoader) unet_config = base_config['model']['params']['unet_config'] - diffusion_model = instantiate_from_config(unet_config).to("cuda:0") + diffusion_model = UNetModel(**unet_config).to("cuda:0") pipe = StableDiffusionPipeline.from_pretrained( "/data/scratch/diffuser/stable-diffusion-v1-4" diff --git a/examples/images/diffusion/train_colossalai.sh b/examples/images/diffusion/train_colossalai.sh index c56ed7876..7f1a1bd14 100755 --- a/examples/images/diffusion/train_colossalai.sh +++ b/examples/images/diffusion/train_colossalai.sh @@ -3,3 +3,4 @@ TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 python main.py --logdir /tmp --train --base configs/Teyvat/train_colossalai_teyvat.yaml --ckpt diffuser_root_dir/512-base-ema.ckpt +
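
The recurring change in this diff is the removal of the `target:` / `params:` indirection: the YAML no longer names the class to construct, and the Python side calls the concrete constructor (e.g. `UNetModel(**diff_model_config)`, `AutoencoderKL(**config)`, `LatentDiffusion(**config.model.get("params", dict()))`) instead of `ldm.util.instantiate_from_config`. The sketch below contrasts the two mechanisms. The reflection helper shown is only an approximation of the removed `ldm.util` function, and `OrderedDict` is a stand-in class chosen to keep the example self-contained.

```python
import importlib
from collections import OrderedDict

def instantiate_from_config(config: dict):
    """Approximation of the removed ldm.util helper: resolve the dotted
    'target' string to a class and construct it with the 'params' mapping."""
    module_name, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**config.get("params", {}))

# Old-style sub-config: the class to build is named inside the config itself.
# OrderedDict stands in for a heavyweight class such as the UNet; illustrative only.
old_cfg = {
    "target": "collections.OrderedDict",
    "params": {"num_res_blocks": 2, "context_dim": 1024},
}
via_reflection = instantiate_from_config(old_cfg)

# New-style sub-config (this diff): only the keyword arguments remain and the
# caller names the class explicitly.
new_cfg = {"num_res_blocks": 2, "context_dim": 1024}
direct = OrderedDict(**new_cfg)

assert via_reflection == direct
```

Hard-coding the classes trades the ability to swap components purely through YAML for simpler, more traceable construction code.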
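On the YAML side, the trimmed sub-sections now hold constructor keyword arguments directly. A minimal sketch of how such a config is consumed, assuming OmegaConf (already used by `main.py`) and a hypothetical `TinyUNet` stand-in for the real `UNetModel`:

```python
from omegaconf import OmegaConf

class TinyUNet:
    """Hypothetical stand-in for ldm's UNetModel; only the kwargs shown here."""
    def __init__(self, in_channels, out_channels, model_channels, use_fp16=False):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.model_channels = model_channels
        self.use_fp16 = use_fp16

# A trimmed config in the new shape: no 'target:'/'params:' wrapper around
# unet_config, just the constructor keywords.
cfg = OmegaConf.create(
"""
model:
  params:
    unet_config:
      in_channels: 4
      out_channels: 4
      model_channels: 320
      use_fp16: true
"""
)

# Mirrors DiffusionWrapper in ddpm.py, where the model is now built directly
# from the mapping instead of going through instantiate_from_config.
unet = TinyUNet(**cfg.model.params.unet_config)
print(unet.model_channels)  # 320
```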
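`main.py` now wires the Lightning logger, strategy, and callbacks as concrete `lightning.pytorch` objects rather than instantiating them from `target`/`params` dicts. A rough sketch of the resulting wiring, assuming a recent `lightning.pytorch` release (exact Trainer and strategy arguments vary by version) and illustrative paths:

```python
import os
import lightning.pytorch as pl
from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.strategies import DDPStrategy

logdir = "/tmp/diff_log"                       # illustrative path
ckptdir = os.path.join(logdir, "checkpoints")  # illustrative path

trainer_kwargs = {
    # Concrete objects replace the old {"target": ..., "params": ...} dicts.
    "logger": TensorBoardLogger(save_dir=logdir, name="diff_tb", log_graph=True),
    "strategy": DDPStrategy(find_unused_parameters=False),
    "callbacks": [
        LearningRateMonitor(logging_interval="step"),
        ModelCheckpoint(dirpath=ckptdir, filename="{epoch:06}",
                        verbose=True, save_last=True),
    ],
}

trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=2, **trainer_kwargs)
```

The `WandbLogger` and `ColossalAIStrategy` branches in the diff follow the same pattern, passing the remaining config keys straight to the corresponding constructor.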