diff --git a/examples/images/diffusion/configs/Inference/v2-inference-v.yaml b/examples/images/diffusion/configs/Inference/v2-inference-v.yaml index 8ec8dfbfe..b05955d3f 100644 --- a/examples/images/diffusion/configs/Inference/v2-inference-v.yaml +++ b/examples/images/diffusion/configs/Inference/v2-inference-v.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,50 +18,42 @@ model: use_ema: False # we set this to false because this is an inference only config unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Inference/v2-inference.yaml b/examples/images/diffusion/configs/Inference/v2-inference.yaml index 152c4f3c2..5d8d583d0 100644 --- a/examples/images/diffusion/configs/Inference/v2-inference.yaml +++ b/examples/images/diffusion/configs/Inference/v2-inference.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: linear_start: 0.00085 linear_end: 0.0120 @@ -18,50 +17,42 @@ model: use_ema: False # we set this to false because this is an inference only config unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for 
flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml b/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml index 32a9471d7..ffaa5e8da 100644 --- a/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml +++ b/examples/images/diffusion/configs/Inference/v2-inpainting-inference.yaml @@ -19,106 +19,97 @@ model: use_ema: False unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - image_size: 32 # unused - in_channels: 9 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + image_size: 32 # unused + in_channels: 9 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: ldm.data.laion.WebDataModuleFromConfig - params: - tar_base: null # for concat as in LAION-A - p_unsafe_threshold: 0.1 - filter_word_list: "data/filters.yaml" - max_pwatermark: 0.45 - batch_size: 8 - num_workers: 6 - multinode: True - min_size: 512 - train: - shards: - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -" - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -" - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -" - - 
"pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -" - - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar" - shuffle: 10000 - image_key: jpg - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - interpolation: 3 - - target: torchvision.transforms.RandomCrop - params: - size: 512 - postprocess: - target: ldm.data.laion.AddMask - params: - mode: "512train-large" - p_drop: 0.25 - # NOTE use enough shards to avoid empty validation loops in workers - validation: - shards: - - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - " - shuffle: 0 - image_key: jpg - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - interpolation: 3 - - target: torchvision.transforms.CenterCrop - params: - size: 512 - postprocess: - target: ldm.data.laion.AddMask - params: - mode: "512train-large" - p_drop: 0.25 + tar_base: null # for concat as in LAION-A + p_unsafe_threshold: 0.1 + filter_word_list: "data/filters.yaml" + max_pwatermark: 0.45 + batch_size: 8 + num_workers: 6 + multinode: True + min_size: 512 + train: + shards: + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -" + - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar" + shuffle: 10000 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 + # NOTE use enough shards to avoid empty validation loops in workers + validation: + shards: + - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - " + shuffle: 0 + image_key: jpg + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.CenterCrop + params: + size: 512 + postprocess: + target: ldm.data.laion.AddMask + params: + mode: "512train-large" + p_drop: 0.25 lightning: find_unused_parameters: True @@ -132,8 +123,6 @@ lightning: every_n_train_steps: 10000 image_logger: - target: main.ImageLogger - params: enable_autocast: False disabled: False batch_frequency: 1000 diff --git a/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml b/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml index 531199de4..01d3729f1 100644 --- a/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml +++ b/examples/images/diffusion/configs/Inference/v2-midas-inference.yaml @@ -19,54 +19,45 @@ model: use_ema: False depth_stage_config: - target: ldm.modules.midas.api.MiDaSInference - params: - model_type: "dpt_hybrid" + model_type: "dpt_hybrid" unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - image_size: 32 # unused - in_channels: 5 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - 
context_dim: 1024 - legacy: False + use_checkpoint: True + image_size: 32 # unused + in_channels: 5 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Inference/x4-upscaling.yaml b/examples/images/diffusion/configs/Inference/x4-upscaling.yaml index 45ecbf9ad..426d387ca 100644 --- a/examples/images/diffusion/configs/Inference/x4-upscaling.yaml +++ b/examples/images/diffusion/configs/Inference/x4-upscaling.yaml @@ -20,56 +20,47 @@ model: use_ema: False low_scale_config: - target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation - params: - noise_schedule_config: # image space - linear_start: 0.0001 - linear_end: 0.02 - max_noise_level: 350 + noise_schedule_config: # image space + linear_start: 0.0001 + linear_end: 0.02 + max_noise_level: 350 unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - num_classes: 1000 # timesteps for noise conditioning (here constant, just need one) - image_size: 128 - in_channels: 7 - out_channels: 4 - model_channels: 256 - attention_resolutions: [ 2,4,8] - num_res_blocks: 2 - channel_mult: [ 1, 2, 2, 4] - disable_self_attentions: [True, True, True, False] - disable_middle_self_attn: False - num_heads: 8 - use_spatial_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False - use_linear_in_transformer: True + use_checkpoint: True + num_classes: 1000 # timesteps for noise conditioning (here constant, just need one) + image_size: 128 + in_channels: 7 + out_channels: 4 + model_channels: 256 + attention_resolutions: [ 2,4,8] + num_res_blocks: 2 + channel_mult: [ 1, 2, 2, 4] + disable_self_attentions: [True, True, True, False] + disable_middle_self_attn: False + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + use_linear_in_transformer: True first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - ddconfig: - # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though) - double_z: True - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 - num_res_blocks: 2 - attn_resolutions: [ ] - dropout: 0.0 + embed_dim: 4 + ddconfig: + # attn_type: "vanilla-xformers" this model needs efficient attention 
to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though) + double_z: True + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1 + num_res_blocks: 2 + attn_resolutions: [ ] + dropout: 0.0 + lossconfig: - lossconfig: - target: torch.nn.Identity cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" diff --git a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml index ff0f4c5a0..9e760124c 100644 --- a/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml +++ b/examples/images/diffusion/configs/Teyvat/train_colossalai_teyvat.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -20,81 +19,70 @@ model: use_ema: False scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 16 - num_workers: 4 - train: - target: ldm.data.teyvat.hf_dataset - params: - path: Fazzie/Teyvat - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - - 
target: torchvision.transforms.RandomCrop - params: - size: 512 - - target: torchvision.transforms.RandomHorizontalFlip + batch_size: 16 + num_workers: 4 + train: + target: ldm.data.teyvat.hf_dataset + params: + path: Fazzie/Teyvat + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + - target: torchvision.transforms.RandomHorizontalFlip lightning: trainer: @@ -105,13 +93,11 @@ lightning: precision: 16 auto_select_gpus: False strategy: - target: strategies.ColossalAIStrategy - params: - use_chunk: True - enable_distributed_storage: True - placement_policy: cuda - force_outputs_fp32: true - min_chunk_size: 64 + use_chunk: True + enable_distributed_storage: True + placement_policy: cuda + force_outputs_fp32: true + min_chunk_size: 64 log_every_n_steps: 2 logger: True @@ -120,9 +106,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/configs/train_colossalai.yaml b/examples/images/diffusion/configs/train_colossalai.yaml index 88432e978..5f745286a 100644 --- a/examples/images/diffusion/configs/train_colossalai.yaml +++ b/examples/images/diffusion/configs/train_colossalai.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,95 +18,83 @@ model: use_ema: False # we set this to false because this is an inference only config scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. 
use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 128 - wrap: False - # num_workwers should be 2 * batch_size, and total num less than 1024 - # e.g. if use 8 devices, no more than 128 - num_workers: 128 - train: - target: ldm.data.base.Txt2ImgIterableBaseDataset - params: - file_path: # YOUR DATASET_PATH - world_size: 1 - rank: 0 + batch_size: 128 + wrap: False + # num_workwers should be 2 * batch_size, and total num less than 1024 + # e.g. 
if use 8 devices, no more than 128 + num_workers: 128 + train: + target: ldm.data.base.Txt2ImgIterableBaseDataset + params: + file_path: # YOUR DATASET_PATH + world_size: 1 + rank: 0 lightning: trainer: accelerator: 'gpu' - devices: 8 + devices: 2 log_gpu_memory: all max_epochs: 2 precision: 16 auto_select_gpus: False strategy: - target: strategies.ColossalAIStrategy - params: - use_chunk: True - enable_distributed_storage: True - placement_policy: cuda - force_outputs_fp32: true - min_chunk_size: 64 + use_chunk: True + enable_distributed_storage: True + placement_policy: cuda + force_outputs_fp32: true + min_chunk_size: 64 log_every_n_steps: 2 logger: True @@ -116,9 +103,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/configs/train_colossalai_cifar10.yaml b/examples/images/diffusion/configs/train_colossalai_cifar10.yaml index 0ba06f832..0d0f18542 100644 --- a/examples/images/diffusion/configs/train_colossalai_cifar10.yaml +++ b/examples/images/diffusion/configs/train_colossalai_cifar10.yaml @@ -1,6 +1,5 @@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,82 +18,71 @@ model: use_ema: False # we set this to false because this is an inference only config scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. 
use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 4 - num_workers: 4 - train: - target: ldm.data.cifar10.hf_dataset - params: - name: cifar10 - image_transforms: - - target: torchvision.transforms.Resize - params: - size: 512 - interpolation: 3 - - target: torchvision.transforms.RandomCrop - params: - size: 512 - - target: torchvision.transforms.RandomHorizontalFlip + batch_size: 4 + num_workers: 4 + train: + target: ldm.data.cifar10.hf_dataset + params: + name: cifar10 + image_transforms: + - target: torchvision.transforms.Resize + params: + size: 512 + interpolation: 3 + - target: torchvision.transforms.RandomCrop + params: + size: 512 + - target: torchvision.transforms.RandomHorizontalFlip lightning: trainer: @@ -105,13 +93,11 @@ lightning: precision: 16 auto_select_gpus: False strategy: - target: strategies.ColossalAIStrategy - params: - use_chunk: True - enable_distributed_storage: True - placement_policy: cuda - force_outputs_fp32: true - min_chunk_size: 64 + use_chunk: True + enable_distributed_storage: True + placement_policy: cuda + force_outputs_fp32: true + min_chunk_size: 64 log_every_n_steps: 2 logger: True @@ -120,9 +106,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/configs/train_ddp.yaml b/examples/images/diffusion/configs/train_ddp.yaml index a63df887e..f3ae3ddb5 100644 --- a/examples/images/diffusion/configs/train_ddp.yaml +++ b/examples/images/diffusion/configs/train_ddp.yaml @@ -1,6 +1,5 
@@ model: base_learning_rate: 1.0e-4 - target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -19,77 +18,65 @@ model: use_ema: False # we set this to false because this is an inference only config scheduler_config: # 10000 warmup steps - target: ldm.lr_scheduler.LambdaLinearScheduler - params: - warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch - cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases - f_start: [ 1.e-6 ] - f_max: [ 1.e-4 ] - f_min: [ 1.e-10 ] + warm_up_steps: [ 1 ] # NOTE for resuming. use 10000 if starting from scratch + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1.e-4 ] + f_min: [ 1.e-10 ] unet_config: - target: ldm.modules.diffusionmodules.openaimodel.UNetModel - params: - use_checkpoint: True - use_fp16: True - image_size: 32 # unused - in_channels: 4 - out_channels: 4 - model_channels: 320 - attention_resolutions: [ 4, 2, 1 ] - num_res_blocks: 2 - channel_mult: [ 1, 2, 4, 4 ] - num_head_channels: 64 # need to fix for flash-attn - use_spatial_transformer: True - use_linear_in_transformer: True - transformer_depth: 1 - context_dim: 1024 - legacy: False + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False first_stage_config: - target: ldm.models.autoencoder.AutoencoderKL - params: - embed_dim: 4 - monitor: val/rec_loss - ddconfig: - #attn_type: "vanilla-xformers" - double_z: true - z_channels: 4 - resolution: 256 - in_channels: 3 - out_ch: 3 - ch: 128 - ch_mult: - - 1 - - 2 - - 4 - - 4 - num_res_blocks: 2 - attn_resolutions: [] - dropout: 0.0 - lossconfig: - target: torch.nn.Identity + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 cond_stage_config: - target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder - params: - freeze: True - layer: "penultimate" + freeze: True + layer: "penultimate" data: - target: main.DataModuleFromConfig - params: - batch_size: 128 - # num_workwers should be 2 * batch_size, and the total num less than 1024 - # e.g. if use 8 devices, no more than 128 - num_workers: 128 - train: - target: ldm.data.base.Txt2ImgIterableBaseDataset - params: - file_path: # YOUR DATAPATH - world_size: 1 - rank: 0 + batch_size: 128 + # num_workwers should be 2 * batch_size, and the total num less than 1024 + # e.g. 
if use 8 devices, no more than 128 + num_workers: 128 + train: + target: ldm.data.base.Txt2ImgIterableBaseDataset + params: + file_path: # YOUR DATAPATH + world_size: 1 + rank: 0 lightning: trainer: @@ -100,9 +87,7 @@ lightning: precision: 16 auto_select_gpus: False strategy: - target: strategies.DDPStrategy - params: - find_unused_parameters: False + find_unused_parameters: False log_every_n_steps: 2 # max_steps: 6o logger: True @@ -111,9 +96,7 @@ lightning: logger_config: wandb: - target: loggers.WandbLogger - params: - name: nowname - save_dir: "/data2/tmp/diff_log/" - offline: opt.debug - id: nowname + name: nowname + save_dir: "/data2/tmp/diff_log/" + offline: opt.debug + id: nowname diff --git a/examples/images/diffusion/ldm/models/autoencoder.py b/examples/images/diffusion/ldm/models/autoencoder.py index b1bd83778..f0a69fe63 100644 --- a/examples/images/diffusion/ldm/models/autoencoder.py +++ b/examples/images/diffusion/ldm/models/autoencoder.py @@ -1,16 +1,13 @@ import torch -try: - import lightning.pytorch as pl -except: - import pytorch_lightning as pl +import lightning.pytorch as pl -import torch.nn.functional as F +from torch import nn +from torch.nn import functional as F +from torch.nn import Identity from contextlib import contextmanager from ldm.modules.diffusionmodules.model import Encoder, Decoder from ldm.modules.distributions.distributions import DiagonalGaussianDistribution - -from ldm.util import instantiate_from_config from ldm.modules.ema import LitEma @@ -32,7 +29,7 @@ class AutoencoderKL(pl.LightningModule): self.image_key = image_key self.encoder = Encoder(**ddconfig) self.decoder = Decoder(**ddconfig) - self.loss = instantiate_from_config(lossconfig) + self.loss = Identity() assert ddconfig["double_z"] self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1) self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1) diff --git a/examples/images/diffusion/ldm/models/diffusion/classifier.py b/examples/images/diffusion/ldm/models/diffusion/classifier.py index 612a8371b..3cf12f093 100644 --- a/examples/images/diffusion/ldm/models/diffusion/classifier.py +++ b/examples/images/diffusion/ldm/models/diffusion/classifier.py @@ -9,9 +9,10 @@ from copy import deepcopy from einops import rearrange from glob import glob from natsort import natsorted - +from ldm.models.diffusion.ddpm import LatentDiffusion +from ldm.lr_scheduler import LambdaLinearScheduler from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel -from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config +from ldm.util import log_txt_as_img, default, ismap __models__ = { 'class_label': EncoderUNetModel, @@ -86,7 +87,7 @@ class NoisyLatentImageClassifier(pl.LightningModule): print(f"Unexpected Keys: {unexpected}") def load_diffusion(self): - model = instantiate_from_config(self.diffusion_config) + model = LatentDiffusion(**self.diffusion_config.get('params',dict())) self.diffusion_model = model.eval() self.diffusion_model.train = disabled_train for param in self.diffusion_model.parameters(): @@ -221,7 +222,7 @@ class NoisyLatentImageClassifier(pl.LightningModule): optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay) if self.use_scheduler: - scheduler = instantiate_from_config(self.scheduler_config) + scheduler = LambdaLinearScheduler(**self.scheduler_config.get('params',dict())) print("Setting up LambdaLR scheduler...") scheduler = [ diff --git 
a/examples/images/diffusion/ldm/models/diffusion/ddpm.py b/examples/images/diffusion/ldm/models/diffusion/ddpm.py index b7315b048..842ec1371 100644 --- a/examples/images/diffusion/ldm/models/diffusion/ddpm.py +++ b/examples/images/diffusion/ldm/models/diffusion/ddpm.py @@ -22,19 +22,22 @@ from contextlib import contextmanager, nullcontext from functools import partial from einops import rearrange, repeat +from ldm.lr_scheduler import LambdaLinearScheduler from ldm.models.autoencoder import * from ldm.models.autoencoder import AutoencoderKL, IdentityFirstStage from ldm.models.diffusion.ddim import * from ldm.models.diffusion.ddim import DDIMSampler +from ldm.modules.midas.api import MiDaSInference from ldm.modules.diffusionmodules.model import * from ldm.modules.diffusionmodules.model import Decoder, Encoder, Model from ldm.modules.diffusionmodules.openaimodel import * -from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d +from ldm.modules.diffusionmodules.openaimodel import AttentionPool2d, UNetModel from ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule, noise_like from ldm.modules.distributions.distributions import DiagonalGaussianDistribution, normal_kl +from ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation from ldm.modules.ema import LitEma from ldm.modules.encoders.modules import * -from ldm.util import count_params, default, exists, instantiate_from_config, isimage, ismap, log_txt_as_img, mean_flat +from ldm.util import count_params, default, exists, isimage, ismap, log_txt_as_img, mean_flat from omegaconf import ListConfig from torch.optim.lr_scheduler import LambdaLR from torchvision.utils import make_grid @@ -690,7 +693,7 @@ class LatentDiffusion(DDPM): self.make_cond_schedule() def instantiate_first_stage(self, config): - model = instantiate_from_config(config) + model = AutoencoderKL(**config) self.first_stage_model = model.eval() self.first_stage_model.train = disabled_train for param in self.first_stage_model.parameters(): @@ -706,15 +709,13 @@ class LatentDiffusion(DDPM): self.cond_stage_model = None # self.be_unconditional = True else: - model = instantiate_from_config(config) + model = FrozenOpenCLIPEmbedder(**config) self.cond_stage_model = model.eval() self.cond_stage_model.train = disabled_train for param in self.cond_stage_model.parameters(): param.requires_grad = False else: - assert config != '__is_first_stage__' - assert config != '__is_unconditional__' - model = instantiate_from_config(config) + model = FrozenOpenCLIPEmbedder(**config) self.cond_stage_model = model def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False): @@ -1479,8 +1480,7 @@ class LatentDiffusion(DDPM): # opt = torch.optim.AdamW(params, lr=lr) if self.use_scheduler: - assert 'target' in self.scheduler_config - scheduler = instantiate_from_config(self.scheduler_config) + scheduler = LambdaLinearScheduler(**self.scheduler_config) rank_zero_info("Setting up LambdaLR scheduler...") scheduler = [{'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule), 'interval': 'step', 'frequency': 1}] @@ -1502,7 +1502,7 @@ class DiffusionWrapper(pl.LightningModule): def __init__(self, diff_model_config, conditioning_key): super().__init__() self.sequential_cross_attn = diff_model_config.pop("sequential_crossattn", False) - self.diffusion_model = instantiate_from_config(diff_model_config) + self.diffusion_model = UNetModel(**diff_model_config) self.conditioning_key = conditioning_key assert 
self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm'] @@ -1551,7 +1551,7 @@ class LatentUpscaleDiffusion(LatentDiffusion): self.noise_level_key = noise_level_key def instantiate_low_stage(self, config): - model = instantiate_from_config(config) + model = ImageConcatWithNoiseAugmentation(**config) self.low_scale_model = model.eval() self.low_scale_model.train = disabled_train for param in self.low_scale_model.parameters(): @@ -1933,7 +1933,7 @@ class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion): def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwargs): super().__init__(concat_keys=concat_keys, *args, **kwargs) - self.depth_model = instantiate_from_config(depth_stage_config) + self.depth_model = MiDaSInference(**depth_stage_config) self.depth_stage_key = concat_keys[0] @torch.no_grad() @@ -2006,7 +2006,7 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion): self.low_scale_key = low_scale_key def instantiate_low_stage(self, config): - model = instantiate_from_config(config) + model = ImageConcatWithNoiseAugmentation(**config) self.low_scale_model = model.eval() self.low_scale_model.train = disabled_train for param in self.low_scale_model.parameters(): diff --git a/examples/images/diffusion/main.py b/examples/images/diffusion/main.py index 91b809d5a..e31d75e08 100644 --- a/examples/images/diffusion/main.py +++ b/examples/images/diffusion/main.py @@ -10,11 +10,8 @@ import time import numpy as np import torch import torchvision +import lightning.pytorch as pl -try: - import lightning.pytorch as pl -except: - import pytorch_lightning as pl from functools import partial @@ -23,19 +20,15 @@ from packaging import version from PIL import Image from prefetch_generator import BackgroundGenerator from torch.utils.data import DataLoader, Dataset, Subset, random_split +from ldm.models.diffusion.ddpm import LatentDiffusion -try: - from lightning.pytorch import seed_everything - from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint - from lightning.pytorch.trainer import Trainer - from lightning.pytorch.utilities import rank_zero_info, rank_zero_only - LIGHTNING_PACK_NAME = "lightning.pytorch." -except: - from pytorch_lightning import seed_everything - from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint - from pytorch_lightning.trainer import Trainer - from pytorch_lightning.utilities import rank_zero_info, rank_zero_only - LIGHTNING_PACK_NAME = "pytorch_lightning." +from lightning.pytorch import seed_everything +from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint +from lightning.pytorch.trainer import Trainer +from lightning.pytorch.utilities import rank_zero_info, rank_zero_only +from lightning.pytorch.loggers import WandbLogger, TensorBoardLogger +from lightning.pytorch.strategies import ColossalAIStrategy,DDPStrategy +LIGHTNING_PACK_NAME = "lightning.pytorch." from ldm.data.base import Txt2ImgIterableBaseDataset from ldm.util import instantiate_from_config @@ -687,153 +680,114 @@ if __name__ == "__main__": config.model["params"].update({"ckpt": ckpt}) rank_zero_info("Using ckpt_path = {}".format(config.model["params"]["ckpt"])) - model = instantiate_from_config(config.model) + model = LatentDiffusion(**config.model.get("params", dict())) # trainer and callbacks trainer_kwargs = dict() # config the logger # Default logger configs to log training metrics during the training process. 
- # These loggers are specified as targets in the dictionary, along with the configuration settings specific to each logger. default_logger_cfgs = { "wandb": { - "target": LIGHTNING_PACK_NAME + "loggers.WandbLogger", - "params": { "name": nowname, "save_dir": logdir, "offline": opt.debug, "id": nowname, } - }, + , "tensorboard": { - "target": LIGHTNING_PACK_NAME + "loggers.TensorBoardLogger", - "params": { "save_dir": logdir, "name": "diff_tb", "log_graph": True } - } } # Set up the logger for TensorBoard default_logger_cfg = default_logger_cfgs["tensorboard"] if "logger" in lightning_config: logger_cfg = lightning_config.logger + trainer_kwargs["logger"] = WandbLogger(**logger_cfg) else: logger_cfg = default_logger_cfg - logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg) - trainer_kwargs["logger"] = instantiate_from_config(logger_cfg) + trainer_kwargs["logger"] = TensorBoardLogger(**logger_cfg) # config the strategy, defualt is ddp if "strategy" in trainer_config: strategy_cfg = trainer_config["strategy"] - strategy_cfg["target"] = LIGHTNING_PACK_NAME + strategy_cfg["target"] + trainer_kwargs["strategy"] = ColossalAIStrategy(**strategy_cfg) else: - strategy_cfg = { - "target": LIGHTNING_PACK_NAME + "strategies.DDPStrategy", - "params": { - "find_unused_parameters": False - } - } - - trainer_kwargs["strategy"] = instantiate_from_config(strategy_cfg) + strategy_cfg = {"find_unused_parameters": False} + trainer_kwargs["strategy"] = DDPStrategy(**strategy_cfg) # Set up ModelCheckpoint callback to save best models # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to # specify which metric is used to determine best models default_modelckpt_cfg = { - "target": LIGHTNING_PACK_NAME + "callbacks.ModelCheckpoint", - "params": { "dirpath": ckptdir, "filename": "{epoch:06}", "verbose": True, "save_last": True, } - } if hasattr(model, "monitor"): - default_modelckpt_cfg["params"]["monitor"] = model.monitor - default_modelckpt_cfg["params"]["save_top_k"] = 3 + default_modelckpt_cfg["monitor"] = model.monitor + default_modelckpt_cfg["save_top_k"] = 3 if "modelcheckpoint" in lightning_config: - modelckpt_cfg = lightning_config.modelcheckpoint + modelckpt_cfg = lightning_config.modelcheckpoint["params"] else: modelckpt_cfg = OmegaConf.create() modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg) if version.parse(pl.__version__) < version.parse('1.4.0'): - trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg) + trainer_kwargs["checkpoint_callback"] = ModelCheckpoint(**modelckpt_cfg) - # Set up various callbacks, including logging, learning rate monitoring, and CUDA management - # add callback which sets up log directory - default_callbacks_cfg = { - "setup_callback": { # callback to set up the training - "target": "main.SetupCallback", - "params": { - "resume": opt.resume, # resume training if applicable - "now": now, - "logdir": logdir, # directory to save the log file - "ckptdir": ckptdir, # directory to save the checkpoint file - "cfgdir": cfgdir, # directory to save the configuration file - "config": config, # configuration dictionary - "lightning_config": lightning_config, # LightningModule configuration - } - }, - "image_logger": { # callback to log image data - "target": "main.ImageLogger", - "params": { - "batch_frequency": 750, # how frequently to log images - "max_images": 4, # maximum number of images to log - "clamp": True # whether to clamp pixel values to [0,1] - } - }, - "learning_rate_logger": { # callback to log learning 
rate - "target": "main.LearningRateMonitor", - "params": { - "logging_interval": "step", # logging frequency (either 'step' or 'epoch') - # "log_momentum": True # whether to log momentum (currently commented out) - } - }, - "cuda_callback": { # callback to handle CUDA-related operations - "target": "main.CUDACallback" - }, - } + #Create an empty OmegaConf configuration object - # If the LightningModule configuration has specified callbacks, use those - # Otherwise, create an empty OmegaConf configuration object - if "callbacks" in lightning_config: - callbacks_cfg = lightning_config.callbacks - else: - callbacks_cfg = OmegaConf.create() - - # If the 'metrics_over_trainsteps_checkpoint' callback is specified in the - # LightningModule configuration, update the default callbacks configuration - if 'metrics_over_trainsteps_checkpoint' in callbacks_cfg: - print( - 'Caution: Saving checkpoints every n train steps without deleting. This might require some free space.') - default_metrics_over_trainsteps_ckpt_dict = { - 'metrics_over_trainsteps_checkpoint': { - "target": LIGHTNING_PACK_NAME + 'callbacks.ModelCheckpoint', - 'params': { - "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'), - "filename": "{epoch:06}-{step:09}", - "verbose": True, - 'save_top_k': -1, - 'every_n_train_steps': 10000, - 'save_weights_only': True - } - } + callbacks_cfg = OmegaConf.create() + + #Instantiate items according to the configs + trainer_kwargs.setdefault("callbacks", []) + setup_callback_config = { + "resume": opt.resume, # resume training if applicable + "now": now, + "logdir": logdir, # directory to save the log file + "ckptdir": ckptdir, # directory to save the checkpoint file + "cfgdir": cfgdir, # directory to save the configuration file + "config": config, # configuration dictionary + "lightning_config": lightning_config, # LightningModule configuration } - default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict) + trainer_kwargs["callbacks"].append(SetupCallback(**setup_callback_config)) - # Merge the default callbacks configuration with the specified callbacks configuration, and instantiate the callbacks - callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg) - - trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg] + image_logger_config = { + + "batch_frequency": 750, # how frequently to log images + "max_images": 4, # maximum number of images to log + "clamp": True # whether to clamp pixel values to [0,1] + } + trainer_kwargs["callbacks"].append(ImageLogger(**image_logger_config)) + + learning_rate_logger_config = { + "logging_interval": "step", # logging frequency (either 'step' or 'epoch') + # "log_momentum": True # whether to log momentum (currently commented out) + } + trainer_kwargs["callbacks"].append(LearningRateMonitor(**learning_rate_logger_config)) + + metrics_over_trainsteps_checkpoint_config= { + "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'), + "filename": "{epoch:06}-{step:09}", + "verbose": True, + 'save_top_k': -1, + 'every_n_train_steps': 10000, + 'save_weights_only': True + } + trainer_kwargs["callbacks"].append(ModelCheckpoint(**metrics_over_trainsteps_checkpoint_config)) + trainer_kwargs["callbacks"].append(CUDACallback()) # Create a Trainer object with the specified command-line arguments and keyword arguments, and set the log directory trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs) trainer.logdir = logdir # Create a data module based on the configuration file - data = 
instantiate_from_config(config.data) + data = DataModuleFromConfig(**config.data) + # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html # calling these ourselves should not be necessary but it is. # lightning still takes care of proper multiprocessing though @@ -846,7 +800,7 @@ if __name__ == "__main__": # Configure learning rate based on the batch size, base learning rate and number of GPUs # If scale_lr is true, calculate the learning rate based on additional factors - bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate + bs, base_lr = config.data.batch_size, config.model.base_learning_rate if not cpu: ngpu = trainer_config["devices"] else: diff --git a/examples/images/diffusion/scripts/tests/test_checkpoint.py b/examples/images/diffusion/scripts/tests/test_checkpoint.py index a32e66d44..13622c498 100644 --- a/examples/images/diffusion/scripts/tests/test_checkpoint.py +++ b/examples/images/diffusion/scripts/tests/test_checkpoint.py @@ -7,8 +7,9 @@ from datetime import datetime from diffusers import StableDiffusionPipeline import torch -from ldm.util import instantiate_from_config + from main import get_parser +from ldm.modules.diffusionmodules.openaimodel import UNetModel if __name__ == "__main__": with torch.no_grad(): @@ -17,7 +18,7 @@ if __name__ == "__main__": config = f.read() base_config = yaml.load(config, Loader=yaml.FullLoader) unet_config = base_config['model']['params']['unet_config'] - diffusion_model = instantiate_from_config(unet_config).to("cuda:0") + diffusion_model = UNetModel(**unet_config).to("cuda:0") pipe = StableDiffusionPipeline.from_pretrained( "/data/scratch/diffuser/stable-diffusion-v1-4" diff --git a/examples/images/diffusion/train_colossalai.sh b/examples/images/diffusion/train_colossalai.sh index c56ed7876..7f1a1bd14 100755 --- a/examples/images/diffusion/train_colossalai.sh +++ b/examples/images/diffusion/train_colossalai.sh @@ -3,3 +3,4 @@ TRANSFORMERS_OFFLINE=1 DIFFUSERS_OFFLINE=1 python main.py --logdir /tmp --train --base configs/Teyvat/train_colossalai_teyvat.yaml --ckpt diffuser_root_dir/512-base-ema.ckpt +
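
The recurring change in this diff is the removal of the `target:` / `params:` indirection: the YAML no longer names the class to construct, and the Python side calls the concrete constructor (e.g. `UNetModel(**diff_model_config)`, `AutoencoderKL(**config)`, `LatentDiffusion(**config.model.get("params", dict()))`) instead of `ldm.util.instantiate_from_config`. The sketch below contrasts the two mechanisms. The reflection helper shown is only an approximation of the removed `ldm.util` function, and `OrderedDict` is a stand-in class chosen to keep the example self-contained.

```python
import importlib
from collections import OrderedDict

def instantiate_from_config(config: dict):
    """Approximation of the removed ldm.util helper: resolve the dotted
    'target' string to a class and construct it with the 'params' mapping."""
    module_name, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**config.get("params", {}))

# Old-style sub-config: the class to build is named inside the config itself.
# OrderedDict stands in for a heavyweight class such as the UNet; illustrative only.
old_cfg = {
    "target": "collections.OrderedDict",
    "params": {"num_res_blocks": 2, "context_dim": 1024},
}
via_reflection = instantiate_from_config(old_cfg)

# New-style sub-config (this diff): only the keyword arguments remain and the
# caller names the class explicitly.
new_cfg = {"num_res_blocks": 2, "context_dim": 1024}
direct = OrderedDict(**new_cfg)

assert via_reflection == direct
```

Hard-coding the classes trades the ability to swap components purely through YAML for simpler, more traceable construction code.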
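On the YAML side, the trimmed sub-sections now hold constructor keyword arguments directly. A minimal sketch of how such a config is consumed, assuming OmegaConf (already used by `main.py`) and a hypothetical `TinyUNet` stand-in for the real `UNetModel`:

```python
from omegaconf import OmegaConf

class TinyUNet:
    """Hypothetical stand-in for ldm's UNetModel; only the kwargs shown here."""
    def __init__(self, in_channels, out_channels, model_channels, use_fp16=False):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.model_channels = model_channels
        self.use_fp16 = use_fp16

# A trimmed config in the new shape: no 'target:'/'params:' wrapper around
# unet_config, just the constructor keywords.
cfg = OmegaConf.create(
"""
model:
  params:
    unet_config:
      in_channels: 4
      out_channels: 4
      model_channels: 320
      use_fp16: true
"""
)

# Mirrors DiffusionWrapper in ddpm.py, where the model is now built directly
# from the mapping instead of going through instantiate_from_config.
unet = TinyUNet(**cfg.model.params.unet_config)
print(unet.model_channels)  # 320
```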
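`main.py` now wires the Lightning logger, strategy, and callbacks as concrete `lightning.pytorch` objects rather than instantiating them from `target`/`params` dicts. A rough sketch of the resulting wiring, assuming a recent `lightning.pytorch` release (exact Trainer and strategy arguments vary by version) and illustrative paths:

```python
import os
import lightning.pytorch as pl
from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.strategies import DDPStrategy

logdir = "/tmp/diff_log"                       # illustrative path
ckptdir = os.path.join(logdir, "checkpoints")  # illustrative path

trainer_kwargs = {
    # Concrete objects replace the old {"target": ..., "params": ...} dicts.
    "logger": TensorBoardLogger(save_dir=logdir, name="diff_tb", log_graph=True),
    "strategy": DDPStrategy(find_unused_parameters=False),
    "callbacks": [
        LearningRateMonitor(logging_interval="step"),
        ModelCheckpoint(dirpath=ckptdir, filename="{epoch:06}",
                        verbose=True, save_last=True),
    ],
}

trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=2, **trainer_kwargs)
```

The `WandbLogger` and `ColossalAIStrategy` branches in the diff follow the same pattern, passing the remaining config keys straight to the corresponding constructor.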