Skip to content
This repository was archived by the owner on Dec 14, 2023. It is now read-only.

Commit e8149f5

Browse files
committed
convert the input temporal transformer
1 parent 5c428c6 commit e8149f5

File tree

1 file changed

+16
-9
lines changed

1 file changed

+16
-9
lines changed

utils/convert_diffusers_to_original_ms_text_to_video.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,19 @@
2424
("time_embed.2.weight", "time_embedding.linear_2.weight"),
2525
("time_embed.2.bias", "time_embedding.linear_2.bias"),
2626

27-
# from Modelscope only
28-
("label_emb.0.0.weight", "class_embedding.linear_1.weight"),
29-
("label_emb.0.0.bias", "class_embedding.linear_1.bias"),
30-
("label_emb.0.2.weight", "class_embedding.linear_2.weight"),
31-
("label_emb.0.2.bias", "class_embedding.linear_2.bias"),
27+
# # from Modelscope only
28+
# ("label_emb.0.0.weight", "class_embedding.linear_1.weight"),
29+
# ("label_emb.0.0.bias", "class_embedding.linear_1.bias"),
30+
# ("label_emb.0.2.weight", "class_embedding.linear_2.weight"),
31+
# ("label_emb.0.2.bias", "class_embedding.linear_2.bias"),
3232

3333
# from Vanilla ModelScope/StableDiffusion
3434
("input_blocks.0.0.weight", "conv_in.weight"),
3535
("input_blocks.0.0.bias", "conv_in.bias"),
3636

3737
# from Modelscope only
38-
("input_blocks.0.1.weight", "transformer_in.weight"),
39-
("input_blocks.0.1.bias", "transformer_in.bias"),
38+
#("input_blocks.0.1", "transformer_in.weight"),
39+
#("input_blocks.0.1.bias", "transformer_in.bias"),
4040

4141
# from Vanilla ModelScope/StableDiffusion
4242
("out.0.weight", "conv_norm_out.weight"),
@@ -62,6 +62,9 @@
6262

6363
unet_conversion_map_layer = []
6464

65+
# Convert input TemporalTransformer
66+
unet_conversion_map_layer.append(('input_blocks.0.1', 'transformer_in'))
67+
6568
# Reference for the default settings
6669

6770
# "model_cfg": {
@@ -85,10 +88,10 @@
8588

8689
# hardcoded number of downblocks and resnets/attentions...
8790
# would need smarter logic for other networks.
88-
for i in range(4):
91+
for i in range(4):# 4 UP/DOWN BLOCKS CONFIRMED --kabachuha
8992
# loop over downblocks/upblocks
9093

91-
for j in range(2):
94+
for j in range(2): # 2 RESNET BLOCKS CONFIRMED --kabachuha
9295
# loop over resnets/attentions for downblocks
9396

9497
# Spacial SD stuff
@@ -181,6 +184,8 @@ def convert_unet_state_dict(unet_state_dict):
181184
# the exact order in which I have arranged them.
182185
mapping = {k: k for k in unet_state_dict.keys()}
183186

187+
188+
184189
for sd_name, hf_name in unet_conversion_map:
185190
mapping[hf_name] = sd_name
186191
for k, v in mapping.items():
@@ -444,3 +449,5 @@ def convert_text_enc_state_dict(text_enc_dict):
444449
else:
445450
state_dict = {"state_dict": state_dict}
446451
torch.save(state_dict, args.clip_checkpoint_path)
452+
453+
print('Operation successful')

0 commit comments

Comments
 (0)