This repository was archived by the owner on Dec 14, 2023. It is now read-only.

Commit ab143eb

Add flag for Torch 2 attention

1 parent 0707da9

File tree

4 files changed: +14 -3 lines changed

  configs/my_config.yaml
  configs/offset_noise_finetune.yaml
  configs/single_video_config.yaml
  train.py

configs/my_config.yaml (3 additions, 0 deletions)

@@ -38,3 +38,6 @@ seed: 64
 mixed_precision: "fp16"
 use_8bit_adam: False # This seems to be incompatible at the moment.
 enable_xformers_memory_efficient_attention: False
+
+# Use scaled dot product attention (Only available with >= Torch 2.0)
+enable_torch_2_attn: True
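
The comment above notes the option is only available with Torch >= 2.0; train.py (diff below) gates it on a hasattr probe of torch.nn.functional. A quick standalone check of whether your environment can honor the flag (illustrative, not code from this repository):

import torch
import torch.nn.functional as F

# Same capability probe that train.py performs before switching attention paths.
print(torch.__version__)
print(hasattr(F, "scaled_dot_product_attention"))  # True on torch >= 2.0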

configs/offset_noise_finetune.yaml (3 additions, 0 deletions)

@@ -45,3 +45,6 @@ use_8bit_adam: False # This seems to be incompatible at the moment.
 
 # Xformers must be installed
 enable_xformers_memory_efficient_attention: True
+
+# Use scaled dot product attention (Only available with >= Torch 2.0)
+enable_torch_2_attn: True

configs/single_video_config.yaml (3 additions, 0 deletions)

@@ -38,3 +38,6 @@ seed: 64
 mixed_precision: "fp16"
 use_8bit_adam: False # This seems to be incompatible at the moment.
 enable_xformers_memory_efficient_attention: False
+
+# Use scaled dot product attention (Only available with >= Torch 2.0)
+enable_torch_2_attn: True
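
All three configs now expose the same switch. Under the hood it selects PyTorch 2.0's fused scaled dot product attention; a minimal standalone demonstration of the op itself (illustrative only, not the repository's wrapper code):

import torch
import torch.nn.functional as F

# Toy tensors shaped (batch, heads, sequence_length, head_dim).
q = torch.randn(1, 8, 16, 64)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Available on torch >= 2.0; dispatches to fused kernels (e.g. FlashAttention) when supported.
out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([1, 8, 16, 64])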

train.py (5 additions, 3 deletions)

@@ -101,7 +101,7 @@ def set_torch_2_attn(unet):
     if optim_count > 0:
         print(f"{optim_count} Attention layers using Scaled Dot Product Attention.")
 
-def handle_memory_attention(enable_xformers_memory_efficient_attention, unet):
+def handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet):
     try:
         is_torch_2 = hasattr(F, 'scaled_dot_product_attention')
 
@@ -111,7 +111,8 @@ def handle_memory_attention(enable_xformers_memory_efficient_attention, unet):
                 unet.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
             else:
                 raise ValueError("xformers is not available. Make sure it is installed correctly")
-        else:
+
+        if enable_torch_2_attn and is_torch_2:
             set_torch_2_attn(unet)
     except:
         print("Could not enable memory efficient attention for xformers or Torch 2.0.")
@@ -230,6 +231,7 @@ def main(
     mixed_precision: Optional[str] = "fp16",
     use_8bit_adam: bool = False,
     enable_xformers_memory_efficient_attention: bool = True,
+    enable_torch_2_attn: bool = False,
     seed: Optional[int] = None,
     train_text_encoder: bool = False,
     use_offset_noise: bool = False,
@@ -268,7 +270,7 @@ def main(
     freeze_models([vae, text_encoder, unet])
 
     # Enable xformers if available
-    handle_memory_attention(enable_xformers_memory_efficient_attention, unet)
+    handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet)
 
     if scale_lr:
         learning_rate = (
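
Pieced together from the hunks above, the updated handle_memory_attention plausibly reads as follows. The outer xformers condition, the is_xformers_available check, and the imports sit outside the diff context, so they are assumptions rather than the file's exact contents:

import torch.nn.functional as F
from diffusers.utils.import_utils import is_xformers_available  # assumed import

def handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet):
    try:
        is_torch_2 = hasattr(F, 'scaled_dot_product_attention')

        if enable_xformers_memory_efficient_attention:  # assumed condition (outside diff context)
            if is_xformers_available():  # assumed check (outside diff context)
                from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
                unet.enable_xformers_memory_efficient_attention(
                    attention_op=MemoryEfficientAttentionFlashAttentionOp
                )
            else:
                raise ValueError("xformers is not available. Make sure it is installed correctly")

        if enable_torch_2_attn and is_torch_2:
            # set_torch_2_attn is defined earlier in train.py (first hunk above);
            # its body is not part of this diff.
            set_torch_2_attn(unet)
    except:
        print("Could not enable memory efficient attention for xformers or Torch 2.0.")

The net effect of the change: the old implicit else-branch fallback to set_torch_2_attn is replaced by an explicit opt-in through the new enable_torch_2_attn flag, which main() now forwards from the configs.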

0 commit comments
