Update train.py

ExponentialML · web-flow · commit 2a48e2d364e5 · 2023-03-30T00:56:09.000-07:00
Sometimes the smallest changes make the biggest difference.

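Also, add text encoder training setup: when `train_encoder` is set, cast the text encoder to float32 and enable its gradients; otherwise disable its gradients.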
diff --git a/train.py b/train.py
@@ -377,7 +377,9 @@ def finetune_unet(batch, train_encoder=False):
 
         # Set noise scheduler to cosine (this can be done via config, but this ensures it's enabled)
         #noise_scheduler.beta_schedule = "squaredcos_cap_v2"
-
+        
+        unet.train()
+        
         # Convert videos to latent space
         pixel_values = batch["pixel_values"].to(weight_dtype)
 
@@ -398,6 +400,10 @@ def finetune_unet(batch, train_encoder=False):
         # Enable text encoder training
         if train_encoder:
             text_encoder.train()
+            cast_to_gpu_and_type([text_encoder], accelerator, torch.float32)    
+            text_encoder.requires_grad_(True)
+        else:
+            text_encoder.requires_grad_(False)
 
         enable_trainable_unet_modules(unet, trainable_modules, is_enabled=True)
 
@@ -420,7 +426,6 @@ def finetune_unet(batch, train_encoder=False):
 
     for epoch in range(first_epoch, num_train_epochs):
         train_loss = 0.0
-        unet.train()
         
         for step, batch in enumerate(train_dataloader):
             # Skip steps until we reach the resumed step
@@ -483,7 +488,9 @@ def finetune_unet(batch, train_encoder=False):
                     if global_step == 1: print("Performing validation prompt.")
                     if accelerator.is_main_process:
                         with accelerator.autocast():
-
+                            unet.eval()
+                            text_encoder.eval()
+                            
                             pipeline = TextToVideoSDPipeline.from_pretrained(
                                 pretrained_model_path,
                                 text_encoder=text_encoder,