
Commit 81c2b2e

T2T Team authored and copybara-github committed
Update adafactor so it can accept a callable learning rate.
PiperOrigin-RevId: 413718409
1 parent 86caf01 commit 81c2b2e

File tree

2 files changed: +49 −2 lines changed


tensor2tensor/utils/adafactor.py

Lines changed: 5 additions & 2 deletions
@@ -122,7 +122,7 @@ def __init__(self,
 
     Args:
       multiply_by_parameter_scale: a boolean
-      learning_rate: an optional Scalar.
+      learning_rate: an optional Scalar or callable.
       decay_rate: an optional Scalar.
       beta1: a float value between 0 and 1
       clipping_threshold: an optional float >= 1
@@ -218,7 +218,9 @@ def _resource_apply_dense(self, grad, handle):
     grad_squared = tf.square(grad) + self._epsilon1
     grad_squared_mean = tf.reduce_mean(grad_squared)
     decay_rate = self._decay_rate
-    update_scale = self._learning_rate
+    update_scale = self._call_if_callable(self._learning_rate)
+    update_scale = tf.convert_to_tensor(update_scale, name="update_scale")
+    update_scale = tf.cast(update_scale, grad_squared_mean.dtype.base_dtype)
     old_val = var
     if var.dtype.base_dtype == tf.bfloat16:
       old_val = tf.to_float(self._parameter_encoding.decode(old_val))
@@ -272,6 +274,7 @@ def _resource_apply_dense(self, grad, handle):
       new_val = quantization.simulated_quantize(
           var - subtrahend, self._simulated_quantize_bits,
          self._quantization_noise)
+    new_val = tf.cast(new_val, var.dtype)
     var_update = tf.assign(var, new_val, use_locking=self._use_locking)
     updates = [var_update] + updates
     return tf.group(*updates)
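For context, a minimal sketch of how the changed constructor can be used after this commit. AdafactorOptimizer, its learning_rate argument, and the _call_if_callable resolution come from the diff above; the step variable and the decay function are hypothetical illustration, not part of the commit:

import tensorflow as tf
from tensor2tensor.utils import adafactor

step = tf.Variable(0.0, trainable=False)

def decayed_lr():
  # Re-evaluated each time gradients are applied, because
  # _resource_apply_dense now resolves the learning rate via
  # _call_if_callable before converting and casting it.
  return 0.01 / (1.0 + step)

opt = adafactor.AdafactorOptimizer(learning_rate=decayed_lr)

Because the callable is resolved inside _resource_apply_dense, a schedule that reads a step counter is picked up on every update. The added tf.cast(new_val, var.dtype) presumably keeps the final tf.assign dtype-consistent when the variable dtype (e.g. bfloat16) differs from the float32 update math.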
tensor2tensor/utils/adafactor_test.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+# coding=utf-8
+# Copyright 2021 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for adafactor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensor2tensor.utils import adafactor
+
+import tensorflow as tf
+
+
+class AdafactorTest(tf.test.TestCase):
+
+  def testCallableLearningRate(self):
+    def lr():
+      return 0.01
+
+    opt = adafactor.AdafactorOptimizer(learning_rate=lr)
+    v1 = tf.Variable([1., 2.])
+    v2 = tf.Variable([3., 4.])
+    with tf.GradientTape() as tape:
+      tape.watch([v1, v2])
+      loss = v1 * v2
+    v1_grad, v2_grad = tape.gradient(loss, [v1, v2])
+    opt.apply_gradients(((v1_grad, v1), (v2_grad, v2)))
+
+
+if __name__ == '__main__':
+  tf.test.main()
