
Commit c84b7af

minnend authored and copybara-github committed
Updates estimate_tails to avoid infinite loops and find better solutions:

1. Terminate the search loop if the loss is small. This avoids an infinite loop when the loss is NaN or when the initial guess is already correct (a local minimum).
2. Keep track of the best value (lowest loss) encountered so far. This helps since the final value does not always have the lowest loss.
3. Reduce the learning rate (from 0.1 down to 0.01) during the 100 steps taken after a zero-crossing is found. This typically leads to a lower final loss.

PiperOrigin-RevId: 474370252
Change-Id: I6eab66547a8d920c7a12647681766140858faeb3
1 parent d323fa6 commit c84b7af
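
For intuition on change 3: the new update divides the base step of 0.1 by `k = sqrt(count + 1)`, so the effective learning rate stays at 0.1 while the counter is zero and decays to 0.1/sqrt(100) = 0.01 by the last of the 100 counted steps. A minimal sketch of that schedule in plain Python (the helper name is illustrative, not from the commit):

```python
import math

def effective_learning_rate(count):
  # Mirrors the update in the diff below: tails -= 0.1 * m / (k * tf.sqrt(v) + 1e-20)
  # with k = sqrt(count + 1), i.e. the 0.1 base rate scaled by 1 / sqrt(count + 1).
  return 0.1 / math.sqrt(count + 1)

print(effective_learning_rate(0))   # 0.1  -- while count is still zero
print(effective_learning_rate(99))  # 0.01 -- at the last of the 100 counted steps
```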

2 files changed: +38 -12 lines

tensorflow_compression/python/distributions/helpers.py

Lines changed: 28 additions & 12 deletions
```diff
@@ -57,32 +57,48 @@ def estimate_tails(func, target, shape, dtype):
   shape = tf.convert_to_tensor(shape, tf.int32)
   target = tf.convert_to_tensor(target, dtype)
 
-  def loop_cond(tails, m, v, count):
-    del tails, m, v  # unused
-    return tf.reduce_min(count) < 100
-
-  def loop_body(tails, prev_m, prev_v, count):
+  def loop_cond(tails, m, v, loss, count, best_tails, best_loss):
+    del tails, m, v, best_tails, best_loss  # unused
+    # By checking `loss`, we catch NaNs and protect against infinite loops
+    # from perfect initial guesses where there is no zero-crossing.
+    return tf.logical_and(tf.reduce_max(loss) > 1e-8,
+                          tf.reduce_min(count) < 100)
+
+  def loop_body(tails, prev_m, prev_v, loss, count, best_tails, best_loss):
+    del loss  # always recomputed
     with tf.GradientTape(watch_accessed_variables=False) as tape:
       tape.watch(tails)
       loss = abs(func(tails) - target)
+
+    # Keep track of the best (lowest loss) value so far.
+    condition = (loss < best_loss)
+    best_tails = tf.where(condition, tails, best_tails)
+    best_loss = tf.where(condition, loss, best_loss)
+
     grad = tape.gradient(loss, tails)
     m = (prev_m + grad) / 2  # Adam mean estimate.
     v = (prev_v + tf.square(grad)) / 2  # Adam variance estimate.
-    tails -= .1 * m / (tf.sqrt(v) + 1e-20)
+
+    # Reduce learning rate as count increases. This should lead to a more
+    # accurate final value.
+    k = tf.math.sqrt(tf.cast(count + 1, m.dtype))
+    tails -= 0.1 * m / (k * tf.sqrt(v) + 1e-20)
+
     # Start counting when the gradient flips sign. Since the function is
     # monotonic, m must have the same sign in all initial iterations, until
     # the optimal point is crossed. At that point the gradient flips sign.
-    count = tf.where(
-        tf.math.logical_or(count > 0, prev_m * grad < 0),
-        count + 1, count)
-    return tails, m, v, count
+    count = tf.where(tf.math.logical_or(count > 0, prev_m * grad < 0),
+                     count + 1, count)
+    return tails, m, v, loss, count, best_tails, best_loss
 
   init_tails = tf.zeros(shape, dtype=dtype)
   init_m = tf.zeros(shape, dtype=dtype)
   init_v = tf.ones(shape, dtype=dtype)
+  init_loss = init_v * dtype.max
   init_count = tf.zeros(shape, dtype=tf.int32)
-  return tf.while_loop(
-      loop_cond, loop_body, (init_tails, init_m, init_v, init_count))[0]
+  loop_vars = (init_tails, init_m, init_v, init_loss, init_count,
+               init_tails, init_loss)
+  return tf.while_loop(loop_cond, loop_body, loop_vars)[-2]
 
 
 def quantization_offset(distribution):
```
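
For context, a hedged usage sketch of the updated function (import path taken from the file above; `estimate_tails` searches for the input where `func` crosses `target`, so for a monotonic function like `tanh` the result should approximate the analytic inverse):

```python
import tensorflow as tf
from tensorflow_compression.python.distributions import helpers

# Search for x such that tanh(x) is approximately 0.5. The analytic answer is
# atanh(0.5) ~= 0.5493; estimate_tails should land close to it.
x = helpers.estimate_tails(tf.math.tanh, target=0.5, shape=(), dtype=tf.float32)
print(float(x), float(tf.math.atanh(0.5)))
```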

tensorflow_compression/python/distributions/helpers_test.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -22,6 +22,16 @@
 
 class HelpersTest(tf.test.TestCase):
 
+  def test_nan_terminates(self):
+    # Return a NaN tensor that would otherwise have a gradient wrt x.
+    func = lambda x: tf.math.tanh(x) * float("nan")
+    helpers.estimate_tails(func, target=0.5, shape=(), dtype=tf.float32)
+
+  def test_perfect_initial_guess_terminates(self):
+    # The initial guess is zero, which causes problems if the minimum is also
+    # at zero since then there's no zero-crossing to trigger the count.
+    helpers.estimate_tails(tf.math.tanh, target=0.0, shape=(), dtype=tf.float32)
+
   def test_cauchy_quantizes_to_mode_decimal_part(self):
     dist = tfp.distributions.Cauchy(loc=1.4, scale=3.)
     self.assertAllClose(helpers.quantization_offset(dist), 0.4)
```
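
To illustrate why the second test now terminates (a sketch, not part of the commit): after one pass through the body, the recomputed loss is |tanh(0) - 0| = 0, so the new condition `tf.reduce_max(loss) > 1e-8` is False and `tf.while_loop` exits before any zero-crossing is counted. The returned best value should therefore stay at the initial guess:

```python
import tensorflow as tf
from tensorflow_compression.python.distributions import helpers

# With a perfect initial guess, the loop exits via the loss check and the
# best-so-far tracking returns the untouched initial value of zero.
x = helpers.estimate_tails(tf.math.tanh, target=0.0, shape=(), dtype=tf.float32)
tf.debugging.assert_near(x, tf.zeros_like(x))
```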
