From 2a9f98a6b364925c42031631789ede4bbf29bada Mon Sep 17 00:00:00 2001 From: wyfEmma Date: Tue, 8 Jul 2025 16:00:55 +0000 Subject: [PATCH 1/5] Your descriptive commit message here --- submission_runner.py | 1 + .../schedule_free_adamw/__init__.py | 0 .../schedule_free_adamw/jax/__init__.py | 0 .../schedule_free_adamw/jax/schedule.py | 724 ++++++++++++++++++ .../jax/schedule_free_optax.py | 657 ++++++++++++++++ .../schedule_free_adamw/jax/submission.py | 222 ++++++ .../schedule_free_adamw/pytorch/__init__.py | 0 .../schedule_free_adamw/pytorch/submission.py | 328 ++++++++ 8 files changed, 1932 insertions(+) create mode 100644 tests/test_algorithms/schedule_free_adamw/__init__.py create mode 100644 tests/test_algorithms/schedule_free_adamw/jax/__init__.py create mode 100644 tests/test_algorithms/schedule_free_adamw/jax/schedule.py create mode 100644 tests/test_algorithms/schedule_free_adamw/jax/schedule_free_optax.py create mode 100644 tests/test_algorithms/schedule_free_adamw/jax/submission.py create mode 100644 tests/test_algorithms/schedule_free_adamw/pytorch/__init__.py create mode 100644 tests/test_algorithms/schedule_free_adamw/pytorch/submission.py diff --git a/submission_runner.py b/submission_runner.py index 468a04c7c..a042bed99 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -20,6 +20,7 @@ from inspect import signature import itertools import json +import jax import os import struct import time diff --git a/tests/test_algorithms/schedule_free_adamw/__init__.py b/tests/test_algorithms/schedule_free_adamw/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_algorithms/schedule_free_adamw/jax/__init__.py b/tests/test_algorithms/schedule_free_adamw/jax/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_algorithms/schedule_free_adamw/jax/schedule.py b/tests/test_algorithms/schedule_free_adamw/jax/schedule.py new file mode 100644 index 000000000..6c05153c6 --- /dev/null +++ b/tests/test_algorithms/schedule_free_adamw/jax/schedule.py @@ -0,0 +1,724 @@ +# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Optax Schedules. + +Schedules may be used to anneal the value of a hyper-parameter over time; for +instance, they may be used to anneal the learning rate used to update an agent's +parameters or the exploration factor used to select actions. +""" + +from typing import Iterable, Optional, Union + +from absl import logging +import chex +import jax +import jax.numpy as jnp +import numpy as np +from optax._src import base +# from optax.schedules import _join + +from typing import Sequence + +import chex +import jax.numpy as jnp + +from optax._src import base + + +def join_schedules( + schedules: Sequence[base.Schedule], + boundaries: Sequence[int] +) -> base.Schedule: + """Sequentially apply multiple schedules. 
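+
+  Examples:
+    An illustrative example (values chosen here, not taken from the upstream
+    optax docstring) joining two constant schedules at step 2:
+
+    >>> schedule_fn = join_schedules(
+    ...     [constant_schedule(3), constant_schedule(5)], boundaries=[2])
+    >>> print(*[int(schedule_fn(t)) for t in (0, 1, 2)])
+    3 3 5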
+ + Args: + schedules: A list of callables (expected to be optax schedules). Each + schedule will receive a step count indicating the number of steps since + the previous boundary transition. + boundaries: A list of integers (of length one less than schedules) that + indicate when to transition between schedules. + Returns: + schedule: A function that maps step counts to values. + """ + def schedule(step: chex.Numeric) -> chex.Numeric: + output = schedules[0](step) + for boundary, schedule in zip(boundaries, schedules[1:]): + output = jnp.where(step < boundary, output, schedule(step - boundary)) + return output + return schedule + + +def constant_schedule(value: Union[float, int]) -> base.Schedule: + """Constructs a constant schedule. + + Examples: + >>> schedule_fn = optax.constant_schedule(5) + >>> schedule_fn(0) + 5 + >>> schedule_fn(100) + 5 + + Args: + value: value to be held constant throughout. + + Returns: + schedule + A function that maps step counts to values. + """ + return lambda count: value + + +def polynomial_schedule( + init_value: chex.Scalar, + end_value: chex.Scalar, + power: chex.Scalar, + transition_steps: int, + transition_begin: int = 0, +) -> base.Schedule: + r"""Constructs a schedule with polynomial transition from init to end value. + + This function transitions the learning rate from an initial value + (``init_value``) to a final value (``end_value``) over a specified number of + steps (``transition_steps``) with a polynomial function of power ``power``. + The transition can optionally begin after a specified number of initial steps + (``transition_begin``). + + More precisely, the learning rate at iteration :math:`t` is given by: + + .. math:: + \begin{cases} + I, & \text{if } t < B \\ + (I - E) \left( 1 - \frac{t - B}{T} \right)^{P} + E, & + \text{if } B \leq t < B + T \\ + E, & \text{if } t \geq B + T + \end{cases} + + where :math:`I` is the initial value, :math:`E` is the end value, + :math:`B` is the transition begin, :math:`T` is the transition steps, + and :math:`P` is the power used for the polynomial transition. + + Examples: + >>> schedule_fn = optax.polynomial_schedule( + ... init_value=1.0, end_value=0.01, transition_steps=100, power=2) + >>> schedule_fn(0) # learning rate on the first iteration + Array(1., dtype=float32, weak_type=True) + >>> schedule_fn(100) # learning rate on the last iteration + Array(0.01, dtype=float32, weak_type=True) + + The following example uses a non-zero ``transition_begin``. In this case + the learning rate is kept constant for the first ``transition_begin`` + iterations: + + >>> schedule_fn = optax.polynomial_schedule( + ... init_value=1.0, + ... end_value=0.01, + ... transition_steps=100, + ... transition_begin=5, + ... power=2, + ... ) + >>> counts = [0, 5, 6, 104, 105, 110] + >>> print( + ... *[f'count:{i} value:{schedule_fn(i):.4f}' for i in counts], + ... sep='\n') + count:0 value:1.0000 + count:5 value:1.0000 + count:6 value:0.9803 + count:104 value:0.0101 + count:105 value:0.0100 + count:110 value:0.0100 + + Args: + init_value: initial value for the scalar to be annealed. + end_value: end value of the scalar to be annealed. + power: the power of the polynomial used to transition from init to end. + transition_steps: number of steps over which annealing takes place. + The scalar starts changing at ``transition_begin`` steps and completes + the transition by ``transition_begin + transition_steps`` steps. 
+ If ``transition_steps <= 0``, then the entire annealing process is + disabled and the value is held fixed at ``init_value``. + transition_begin: must be positive. After how many steps to start annealing + (before this many steps the scalar value is held fixed at ``init_value``). + + Returns: + schedule + A function that maps step counts to values. + """ + if transition_steps <= 0: + logging.info( + 'A polynomial schedule was set with a non-positive `transition_steps` ' + 'value; this results in a constant schedule with value `init_value`.' + ) + return lambda count: init_value + + if transition_begin < 0: + logging.info( + 'A polynomial schedule was set with a negative `transition_begin` ' + 'value; this will result in `transition_begin` falling back to `0`.' + ) + transition_begin = 0 + + def schedule(count): + count = jnp.clip(count - transition_begin, 0, transition_steps) + frac = 1 - count / transition_steps + return (init_value - end_value) * (frac**power) + end_value + + return schedule + + +def linear_schedule( + init_value: chex.Scalar, + end_value: chex.Scalar, + transition_steps: int, + transition_begin: int = 0, +) -> base.Schedule: + r"""Schedule with linear transition from ``init_value`` to ``end_value``. + + More precisely, the learning rate at iteration :math:`t` is given by: + + .. math:: + \begin{cases} + I, & \text{if } t < B \\ + I + \frac{t - B}{T} (E - I), & \text{if } B \leq t < B + T \\ + E, & \text{if } t \geq B + T + \end{cases} + + where :math:`I` is the initial value, :math:`E` is the end value, + :math:`B` is the transition begin, and :math:`T` is the transition steps. + + This schedule is equivalent to :func:`optax.polynomial_schedule` with + ``power=1``. + + Examples: + >>> schedule_fn = optax.linear_schedule( + ... init_value=1.0, end_value=0.01, transition_steps=100) + >>> schedule_fn(0) # learning rate on the first iteration + Array(1., dtype=float32, weak_type=True) + >>> schedule_fn(100) # learning rate on the last iteration + Array(0.01, dtype=float32, weak_type=True) + + Args: + init_value: initial value for the scalar to be annealed. + end_value: end value of the scalar to be annealed. + transition_steps: number of steps over which annealing takes place. The + scalar starts changing at ``transition_begin`` steps and completes the + transition by ``transition_begin + transition_steps`` steps. If + ``transition_steps <= 0``, then the entire annealing process is disabled + and the value is held fixed at ``init_value``. + transition_begin: must be positive. After how many steps to start annealing + (before this many steps the scalar value is held fixed at ``init_value``). + + Returns: + schedule + A function that maps step counts to values. + """ + return polynomial_schedule( + init_value=init_value, + end_value=end_value, + power=1, + transition_steps=transition_steps, + transition_begin=transition_begin, + ) + + +def piecewise_constant_schedule( + init_value: float, boundaries_and_scales: Optional[dict[int, float]] = None +) -> base.Schedule: + """Returns a function which implements a piecewise constant schedule. + + Args: + init_value: An initial value ``init_v``. + boundaries_and_scales: A map from boundaries ``b_i`` to non-negative scaling + factors ``f_i``. For any step count `s`, the schedule returns ``init_v`` + scaled by the product of all factors ``f_i`` such that ``b_i < s``. + + Returns: + schedule + A function that maps step counts to values. 
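+
+  Example (illustrative values, not taken from the upstream optax docstring):
+  a single boundary at step 3 with scale 0.5 halves the value from that step
+  onwards.
+
+    >>> schedule_fn = piecewise_constant_schedule(
+    ...     init_value=1.0, boundaries_and_scales={3: 0.5})
+    >>> print(*[float(schedule_fn(t)) for t in (0, 3, 10)])
+    1.0 0.5 0.5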
+ """ + if boundaries_and_scales is not None: + all_positive = all(scale >= 0.0 for scale in boundaries_and_scales.values()) + if not all_positive: + raise ValueError( + '`piecewise_constant_schedule` expects non-negative scale factors' + ) + + def schedule(count): + v = init_value + if boundaries_and_scales is not None: + for threshold, scale in sorted(boundaries_and_scales.items()): + indicator = jnp.maximum(0.0, jnp.sign(threshold - count)) + v = v * indicator + (1 - indicator) * scale * v + return v + + return schedule + + +def exponential_decay( + init_value: float, + transition_steps: int, + decay_rate: float, + transition_begin: int = 0, + staircase: bool = False, + end_value: Optional[float] = None, +) -> base.Schedule: + """Constructs a schedule with either continuous or discrete exponential decay. + + This function applies an exponential decay function to a provided initial + value. When ``count >= transition_begin`` the function returns the decayed + value as: + + .. code-block:: + + rate_factor = ((count - transition_begin) / transition_steps) + decayed_value = init_value * (decay_rate ** rate_factor) + + If the argument ``staircase`` is ``True`` then ``count / transition_steps`` is + an integer division and the decayed value follows a staircase function. + + Args: + init_value: the initial learning rate. + transition_steps: must be positive. See the decay computation above. + decay_rate: must not be zero. The decay rate. + transition_begin: must be positive. After how many steps to start annealing + (before this many steps the scalar value is held fixed at `init_value`). + staircase: if ``True``, decay the values at discrete intervals. + end_value: the value at which the exponential decay stops. When ``decay_rate + < 1``, ``end_value`` is treated as a lower bound, otherwise as an upper + bound. Has no effect when ``decay_rate = 0``. + + Returns: + schedule + A function that maps step counts to values. + """ + + if transition_steps <= 0: + logging.info( + 'An exponential schedule was set with a non-positive `transition_steps`' + ' value; this will result in a constant schedule with value ' + '`init_value`.' + ) + return lambda count: init_value + + if decay_rate == 0: + logging.info( + 'An exponential schedule was set with a zero `decay_rate` value; ' + 'this will result in a constant schedule with value `init_value`.' + ) + return lambda count: init_value + + if transition_begin < 0: + logging.info( + 'An exponential schedule was set with a negative `transition_begin` ' + 'value; this will result in `transition_begin` falling back to `0`.' + ) + transition_begin = 0 + + if end_value is not None: + clip_fn = jnp.maximum if decay_rate < 1.0 else jnp.minimum + + def schedule(count): + decreased_count = count - transition_begin + p = decreased_count / transition_steps + if staircase: + p = jnp.floor(p) + decayed_value = jnp.where( + decreased_count <= 0, init_value, init_value * jnp.power(decay_rate, p) + ) + if end_value is not None: + decayed_value = clip_fn(decayed_value, end_value) + return decayed_value + + return schedule + + +def cosine_decay_schedule( + init_value: float, + decay_steps: int, + alpha: float = 0.0, + exponent: float = 1.0, +) -> base.Schedule: + r"""Returns a function which implements cosine learning rate decay. + + This schedule smoothly decreases the learning rate over a specified number of + steps (``decay_steps``). The decay follows a cosine function, with an optional + exponent to modify the decay curve. 
A minimum value (``alpha``) ensures the + learning rate does not drop entirely to zero. + + More precisely, the learning rate at iteration :math:`t` is given by: + + .. math:: + \begin{cases} + \frac{I (1 - \alpha)}{2}(1+\cos(\pi\,\frac{t}{T})^p) + I \alpha\, + & \text{if } t \leq T \\ + I \alpha, & \text{if } t > T + \end{cases} + + where :math:`T` is the number of decay steps (``decay_steps``), :math:`p` is + the ``exponent`` and :math:`I` is the initial value (``init_value``). + + References: + Loshchilov et al., `SGDR: Stochastic Gradient Descent with Warm Restarts + `_, 2017 + + Args: + init_value: An initial value for the learning rate. + decay_steps: Positive integer - the number of steps for which to apply + the decay for. + alpha: The minimum value of the multiplier used to adjust the + learning rate. Defaults to 0.0. + exponent: The default decay is ``0.5 * (1 + cos(pi * t/T))``, where + ``t`` is the current timestep and ``T`` is the ``decay_steps``. The + exponent modifies this to be ``(0.5 * (1 + cos(pi * t/T))) ** exponent``. + Defaults to 1.0. + + Returns: + schedule + A function that maps step counts to values. + """ + if not decay_steps > 0: + raise ValueError( + 'The cosine_decay_schedule requires positive decay_steps, got' + f' {decay_steps=}.' + ) + + def schedule(count): + # Avoid int -> int32 overflow in jitted code. + nonlocal decay_steps + decay_steps, count = jax.tree.map( + lambda x: float(x) if isinstance(x, int) else x, (decay_steps, count) + ) + + count = jnp.minimum(count, decay_steps) + cosine_decay = 0.5 * (1 + jnp.cos(jnp.pi * count / decay_steps)) + decayed = (1 - alpha) * cosine_decay**exponent + alpha + return init_value * decayed + + return schedule + + +def _linear_interpolate(start: float, end: float, pct: float): + return (end - start) * pct + start + + +def _cosine_interpolate(start: float, end: float, pct: float): + return end + (start - end) / 2.0 * (jnp.cos(jnp.pi * pct) + 1) + + +def piecewise_interpolate_schedule( + interpolate_type: str, + init_value: float, + boundaries_and_scales: Optional[dict[int, float]] = None, +) -> base.Schedule: + """Returns a function which implements a piecewise interpolated schedule. + + Args: + interpolate_type: 'linear' or 'cosine', specifying the interpolation + strategy. + init_value: An initial value ``init_v``. + boundaries_and_scales: A map from boundaries ``b_i`` to non-negative scaling + factors ``f_i``. At boundary step ``b_i``, the schedule returns ``init_v`` + scaled by the product of all factors ``f_j`` such that ``b_j <= b_i``. The + values in between each boundary will be interpolated as per ``type``. + + Returns: + schedule + A function that maps step counts to values. 
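+
+  Example (illustrative values, not taken from the upstream optax docstring):
+  a linear ramp from the initial value down to one tenth of it over the first
+  10 steps.
+
+    >>> schedule_fn = piecewise_interpolate_schedule(
+    ...     'linear', init_value=1.0, boundaries_and_scales={10: 0.1})
+    >>> print(*[round(float(schedule_fn(t)), 3) for t in (0, 5)])
+    1.0 0.55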
+ """ + if interpolate_type == 'linear': + interpolate_fn = _linear_interpolate + elif interpolate_type == 'cosine': + interpolate_fn = _cosine_interpolate + else: + raise ValueError("`interpolate_type` must be either 'cos' or 'linear'") + + if boundaries_and_scales: + boundaries, scales = zip(*sorted(boundaries_and_scales.items())) + if not all(scale >= 0.0 for scale in scales): + raise ValueError( + '`piecewise_interpolate_schedule` expects non-negative scale factors' + ) + else: + boundaries, scales = (), () + + bounds = np.stack((0,) + boundaries) + values = np.cumprod(np.stack((init_value,) + scales)) + interval_sizes = bounds[1:] - bounds[:-1] + + def schedule(count): + indicator = (bounds[:-1] <= count) & (count < bounds[1:]) + pct = (count - bounds[:-1]) / interval_sizes + interp_vals = interpolate_fn(values[:-1], values[1:], pct) + return indicator.dot(interp_vals) + (bounds[-1] <= count) * values[-1] + + return schedule + + +def linear_onecycle_schedule( + transition_steps: int, + peak_value: float, + pct_start: float = 0.3, + pct_final: float = 0.85, + div_factor: float = 25.0, + final_div_factor: float = 1e4, +) -> base.Schedule: + r"""Returns a learning rate with three linear phases. + + * *Phase 1*, from iteration 0 to ``pct_start * transition_steps``. The + learning rate increases linearly from ``peak_value / div_factor`` to + ``peak_value``. + * *Phase 2*, from iteration ``pct_start * transition_steps`` to + ``pct_final * transition_steps``. The learning rate decreases linearly from + ``peak_value`` back to the initial ``peak_value/div_factor``. + * *Phase 3*: For the remaining steps, the learning rate interpolates between + ``peak_value/div_factor`` and ``peak_value / final_div_factor``. If + ``final_div_factor`` is larger than ``div_factor``, this is a decreasing + phase. + + + References: + Smith et al, `Super-Convergence: Very Fast Training of Neural Networks Using + Large Learning Rates `_, 2017 + + + Args: + transition_steps: Number of steps over which annealing takes place. + peak_value: Maximum value attained by schedule at pct_start percent of the + cycle (in number of steps). + pct_start: The percentage of the cycle (in number of steps) spent increasing + the learning rate. + pct_final: The percentage of the cycle (in number of steps) spent increasing + to ``peak_value`` then decreasing back to ``init_value``. + div_factor: Determines the initial value via ``init_value = peak_value / + div_factor``. + final_div_factor: Determines the final value via ``final_value = init_value + / final_div_factor``. + + Returns: + schedule + A function that maps step counts to values + """ + if transition_steps <= 0: + raise ValueError( + 'A linear onecycle schedule was set with a non-positive ' + '`transition_steps`' + ) + + return piecewise_interpolate_schedule( + 'linear', + peak_value / div_factor, + { + int(pct_start * transition_steps): div_factor, + int(pct_final * transition_steps): 1.0 / div_factor, + transition_steps: 1.0 / final_div_factor, + }, + ) + + +def cosine_onecycle_schedule( + transition_steps: int, + peak_value: float, + pct_start: float = 0.3, + div_factor: float = 25.0, + final_div_factor: float = 1e4, +) -> base.Schedule: + """Returns a function which implements the onecycle learning rate schedule. + + This learning rate increases the learning rate and then decreases it in a + cosine-like manner. The number of steps over which the learning rate increases + is determined by the ``pct_start`` argument. 
The maximum value of the learning + rate is determined by the ``peak_value`` argument, the initial value of the + learning rate is determined through the formula ``init_value = peak_value / + div_factor``, and the final value is determined by the ``final_div_factor`` + argument. + + References: + Smith et al, `Super-Convergence: Very Fast Training of Neural Networks Using + Large Learning Rates `_, 2017 + + Args: + transition_steps: Number of steps over which annealing takes place. + peak_value: Maximum value attained by schedule at pct_start percent of the + cycle (in number of steps). + pct_start: The percentage of the cycle (in number of steps) spent increasing + the learning rate. + div_factor: Determines the initial value via ``init_value = peak_value / + div_factor``. + final_div_factor: Determines the final value via ``final_value = init_value + / final_div_factor``. + + Returns: + schedule + A function that maps step counts to values + """ + if transition_steps <= 0: + raise ValueError( + 'A linear onecycle schedule was set with a non-positive ' + '`transition_steps`' + ) + + return piecewise_interpolate_schedule( + 'cosine', + peak_value / div_factor, + { + int(pct_start * transition_steps): div_factor, + int(transition_steps): 1.0 / (div_factor * final_div_factor), + }, + ) + + +def warmup_constant_schedule( + init_value: float, + peak_value: float, + warmup_steps: int, +) -> base.Schedule: + r"""Linear warmup followed by constant schedule i.e no decay. + + Args: + init_value: Initial value for the scalar to be annealed. + peak_value: Peak value for scalar to be annealed at end of warmup. + warmup_steps: Positive integer, the length of the linear warmup. + + Returns: + schedule + A function that maps step counts to values + """ + return linear_schedule( + init_value=init_value, + end_value=peak_value, + transition_steps=warmup_steps, + ) + + +def warmup_cosine_decay_schedule( + init_value: float, + peak_value: float, + warmup_steps: int, + decay_steps: int, + end_value: float = 0.0, + exponent: float = 1.0, +) -> base.Schedule: + r"""Linear warmup followed by cosine decay. + + Args: + init_value: Initial value for the scalar to be annealed. + peak_value: Peak value for scalar to be annealed at end of warmup. + warmup_steps: Positive integer, the length of the linear warmup. + decay_steps: Positive integer, the total length of the schedule. Note that + this includes the warmup time, so the number of steps during which cosine + annealing is applied is ``decay_steps - warmup_steps``. + end_value: End value of the scalar to be annealed. + exponent: The default decay is ``0.5 * (1 + cos(pi t/T))``, where + ``t`` is the current timestep and ``T`` is ``decay_steps``. The exponent + modifies this to be ``(0.5 * (1 + cos(pi * t/T))) ** exponent``. Defaults + to 1.0. 
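+
+  Example (illustrative values, not taken from the upstream optax docstring):
+  a 10-step linear warmup to ``peak_value`` followed by a cosine decay to
+  ``end_value`` by step 100.
+
+    >>> schedule_fn = warmup_cosine_decay_schedule(
+    ...     init_value=0.0, peak_value=1.0, warmup_steps=10,
+    ...     decay_steps=100, end_value=0.25)
+    >>> print(*[float(schedule_fn(t)) for t in (0, 10, 100)])
+    0.0 1.0 0.25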
+ + Returns: + schedule + A function that maps step counts to values + """ + alpha = 0.0 if peak_value == 0.0 else end_value / peak_value + schedules = [ + linear_schedule( + init_value=init_value, + end_value=peak_value, + transition_steps=warmup_steps, + ), + cosine_decay_schedule( + init_value=peak_value, + decay_steps=decay_steps - warmup_steps, + alpha=alpha, + exponent=exponent, + ), + ] + return join_schedules(schedules, [warmup_steps]) + + +def warmup_exponential_decay_schedule( + init_value: float, + peak_value: float, + warmup_steps: int, + transition_steps: int, + decay_rate: float, + transition_begin: int = 0, + staircase: bool = False, + end_value: Optional[float] = None, +) -> base.Schedule: + """Linear warmup followed by exponential decay. + + Args: + init_value: Initial value for the scalar to be annealed. + peak_value: Peak value for scalar to be annealed at end of warmup. + warmup_steps: Positive integer, the length of the linear warmup. + transition_steps: must be positive. See :func:`optax.exponential_decay` for + more details. + decay_rate: must not be zero. The decay rate. + transition_begin: must be positive. After how many steps to start annealing + (before this many steps the scalar value is held fixed at ``peak_value``). + staircase: if ``True``, decay the values at discrete intervals. + end_value: the value at which the exponential decay stops. When ``decay_rate + < 1``, ``end_value`` is treated as a lower bound, otherwise as an upper + bound. Has no effect when ``decay_rate = 0``. + + Returns: + schedule + A function that maps step counts to values + """ + schedules = [ + linear_schedule( + init_value=init_value, + end_value=peak_value, + transition_steps=warmup_steps, + ), + exponential_decay( + init_value=peak_value, + transition_steps=transition_steps, + decay_rate=decay_rate, + transition_begin=transition_begin, + staircase=staircase, + end_value=end_value, + ), + ] + return join_schedules(schedules, [warmup_steps]) + + +def sgdr_schedule( + cosine_kwargs: Iterable[dict[str, chex.Numeric]], +) -> base.Schedule: + """SGD with warm restarts. + + This learning rate schedule applies multiple joined cosine decay cycles. + + References: + Loshchilov et al., `SGDR: Stochastic Gradient Descent with Warm Restarts + `_, 2017 + + Args: + cosine_kwargs: An Iterable of dicts, where each element specifies the + arguments to pass to each cosine decay cycle. The ``decay_steps`` kwarg + will specify how long each cycle lasts for, and therefore when to + transition to the next cycle. + + Returns: + schedule + A function that maps step counts to values + """ + boundaries = [] + schedules = [] + step = 0 + for kwargs in cosine_kwargs: + schedules += [warmup_cosine_decay_schedule(**kwargs)] + boundaries += [step + kwargs['decay_steps']] + step += kwargs['decay_steps'] + return join_schedules(schedules, boundaries[:-1]) \ No newline at end of file diff --git a/tests/test_algorithms/schedule_free_adamw/jax/schedule_free_optax.py b/tests/test_algorithms/schedule_free_adamw/jax/schedule_free_optax.py new file mode 100644 index 000000000..43808948d --- /dev/null +++ b/tests/test_algorithms/schedule_free_adamw/jax/schedule_free_optax.py @@ -0,0 +1,657 @@ +# Copyright 2024 DeepMind Technologies Limited. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Schedule-Free wrapper for faster training & removes the need for lr decay.""" + +from typing import NamedTuple, Optional + +import chex +import jax +import jax.numpy as jnp +from optax._src import alias +from optax._src import base +from optax._src import combine +from optax._src import transform +# from optax.schedules import _schedule +from .schedule import warmup_constant_schedule +from optax._src import numerics +# from optax.transforms import _adding + + +from collections.abc import Callable +from typing import Any, NamedTuple, Optional, Union + +import chex +import jax +import jax.numpy as jnp + +# from optax import tree_utils as otu +from optax._src import base +# from optax._src import numerics +from optax._src import wrappers + +import functools + +Schedule = Callable[[chex.Numeric], chex.Numeric] +ScheduleState = Any +ScalarOrSchedule = Union[float, jax.Array, Schedule] + + +def safe_increment(count: chex.Numeric) -> chex.Numeric: + """Increments counter by one while avoiding overflow. + + Denote ``max_val``, ``min_val`` as the maximum, minimum, possible values for + the ``dtype`` of ``count``. Normally ``max_val + 1`` would overflow to + ``min_val``. This functions ensures that when ``max_val`` is reached the + counter stays at ``max_val``. + + Examples: + >>> import jax.numpy as jnp + >>> import optax + >>> optax.safe_increment(jnp.asarray(1, dtype=jnp.int32)) + Array(2, dtype=int32) + >>> optax.safe_increment(jnp.asarray(2147483647, dtype=jnp.int32)) + Array(2147483647, dtype=int32) + + .. versionadded:: 0.2.4 + + Args: + count: a counter to be incremented. + + Returns: + A counter incremented by 1, or ``max_val`` if the maximum value is + reached. + """ + count_dtype = jnp.asarray(count).dtype + if jnp.issubdtype(count_dtype, jnp.integer): + max_value = jnp.iinfo(count_dtype).max + elif jnp.issubdtype(count_dtype, jnp.floating): + max_value = jnp.finfo(count_dtype).max + else: + raise ValueError( + f'Cannot safely increment count with dtype {count_dtype},' + ' valid dtypes are subdtypes of "jnp.integer" or "jnp.floating".' + ) + max_value = jnp.array(max_value, count_dtype) + one = jnp.array(1, count_dtype) + return jnp.where(count < max_value, count + one, max_value) + +class ScaleByRmsWithCountState(NamedTuple): + """State for exponential root mean-squared (RMS)-normalized updates.""" + count: chex.Array # shape=(), dtype=jnp.int32. + nu: base.Updates + + +class ScaleByScheduleState(NamedTuple): + """Maintains count for scale scheduling.""" + count: chex.Array # shape=(), dtype=jnp.int32 + +def scale_by_schedule( + step_size_fn: base.Schedule +) -> base.GradientTransformation: + """Scale updates using a custom schedule for the `step_size`. + + Args: + step_size_fn: A function that takes an update count as input and proposes + the step_size to multiply the updates by. + + Returns: + A `GradientTransformation` object. 
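+
+  Example (a minimal usage sketch with illustrative values; the schedule here
+  is simply a constant lambda):
+
+    >>> tx = scale_by_schedule(lambda count: 0.5)
+    >>> state = tx.init({'w': jnp.zeros(2)})
+    >>> updates, state = tx.update({'w': jnp.ones(2)}, state)
+    >>> print(updates['w'])
+    [0.5 0.5]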
+ """ + + def init_fn(params): + del params + return ScaleByScheduleState(count=jnp.zeros([], jnp.int32)) + + def update_fn(updates, state, params=None): + del params + step_size = step_size_fn(state.count) + updates = jax.tree_util.tree_map( + lambda g: jnp.array(step_size, dtype=g.dtype) * g, updates) + return updates, ScaleByScheduleState( + count=safe_increment(state.count)) + + return base.GradientTransformation(init_fn, update_fn) + +def scale_by_learning_rate( + learning_rate: ScalarOrSchedule, + *, + flip_sign: bool = True, +) -> base.GradientTransformation: + """Scale by the (negative) learning rate (either as scalar or as schedule). + + Args: + learning_rate: Can either be a scalar or a schedule (i.e. a callable that + maps an (int) step to a float). + flip_sign: When set to True (the default) this corresponds to scaling by the + negative learning rate. + + Returns: + An optax.GradientTransformation that corresponds to multiplying the gradient + with `-learning_rate` (if flip_sign is True) or with `learning_rate` (if + flip_sign is False). + """ + m = -1 if flip_sign else 1 + if callable(learning_rate): + return scale_by_schedule(lambda count: m * learning_rate(count)) + return scale(m * learning_rate) + +class EmptyState(NamedTuple): + """An empty state for the simplest stateless transformations.""" + +def init_empty_state(params) -> EmptyState: + """Init function for a :class:`GradientTransformation` with empty state.""" + del params + return EmptyState() + +def tree_full_like( + tree: Any, + fill_value: jax.typing.ArrayLike, + dtype: Optional = None, +) -> Any: + """Creates an identical tree where all tensors are filled with ``fill_value``. + + Args: + tree: pytree. + fill_value: the fill value for all tensors in the tree. + dtype: optional dtype to use for the tensors in the tree. + + Returns: + an tree with the same structure as ``tree``. + """ + return jax.tree_util.tree_map( + lambda x: jnp.full_like(x, fill_value, dtype=dtype), tree) + +def tree_update_moment_per_elem_norm(updates, moments, decay, order): + """Compute the EMA of the `order`-th moment of the element-wise norm.""" + + def orderth_norm(g): + if jnp.isrealobj(g): + return g ** order + else: + half_order = order / 2 + # JAX generates different HLO for int and float `order` + if half_order.is_integer(): + half_order = int(half_order) + return numerics.abs_sq(g) ** half_order + + return jax.tree_util.tree_map( + lambda g, t: ( + (1 - decay) * orderth_norm(g) + decay * t if g is not None else None + ), + updates, + moments, + is_leaf=lambda x: x is None, + ) + +@functools.partial(jax.jit, inline=True) +def tree_bias_correction(moment, decay, count): + """Performs bias correction. It becomes a no-op as count goes to infinity.""" + # The conversion to the data type of the moment ensures that bfloat16 remains + # bfloat16 in the optimizer state. This conversion has to be done after + # `bias_correction_` is calculated as calculating `decay**count` in low + # precision can result in it being rounded to 1 and subsequently a + # "division by zero" error. + bias_correction_ = 1 - decay**count + + # Perform division in the original precision. + return jax.tree_util.tree_map( + lambda t: t / bias_correction_.astype(t.dtype), moment) + +def scale_by_rms( + decay: float = 0.9, + eps: float = 1e-8, + initial_scale: float = 0.0, + eps_in_sqrt: bool = True, + bias_correction: bool = False, +) -> base.GradientTransformation: + r"""Rescale updates by the root of the exp. moving avg of the square. + + .. 
warning:: + Default behavior of optax's RMSprop (``eps_in_sqrt=True``) differs from + Pytorch's implementation and could impact performance. + If ``eps_in_sqrt=True``, in the denominator, optax uses + :math:`\sqrt{v + \epsilon}` in the denominator whereas PyTorch uses + :math:`\sqrt{v} + \epsilon`. + Using ``eps_in_sqrt=False`` in optax will match PyTorch's behavior. + See + https://github.com/google-deepmind/optax/issues/532 for more detail. + + .. note:: + Using `scale_by_rms(decay=b2, eps_in_sqrt=False, bias_correction=True)` + will match the behavior of `scale_by_adam(b1=0, b2=b2)`, while sparing the + memory cost of storing the first moment. + + References: + Hinton, `Overview of mini-batch gradient descent` + `_, 2012 + + Args: + decay: Decay rate for the exponentially weighted average of squared grads. + eps: Term added to the denominator to improve numerical stability. + initial_scale: Initial value for second moment. + eps_in_sqrt: Whether to add ``eps`` in the square root of the + denominator or outside the square root. + bias_correction: Whether to apply bias correction to the exponentially + weighted average of squared grads. + + Returns: + A `GradientTransformation` object. + """ + + def init_fn(params): + nu = tree_full_like(params, initial_scale) # second moment + if bias_correction: + return ScaleByRmsWithCountState( + count=jnp.zeros([], jnp.int32), nu=nu + ) + else: + return transform.ScaleByRmsState(nu=nu) + + def update_fn(updates, state, params=None): + del params + nu = tree_update_moment_per_elem_norm(updates, state.nu, decay, 2) + if bias_correction: + count_inc = safe_increment(state.count) + nu_hat = tree_bias_correction(nu, decay, count_inc) + else: + count_inc = jnp.asarray(0) + nu_hat = nu + if eps_in_sqrt: + scaling = jax.tree_util.tree_map(lambda n: jax.lax.rsqrt(n + eps), nu_hat) + else: + scaling = jax.tree_util.tree_map(lambda n: 1/(jnp.sqrt(n) + eps), nu_hat) + updates = jax.tree_util.tree_map(lambda s, g: s * g, scaling, updates) + if bias_correction: + new_state = ScaleByRmsWithCountState(count=count_inc, nu=nu) + else: + new_state = transform.ScaleByRmsState(nu=nu) + return updates, new_state + + return base.GradientTransformation(init_fn, update_fn) + +def add_decayed_weights( + weight_decay: Union[float, jax.Array] = 0.0, + mask: Optional[Union[Any, Callable[[base.Params], Any]]] = None +) -> base.GradientTransformation: + """Add parameter scaled by `weight_decay`. + + Args: + weight_decay: A scalar weight decay rate. + mask: A tree with same structure as (or a prefix of) the params PyTree, + or a Callable that returns such a pytree given the params/updates. + The leaves should be booleans, `True` for leaves/subtrees you want to + apply the transformation to, and `False` for those you want to skip. + + Returns: + A `GradientTransformation` object. + """ + + def update_fn(updates, state, params): + if params is None: + raise ValueError(base.NO_PARAMS_MSG) + updates = jax.tree_util.tree_map( + lambda g, p: g + weight_decay * p, updates, params) + return updates, state + + # If mask is not `None`, apply mask to the gradient transformation. + # E.g. it is common to skip weight decay on bias units and batch stats. 
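+  # For example (an illustrative pattern, not one used by this submission), a
+  # mask such as
+  #   mask = jax.tree_util.tree_map(lambda p: p.ndim > 1, params)
+  # would restrict the decay to weight matrices and leave biases and batch
+  # statistics untouched.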
+ if mask is not None: + return wrappers.masked( + base.GradientTransformation(init_empty_state, update_fn), mask) + return base.GradientTransformation(init_empty_state, update_fn) + +class ScheduleFreeState(NamedTuple): + """State for schedule_free.""" + + b1: chex.Array + weight_sum: chex.Array + polyak_weight: chex.Array + step_count: chex.Array + max_lr: chex.Array + base_optimizer_state: base.OptState + z: base.Params + + +def schedule_free_eval_params(state: ScheduleFreeState, params: base.Params): + """Params for evaluation of :func:`optax.contrib.schedule_free`.""" + return jax.tree_util.tree_map( + lambda yi, zi: (yi - (1.0 - state.b1) * zi) / state.b1, params, state.z + ) + + +def schedule_free( + base_optimizer: base.GradientTransformation, + learning_rate: ScalarOrSchedule, + b1: float = 0.9, + weight_lr_power: float = 2.0, + adjusted_polyak_weight_exp: float = 0.75, + state_dtype=jnp.float32, +) -> base.GradientTransformationExtraArgs: + r"""Turn base_optimizer schedule_free. + + Accumulates updates returned by the base_optimizer w/o Momentum and + replaces the momentum of an underlying optimizer with a combination of + interpolation and averaging. In the case of gradient descent the update is + + .. math:: + + \begin{align*} + y_{t} & = (1-\beta_1)z_{t} + \beta_1 x_{t},\\ + z_{t+1} & =z_{t}-\gamma\nabla f(y_{t}),\\ + x_{t+1} & =\left(1-\frac{1}{t}\right)x_{t}+\frac{1}{t}z_{t+1}, + \end{align*} + + Here :math:`x` is the sequence that evaluations of test/val loss should occur + at, which differs from the primary iterates :math:`z` and the gradient + evaluation locations :math:`y`. The updates to :math:`z` correspond to the + underlying optimizer, in this case a simple gradient step. Note that, + :math:`\beta_1` corresponds to `b1` in the code. + + As the name suggests, Schedule-Free learning does not require a decreasing + learning rate schedule, yet typically out-performs, or at worst matches, SOTA + schedules such as cosine-decay and linear decay. Only two sequences need to be + stored at a time (the third can be computed from the other two on the fly) so + this method has the same memory requirements as the base optimizer (parameter + buffer + momentum). + + In practice, authors recommend tuning :math:`\beta_1`, `warmup_steps` and + `peak_lr` for each problem seperately. Default for :math:`\beta_1` is 0.9 but + `0.95` and `0.98` may also work well. Schedule-Free can be wrapped on top of + any optax optimizer. At test time, the parameters should be evaluated using + :func:`optax.contrib.schedule_free_eval_params` as presented below. + + For example, change this:: + + learning_rate_fn = optax.warmup_cosine_decay_schedule(peak_value=tuned_lr) + optimizer = optax.adam(learning_rate_fn, b1=b1) + + To:: + + learning_rate_fn = optax.warmup_constant_schedule(peak_value=retuned_lr) + optimizer = optax.adam(learning_rate_fn, b1=0.) + optimizer = optax.contrib.schedule_free(optimizer, learning_rate_fn, b1=b1) + .. + params_for_eval = optax.contrib.schedule_free_eval_params(state, params) + + Especially note that is important to switch off Momentum of the base + optimizer. As of Apr, 2024, schedule_free is tested with SGD and Adam. + + References: + Defazio et al, `The Road Less Scheduled + `_, 2024 + + Defazio et al, `Schedule-Free Learning - A New Way to Train + `_, 2024 + + Args: + base_optimizer: Base optimizer to compute updates from. + learning_rate: learning_rate schedule w/o decay but with warmup. + b1: beta_1 parameter in the y update. 
+ weight_lr_power: we downweight the weight of averaging using this. This is + especially helpful in early iterations during warmup. + adjusted_polyak_weight_exp: the polyak average weight is adjusted by this + exponent of the step_count. + state_dtype: dtype for z sequence in the schedule free method. + + Returns: + A `GradientTransformationExtraArgs` with init and update functions. + """ + base_optimizer = base.with_extra_args_support(base_optimizer) + + def init_fn(params: base.Params) -> ScheduleFreeState: + if b1 == 0: + raise ValueError( + 'The current implementation of schedule_free requires b1 > 0.') + z = jax.tree_util.tree_map(lambda t: t.astype(state_dtype), params) + return ScheduleFreeState( + b1=jnp.array(b1, dtype=jnp.float32), + polyak_weight=jnp.zeros([], dtype=jnp.float32), + weight_sum=jnp.zeros([], dtype=jnp.float32), + step_count=jnp.ones([], dtype=jnp.int32), + max_lr=jnp.zeros([], dtype=jnp.float32), + base_optimizer_state=base_optimizer.init(params), + z=z, + ) + + def update_fn( + grads: base.Updates, + state: ScheduleFreeState, + params: Optional[base.Params] = None, + **extra_args, + ): + lr = learning_rate + if callable(learning_rate): + lr = learning_rate(state.step_count) + max_lr = jnp.maximum(state.max_lr, lr) + + next_step_count = state.step_count + 1 + + weight = (next_step_count**adjusted_polyak_weight_exp) * ( + max_lr**weight_lr_power + ) + next_total_weight = state.weight_sum + weight + # We add this to avoid NaNs in the case of a small learning rate. + ck = jnp.where( + jnp.logical_or(jnp.isnan(weight), jnp.isnan(next_total_weight)), + jnp.full(weight.shape, jnp.nan), + jnp.nan_to_num(weight / next_total_weight, nan=0.0, posinf=jnp.inf), + ) + + base_updates, next_base_optimizer_state = base_optimizer.update( + grads, + state.base_optimizer_state, + params, + **extra_args, + ) + z = jax.tree_util.tree_map( + lambda pi, ui: jnp.asarray(pi + ui).astype(jnp.asarray(pi).dtype), + state.z, + base_updates, + ) + + # Important: recompute x to both save memory and maintain accurate x seq + # especially if y is modified by another transform wrapped on top. + prev_x = jax.tree_util.tree_map( + lambda yi, zi: (yi - (1.0 - b1) * zi) / b1, params, state.z + ) + + x = jax.tree_util.tree_map( + lambda xi, zi: (1.0 - ck) * xi + ck * zi, + prev_x, + z, + ) + new_params = jax.tree_util.tree_map( + lambda xi, zi: b1 * xi + (1.0 - b1) * zi, + x, + z, + ) + updates = jax.tree_util.tree_map( + lambda npi, pi: npi - pi, new_params, params + ) + + next_state = ScheduleFreeState( + b1=jnp.array(b1, dtype=jnp.float32), + weight_sum=next_total_weight, + polyak_weight=ck, + step_count=next_step_count, + max_lr=max_lr, + base_optimizer_state=next_base_optimizer_state, + z=z, + ) + + return updates, next_state + + return base.GradientTransformationExtraArgs(init_fn, update_fn) + + +def schedule_free_sgd( + learning_rate: float = 1.0, + *, + warmup_steps: int = 0, + b1: float = 0.9, + weight_decay: float = 0.0, + weight_lr_power: float = 2.0, + adjusted_polyak_weight_exp: float = 0.75, + state_dtype=jnp.float32, +) -> base.GradientTransformationExtraArgs: + """Schedule-Free wrapper for SGD. + + Shortcut example for using schedule_free with SGD, which is a common use case. + Note that this is just an example, and other use cases are possible, e.g. + using a weight decay mask. Note also that the EMA parameter of the + schedule free method (b1) must be strictly positive. + + Args: + learning_rate: SGD learning rate. + warmup_steps: positive integer, the length of the linear warmup. 
+ b1: beta_1 parameter in the y update. + weight_decay: Strength of the weight decay regularization. Note that this + weight decay is multiplied with the learning rate. This is consistent + with other frameworks such as PyTorch, but different from + (Loshchilov et al, 2019) where the weight decay is only multiplied with + the "schedule multiplier", but not the base learning rate. + weight_lr_power: we downweight the weight of averaging using this. This is + especially helpful in early iterations during warmup. + adjusted_polyak_weight_exp: the polyak average weight is adjusted by this + exponent of the step_count. + state_dtype: dtype for z sequence in the schedule free method. + + Returns: + A `GradientTransformationExtraArgs` with init and update functions. + + Examples: + >>> import optax + >>> import jax + >>> import jax.numpy as jnp + >>> def f(x): return jnp.sum(x ** 2) # simple quadratic function + >>> solver = optax.contrib.schedule_free_sgd() + >>> params = jnp.array([1., 2., 3.]) + >>> print('Objective function: ', f(params)) + Objective function: 14.0 + >>> opt_state = solver.init(params) + >>> for _ in range(5): + ... grad = jax.grad(f)(params) + ... updates, opt_state = solver.update(grad, opt_state, params) + ... params = optax.apply_updates(params, updates) + ... eval_params = optax.contrib.schedule_free_eval_params( + ... opt_state, params) + ... print('Objective function: {:.2E}'.format(f(eval_params))) + Objective function: 1.40E+01 + Objective function: 1.75E-14 + Objective function: 9.96E-01 + Objective function: 8.06E-01 + Objective function: 2.41E-01 + """ + if warmup_steps > 0: + learning_rate = warmup_constant_schedule( + init_value=0, + peak_value=learning_rate, + warmup_steps=warmup_steps, + ) + optimizer = alias.sgd(learning_rate) + if weight_decay > 0: + optimizer = combine.chain( + add_decayed_weights(weight_decay), optimizer) + return schedule_free( + optimizer, + learning_rate=learning_rate, + b1=b1, + weight_lr_power=weight_lr_power, + adjusted_polyak_weight_exp=adjusted_polyak_weight_exp, + state_dtype=state_dtype, + ) + + +def schedule_free_adamw( + learning_rate: float = 0.0025, + *, + warmup_steps: int = 0, + b1: float = 0.9, + b2: float = 0.999, + eps: float = 1e-8, + weight_decay: float = 0.0, + weight_lr_power: float = 2.0, + adjusted_polyak_weight_exp: float = 0.75, + state_dtype=jnp.float32, +) -> base.GradientTransformationExtraArgs: + """Schedule-Free wrapper for AdamW. + + Shortcut example for using schedule_free with AdamW, which is a common use + case. Note that this is just an example, and other usecases are possible, e.g. + using a weight decay mask, nesterov, etc. Note also that the EMA parameter of + the schedule free method (b1) must be strictly positive. + + Args: + learning_rate: AdamW learning rate. + warmup_steps: positive integer, the length of the linear warmup. + b1: beta_1 parameter in the y update. + b2: Exponential decay rate to track the second moment of past gradients. + eps: A small constant applied to denominator outside of the square root + (as in the Adam paper) to avoid dividing by zero when rescaling. + weight_decay: Strength of the weight decay regularization. + weight_lr_power: we downweight the weight of averaging using this. This is + especially helpful in early iterations during warmup. + adjusted_polyak_weight_exp: the polyak average weight is adjusted by this + exponent of the step_count. + state_dtype: dtype for z sequence in the schedule free method. 
+ + Returns: + A `GradientTransformationExtraArgs` with init and update functions. + + Examples: + >>> import optax + >>> import jax + >>> import jax.numpy as jnp + >>> def f(x): return jnp.sum(x ** 2) # simple quadratic function + >>> solver = optax.contrib.schedule_free_adamw(1.0) + >>> params = jnp.array([1., 2., 3.]) + >>> print('Objective function: ', f(params)) + Objective function: 14.0 + >>> opt_state = solver.init(params) + >>> for _ in range(5): + ... grad = jax.grad(f)(params) + ... updates, opt_state = solver.update(grad, opt_state, params) + ... params = optax.apply_updates(params, updates) + ... eval_params = optax.contrib.schedule_free_eval_params( + ... opt_state, params) + ... print('Objective function: {:.2E}'.format(f(eval_params))) + Objective function: 5.00E+00 + Objective function: 3.05E+00 + Objective function: 1.73E+00 + Objective function: 8.94E-01 + Objective function: 4.13E-01 + """ + if warmup_steps > 0: + learning_rate = warmup_constant_schedule( + init_value=0, + peak_value=learning_rate, + warmup_steps=warmup_steps, + ) + # The following is the same as adamw, but with the momentum term removed. + optimizer = combine.chain( + scale_by_rms( + decay=b2, eps=eps, eps_in_sqrt=False, bias_correction=True + ), + add_decayed_weights(weight_decay), + scale_by_learning_rate(learning_rate) + ) + return schedule_free( + optimizer, + learning_rate=learning_rate, + b1=b1, + weight_lr_power=weight_lr_power, + adjusted_polyak_weight_exp=adjusted_polyak_weight_exp, + state_dtype=state_dtype, + ) \ No newline at end of file diff --git a/tests/test_algorithms/schedule_free_adamw/jax/submission.py b/tests/test_algorithms/schedule_free_adamw/jax/submission.py new file mode 100644 index 000000000..8823826fc --- /dev/null +++ b/tests/test_algorithms/schedule_free_adamw/jax/submission.py @@ -0,0 +1,222 @@ +"""Submission file for an Schedule Free AdamW optimizer in Jax.""" + +import functools +from typing import Dict, Iterator, List, Tuple +import optax + +from flax import jax_utils +import jax +from jax import lax +import jax.numpy as jnp +from .schedule_free_optax import schedule_free_adamw +from algoperf import spec + +_GRAD_CLIP_EPS = 1e-6 + +HPARAMS = { + "dropout_rate": 0.1, + "learning_rate": 0.0025, + "one_minus_beta1": 0.1, + "beta2": 0.9955159689799007, + "weight_decay": 0.08121616522670176, + "warmup_factor": 0.02, + "weight_lr_power": 2, + "label_smoothing": 0.2, + "r": 0.75, + "eps": 1e-8, +} + +def init_optimizer_state(workload: spec.Workload, + model_params: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + rng: spec.RandomState) -> spec.OptimizerState: + """Creates an AdamW optimizer and a learning rate schedule.""" + del model_params + del model_state + del rng + lr=HPARAMS['learning_rate'] + betas=(1.0 - HPARAMS['one_minus_beta1'], HPARAMS['beta2']) + warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75) + weight_decay=HPARAMS['weight_decay'] + weight_lr_power=HPARAMS['weight_lr_power'] + r=HPARAMS['r'] + + opt_init_fn, opt_update_fn = schedule_free_adamw( + learning_rate=HPARAMS['learning_rate'], + warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75), + + b1=1.0 - HPARAMS['one_minus_beta1'], + b2=HPARAMS['beta2'], + eps=HPARAMS['eps'], + weight_decay=HPARAMS['weight_decay'], + adjusted_polyak_weight_exp=HPARAMS['r'], + weight_lr_power=HPARAMS['weight_lr_power'], + ) + params_zeros_like = jax.tree_map(lambda s: jnp.zeros(s.shape_tuple), + workload.param_shapes) + 
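+  # The optimizer state (including the schedule-free `z` sequence) is
+  # initialized from zero-filled arrays that only carry the workload's
+  # parameter shapes, a pattern commonly used by the JAX reference
+  # submissions; the live model parameters are not needed here.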
optimizer_state = opt_init_fn(params_zeros_like) + + return jax_utils.replicate(optimizer_state), opt_update_fn + + +@functools.partial( + jax.pmap, + axis_name='batch', + in_axes=(None, None, 0, 0, 0, 0, 0, None, None), + static_broadcasted_argnums=(0, 1), + donate_argnums=(2, 3, 4)) +def pmapped_train_step(workload, + opt_update_fn, + model_state, + optimizer_state, + current_param_container, + batch, + rng, + grad_clip, + label_smoothing): + + def _loss_fn(params): + """Loss function used for training.""" + logits, new_model_state = workload.model_fn( + params, + batch, + model_state, + spec.ForwardPassMode.TRAIN, + rng, + update_batch_norm=True) + loss_dict = workload.loss_fn( + label_batch=batch['targets'], + logits_batch=logits, + mask_batch=batch.get('weights'), + label_smoothing=label_smoothing) + summed_loss = loss_dict['summed'] + n_valid_examples = loss_dict['n_valid_examples'] + return summed_loss, (n_valid_examples, new_model_state) + + grad_fn = jax.value_and_grad(_loss_fn, has_aux=True) + (summed_loss, (n_valid_examples, new_model_state)), grad = grad_fn( + current_param_container) + # Get correct global mean loss and grad. + (summed_loss, n_valid_examples, grad) = lax.psum( + (summed_loss, n_valid_examples, grad), axis_name='batch') + loss = summed_loss / n_valid_examples + grad = jax.tree_map(lambda x: x / n_valid_examples, grad) + + grad_norm = jnp.sqrt( + sum(jnp.sum(g**2) for g in jax.tree_util.tree_leaves(grad))) + + # Extract the leaves of the pytree + leaves = jax.tree_util.tree_leaves(grad) + # Count the total number of elements in all leaves + total_size = sum(jnp.size(leaf) for leaf in leaves) + + jax.debug.print('GRAD NORM {}', grad_norm) + jax.debug.print('NUM PARAMS {}', total_size) + + if grad_clip is not None: + grad_scaling_factor = grad_clip / (grad_norm + _GRAD_CLIP_EPS) + grad_scaling_factor = jax.lax.clamp(min=0.0, x=grad_scaling_factor, max=1.0) + grad = jax.tree_map(lambda x: x * grad_scaling_factor, grad) + + updates, new_optimizer_state = opt_update_fn(grad, optimizer_state, + current_param_container) + updated_params = optax.apply_updates(current_param_container, updates) + return new_optimizer_state, updated_params, new_model_state, loss, grad_norm + + +def update_params(workload: spec.Workload, + current_param_container: spec.ParameterContainer, + current_params_types: spec.ParameterTypeTree, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + batch: Dict[str, spec.Tensor], + loss_type: spec.LossType, + optimizer_state: spec.OptimizerState, + eval_results: List[Tuple[int, float]], + global_step: int, + rng: spec.RandomState) -> spec.UpdateReturn: + """Return (updated_optimizer_state, updated_params, updated_model_state).""" + del current_params_types + del loss_type + del eval_results + + optimizer_state, opt_update_fn = optimizer_state + per_device_rngs = jax.random.split(rng, jax.local_device_count()) + if hasattr(hyperparameters, 'label_smoothing'): + label_smoothing = hyperparameters.label_smoothing + else: + label_smoothing = 0.0 + if hasattr(hyperparameters, 'grad_clip'): + grad_clip = hyperparameters.grad_clip + else: + grad_clip = None + outputs = pmapped_train_step(workload, + opt_update_fn, + model_state, + optimizer_state, + current_param_container, + batch, + per_device_rngs, + grad_clip, + label_smoothing) + new_optimizer_state, new_params, new_model_state, loss, grad_norm = outputs + + # Log loss, grad_norm. 
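+  # Metrics below are read from replica 0 of the pmapped outputs and emitted
+  # every 100 steps.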
+ if global_step % 100 == 0 and workload.metrics_logger is not None: + workload.metrics_logger.append_scalar_metrics( + { + 'loss': loss[0], + 'grad_norm': grad_norm[0], + }, global_step) + return (new_optimizer_state, opt_update_fn), new_params, new_model_state + + +def get_batch_size(workload_name): + # Return the global batch size. + if workload_name == 'criteo1tb': + return 262_144 + elif workload_name == 'fastmri': + return 32 + elif workload_name == 'imagenet_resnet': + return 1024 + elif workload_name == 'imagenet_resnet_silu': + return 512 + elif workload_name == 'imagenet_resnet_gelu': + return 512 + elif workload_name == 'imagenet_vit': + return 1024 + elif workload_name == 'librispeech_conformer': + return 256 + elif workload_name == 'librispeech_deepspeech': + return 256 + elif workload_name == 'ogbg': + return 512 + elif workload_name == 'wmt': + return 128 + elif workload_name == 'mnist': + return 16 + else: + raise ValueError(f'Unsupported workload name: {workload_name}.') + + +def data_selection(workload: spec.Workload, + input_queue: Iterator[Dict[str, spec.Tensor]], + optimizer_state: spec.OptimizerState, + current_param_container: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + global_step: int, + rng: spec.RandomState) -> Dict[str, spec.Tensor]: + """Select data from the infinitely repeating, pre-shuffled input queue. + Each element of the queue is a batch of training examples and labels. + """ + del workload + del optimizer_state + del current_param_container + del model_state + del hyperparameters + del global_step + del rng + batch = next(input_queue) + return batch \ No newline at end of file diff --git a/tests/test_algorithms/schedule_free_adamw/pytorch/__init__.py b/tests/test_algorithms/schedule_free_adamw/pytorch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py new file mode 100644 index 000000000..76fae0ca2 --- /dev/null +++ b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py @@ -0,0 +1,328 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
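+
+# PyTorch implementation of the Schedule-Free AdamW test submission. Between
+# steps the parameters hold the averaged iterate x; inside `step()` they are
+# temporarily interpolated towards the `z` sequence to form the gradient
+# evaluation point y = (1 - beta1) * z + beta1 * x, and are swapped back
+# after the closure has been evaluated.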
+import math +from typing import Dict, Iterator, List, Tuple +from absl import logging +import torch +import torch.distributed.nn as dist_nn + +from algorithmic_efficiency import spec +from algorithmic_efficiency.pytorch_utils import pytorch_setup + +USE_PYTORCH_DDP = pytorch_setup()[0] +HPARAMS = { + "dropout_rate": 0.1, + "learning_rate": 0.0025, + "one_minus_beta1": 0.1, + "beta2": 0.9955159689799007, + "weight_decay": 0.08121616522670176, + "warmup_factor": 0.02, + "weight_lr_power": 2, + "label_smoothing": 0.2, + "r": 0.75, + "conformer_bs": 192, +} + + +class AdamWScheduleFree(torch.optim.Optimizer): + r"""Schedule Free AdamW + """ + def __init__(self, params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + weight_lr_power=2, + warmup_steps=0, + r=0, + ): + defaults = dict(lr=lr, + betas=betas, + eps=eps, + r=r, + k=0, + weight_sum=0.0, + lr_max=0.0, + warmup_steps=warmup_steps, + weight_lr_power=weight_lr_power, + weight_decay=weight_decay) + + super().__init__(params, defaults) + + def reset(self): + for group in self.param_groups: + group['k'] = 0 + group['lr_max'] = 0 + group['weight_sum'] = 0 + + for p in group['params']: + # State initialization + state = self.state[p] + state['z'].copy_(state['x0']) + p.data.copy_(state['x0']) + state['exp_avg_sq'].zero_() + + def step(self, closure): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + # Swap to extrapolated point: + for group in self.param_groups: + beta1, beta2 = group['betas'] + r = group['r'] + k = group['k'] + + for p in group['params']: + # State initialization + state = self.state[p] + if 'z' not in state: + state['z'] = torch.clone(p.data) + state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.bfloat16) + state['x0'] = p.data.cpu() + + z = state['z'] + + # Extrapolate + #p = p + (1-beta1)*(z-p) + #p.data.mul_(beta1).add_(z, alpha=1-beta1) + p.data.lerp_(end=z, weight=1-beta1) + + # Evaluate gradient at extrapolated point + loss = closure() + + for group in self.param_groups: + eps = group['eps'] + k = group['k'] + warmup_steps = group['warmup_steps'] + + if k < warmup_steps: + sched = (k+1) / warmup_steps + else: + sched = 1.0 + annealed_lr = group['lr']*sched + + lr = max(annealed_lr, eps) + + decay = group['weight_decay'] + beta1, beta2 = group['betas'] + weight_lr_power = group['weight_lr_power'] + + r = group['r'] + lr_max = group['lr_max'] = max(lr, group['lr_max']) + + weight = ((k+1)**r) * (lr_max**weight_lr_power) + weight_sum = group['weight_sum'] = group['weight_sum'] + weight + + ckp1 = weight/weight_sum + + bias_correction2 = 1 - beta2 ** (k+1) + step_size = lr * math.sqrt(bias_correction2) + + num_params = 0 + grad_norm = 0 + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + + grad_norm = grad_norm + torch.sum(grad**2) + num_params = num_params + torch.numel(p) + + state = self.state[p] + + exp_avg_sq = state['exp_avg_sq'] + z = state['z'] + + # Unextrapolate + #p = (p - (1-beta1)*z)/beta1 + #p.data.sub_(z, alpha=1-beta1).div_(beta1) + p.data.lerp_(end=z, weight=1-1/beta1) + + # Decay the first and second moment running average coefficient + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1-beta2) + denom = exp_avg_sq.sqrt().add_(eps) + + z.addcdiv_(grad, denom, value=-step_size) + + # Decay + z.sub_(p.data, alpha=step_size*decay) + + ### Take step + #p.data.mul_(1-ckp1).add_(z, alpha=ckp1) + p.data.lerp_(end=z, weight=ckp1) + + group['k'] = k+1 + 
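+      # Debug instrumentation: report the accumulated gradient norm and the
+      # number of parameters in this parameter group.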
grad_norm = torch.sqrt(grad_norm) + print('GRAD NORM', grad_norm) + print('NUM_PARAMS', num_params) + + return loss + +def init_optimizer_state(workload: spec.Workload, + model_params: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + rng: spec.RandomState) -> spec.OptimizerState: + del model_state + + optimizer = AdamWScheduleFree( + model_params.parameters(), + lr=HPARAMS['learning_rate'], + betas=(1.0 - HPARAMS['one_minus_beta1'], HPARAMS['beta2']), + warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75), + weight_decay=HPARAMS['weight_decay'], + weight_lr_power=HPARAMS['weight_lr_power'], + r=HPARAMS['r']) + + optimizer_state = {'optimizer':optimizer, 'max_checked_eval_step': -1, 'has_forced_reset': False, 'first_eval': False, } + return optimizer_state + +def update_params(workload: spec.Workload, + current_param_container: spec.ParameterContainer, + current_params_types: spec.ParameterTypeTree, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + batch: Dict[str, spec.Tensor], + loss_type: spec.LossType, + optimizer_state: spec.OptimizerState, + eval_results: List[Tuple[int, float]], + global_step: int, + rng: spec.RandomState) -> spec.UpdateReturn: + """Return (updated_optimizer_state, updated_params, updated_model_state).""" + del current_params_types + del loss_type + del hyperparameters + + + metric_name = workload.target_metric_name + # TODO - remove force_reset + eval_step = len(eval_results) + if (global_step > workload.step_hint*0.10) and optimizer_state['max_checked_eval_step'] < eval_step: + optimizer_state['max_checked_eval_step'] = eval_step + # Don't do resetting on workloads that don't run eval often enough + if len(eval_results) >= 4: # and optimizer_state["first_eval_is_far_from_target"]: + val_metric = f"validation/{metric_name}" + initial_eval = eval_results[0][1][val_metric] + latest_eval = eval_results[-1][1][val_metric] + second_latest_eval = eval_results[-2][1][val_metric] + third_latest_eval = eval_results[-3][1][val_metric] + fourth_latest_eval = eval_results[-4][1][val_metric] + MARGIN = 0.01 + if metric_name in ["loss", "wer"]: + # Decreasing eval workloads should be flipped + initial_eval = -initial_eval + latest_eval = -latest_eval + second_latest_eval = -second_latest_eval + third_latest_eval = -third_latest_eval + fourth_latest_eval = -fourth_latest_eval + # Higher is better + # scale as a curve from 0 --> 1 + # if the eval values are far from the target (i.e. - worse than initial) and stays far from the target for 4 evals + if (latest_eval - initial_eval < MARGIN) and (latest_eval - second_latest_eval < MARGIN) and (second_latest_eval - third_latest_eval < MARGIN) and (third_latest_eval - fourth_latest_eval < MARGIN): + # Reset parameters since we appear to have diverged + logging.info("Reseting All Weights ") + logging.info(f"Global Step: {global_step}") + optimizer_state['has_forced_reset'] = True + + # Perform reset + del model_state + model_state = None + optimizer_state['optimizer'].reset() + + # Decrease learning rate by 2x if it diverged. 
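+          # (What "diverged" means here: with loss/WER flipped so that higher
+          # is better, the latest eval is not at least MARGIN better than the
+          # first eval, and none of the last three consecutive eval-to-eval
+          # gaps reached MARGIN either. reset() above restores z and p from
+          # the saved x0 and zeroes exp_avg_sq; the loop below then halves the
+          # base lr once per triggered reset. `has_forced_reset` is recorded
+          # but not read again in this file.)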
+ for param_group in optimizer_state['optimizer'].param_groups: + param_group['lr'] = param_group['lr']/2.0 + + ########### + + current_model = current_param_container + current_model.train() + + new_model_state = None + + def closure(): + nonlocal new_model_state + optimizer_state['optimizer'].zero_grad() + + logits_batch, new_model_state = workload.model_fn( + params=current_model, + augmented_and_preprocessed_input_batch=batch, + model_state=model_state, + mode=spec.ForwardPassMode.TRAIN, + rng=rng, + update_batch_norm=True) + + loss_dict = workload.loss_fn( + label_batch=batch['targets'], + logits_batch=logits_batch, + mask_batch=batch.get('weights'), + label_smoothing=HPARAMS['label_smoothing']) + summed_loss = loss_dict['summed'] + n_valid_examples = loss_dict['n_valid_examples'] + if USE_PYTORCH_DDP: + # Use dist_nn.all_reduce to ensure correct loss and gradient scaling. + summed_loss = dist_nn.all_reduce(summed_loss) + n_valid_examples = dist_nn.all_reduce(n_valid_examples) + loss = summed_loss / n_valid_examples + + loss.backward() + return loss + + loss = optimizer_state['optimizer'].step(closure) + + return (optimizer_state, current_param_container, new_model_state) + + +def get_batch_size(workload_name): + # Return the global batch size. + if workload_name == 'criteo1tb': + return 262_144 + elif workload_name == 'fastmri': + return 16 # 32 + elif workload_name == 'imagenet_resnet': + return 1024 + elif workload_name == 'imagenet_vit': + return 1024 + elif workload_name == 'librispeech_conformer': + return 224 + elif workload_name == 'librispeech_deepspeech': + return 128 # 256 + elif workload_name == 'ogbg': + return 512 + elif workload_name == 'wmt': + return 128 + elif workload_name == 'mnist': + return 16 + elif workload_name == 'imagenet_resnet_gelu': + return 512 + elif workload_name == 'imagenet_resnet_silu': + return 512 + else: + raise ValueError(f'Unsupported workload name: {workload_name}.') + +def data_selection(workload: spec.Workload, + input_queue: Iterator[Dict[str, spec.Tensor]], + optimizer_state: spec.OptimizerState, + current_param_container: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + global_step: int, + rng: spec.RandomState) -> Dict[str, spec.Tensor]: + """Select data from the infinitely repeating, pre-shuffled input queue. + Each element of the queue is a batch of training examples and labels. 
+ """ + del workload + del optimizer_state + del current_param_container + del model_state + del hyperparameters + del global_step + del rng + batch = next(input_queue) + return batch \ No newline at end of file From ff6a70753b445610c541b0035abf418f88c81181 Mon Sep 17 00:00:00 2001 From: "Yifan(Emma)" Date: Wed, 3 Sep 2025 03:34:02 +0000 Subject: [PATCH 2/5] Your new commit message here for fastmri --- .gitignore | 1 + .../schedule_free_adamw/compare_plot.py | 76 +++++++++++++++++++ .../data_point_analysis.py | 23 ++++++ .../schedule_free_adamw/jax/submission.py | 5 +- .../schedule_free_adamw/pytorch/submission.py | 4 +- 5 files changed, 103 insertions(+), 6 deletions(-) create mode 100644 tests/test_algorithms/schedule_free_adamw/compare_plot.py create mode 100644 tests/test_algorithms/schedule_free_adamw/data_point_analysis.py diff --git a/.gitignore b/.gitignore index 7d35f0ccc..72d112762 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ scoring/plots/ !scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv algoperf/_version.py +core* diff --git a/tests/test_algorithms/schedule_free_adamw/compare_plot.py b/tests/test_algorithms/schedule_free_adamw/compare_plot.py new file mode 100644 index 000000000..54a79cba7 --- /dev/null +++ b/tests/test_algorithms/schedule_free_adamw/compare_plot.py @@ -0,0 +1,76 @@ +import pandas as pd +import matplotlib.pyplot as plt +import os +import subprocess +import numpy as np +from scipy.interpolate import make_interp_spline, BSpline + +# Paths to CSV files +pytorch_csv = '/root/experiments/sfadamw6/ogbg_pytorch/trial_1/measurements.csv' +jax_csv = '/root/experiments/sfadamw7/ogbg_jax/trial_1/measurements.csv' + +# Read CSVs +try: + pytorch_df = pd.read_csv(pytorch_csv) + print("PyTorch CSV columns:", pytorch_df.columns) + jax_df = pd.read_csv(jax_csv) + print("JAX CSV columns:", jax_df.columns) +except FileNotFoundError as e: + print(f"Error: Could not find CSV file: {e}") + exit() +except pd.errors.EmptyDataError as e: + print(f"Error: CSV file is empty: {e}") + exit() +except Exception as e: + print(f"Error reading CSV file: {e}") + exit() + +# Define the correct column names based on inspection +x_column = 'global_step' +y_column = 'validation/loss' + +# Create output directory in /tmp which is writable +output_dir = '/tmp/plots' +os.makedirs(output_dir, exist_ok=True) +output_path = os.path.join(output_dir, 'adam_ogbg_plot.png') + +# --- FIX: Clean the data by dropping rows with NaN values --- +pytorch_df_cleaned = pytorch_df.dropna(subset=[y_column]) +jax_df_cleaned = jax_df.dropna(subset=[y_column]) + +# Print data after cleaning +print("\nCleaned PyTorch data:") +print(pytorch_df_cleaned[[x_column, y_column]].head()) +print("\nCleaned JAX data:") +print(jax_df_cleaned[[x_column, y_column]].head()) + +# Create the plot +plt.figure(figsize=(10, 6)) +plt.plot(pytorch_df_cleaned[x_column], pytorch_df_cleaned[y_column], label='PyTorch', marker='o') +plt.plot(jax_df_cleaned[x_column], jax_df_cleaned[y_column], label='JAX', marker='x') + +plt.xlabel('Step') +plt.ylabel('Metric') +plt.title('Comparison of PyTorch and JAX Metrics') +plt.legend() +plt.grid(True) +plt.tight_layout() + +# Save the plot +try: + plt.savefig(output_path) + print(f"Plot saved to: {output_path}") +except Exception as e: + print(f"Error saving plot: {e}") + +plt.close() + +# Open the plot in the default browser (if possible) +try: + subprocess.run(["$BROWSER", output_path], shell=True, check=True) +except FileNotFoundError: + print("No browser found 
to open the image.") +except subprocess.CalledProcessError: + print("Error opening the image in the browser.") +except Exception as e: + print(f"Error opening plot in browser: {e}") \ No newline at end of file diff --git a/tests/test_algorithms/schedule_free_adamw/data_point_analysis.py b/tests/test_algorithms/schedule_free_adamw/data_point_analysis.py new file mode 100644 index 000000000..85ada5300 --- /dev/null +++ b/tests/test_algorithms/schedule_free_adamw/data_point_analysis.py @@ -0,0 +1,23 @@ +import pandas as pd + +# Paths to CSV files +pytorch_csv = '/root/experiments/sfadamw6/ogbg_pytorch/trial_1/measurements.csv' +jax_csv = '/root/experiments/sfadamw6/ogbg_jax/trial_1/measurements.csv' + +# Read CSVs +try: + pytorch_df = pd.read_csv(pytorch_csv) + jax_df = pd.read_csv(jax_csv) +except FileNotFoundError as e: + print(f"Error: Could not find CSV file: {e}") + exit() +except pd.errors.EmptyDataError as e: + print(f"Error: CSV file is empty: {e}") + exit() +except Exception as e: + print(f"Error reading CSV file: {e}") + exit() + +# Print the number of data points +print(f"Number of data points in PyTorch CSV: {len(pytorch_df)}") +print(f"Number of data points in JAX CSV: {len(jax_df)}") \ No newline at end of file diff --git a/tests/test_algorithms/schedule_free_adamw/jax/submission.py b/tests/test_algorithms/schedule_free_adamw/jax/submission.py index 8823826fc..2b00fed82 100644 --- a/tests/test_algorithms/schedule_free_adamw/jax/submission.py +++ b/tests/test_algorithms/schedule_free_adamw/jax/submission.py @@ -32,7 +32,6 @@ def init_optimizer_state(workload: spec.Workload, hyperparameters: spec.Hyperparameters, rng: spec.RandomState) -> spec.OptimizerState: """Creates an AdamW optimizer and a learning rate schedule.""" - del model_params del model_state del rng lr=HPARAMS['learning_rate'] @@ -53,9 +52,7 @@ def init_optimizer_state(workload: spec.Workload, adjusted_polyak_weight_exp=HPARAMS['r'], weight_lr_power=HPARAMS['weight_lr_power'], ) - params_zeros_like = jax.tree_map(lambda s: jnp.zeros(s.shape_tuple), - workload.param_shapes) - optimizer_state = opt_init_fn(params_zeros_like) + optimizer_state = opt_init_fn(jax.device_get(jax_utils.unreplicate(model_params))) return jax_utils.replicate(optimizer_state), opt_update_fn diff --git a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py index 76fae0ca2..847cd07eb 100644 --- a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py +++ b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py @@ -9,8 +9,8 @@ import torch import torch.distributed.nn as dist_nn -from algorithmic_efficiency import spec -from algorithmic_efficiency.pytorch_utils import pytorch_setup +from algoperf import spec +from algoperf.pytorch_utils import pytorch_setup USE_PYTORCH_DDP = pytorch_setup()[0] HPARAMS = { From b4ea236600c98801072b3d851845b65f2e56f373 Mon Sep 17 00:00:00 2001 From: wyfEmma Date: Thu, 4 Sep 2025 04:08:43 +0000 Subject: [PATCH 3/5] hi --- algoperf/workloads/fastmri/input_pipeline.py | 4 ++-- submission_runner.py | 2 +- .../test_algorithms/schedule_free_adamw/pytorch/submission.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/algoperf/workloads/fastmri/input_pipeline.py b/algoperf/workloads/fastmri/input_pipeline.py index f20611f43..a48d8f82a 100644 --- a/algoperf/workloads/fastmri/input_pipeline.py +++ b/algoperf/workloads/fastmri/input_pipeline.py @@ -11,8 +11,8 @@ from algoperf import data_utils -_TRAIN_DIR = 
'knee_singlecoil_train' -_VAL_DIR = 'knee_singlecoil_val' +_TRAIN_DIR = 'fastmri/knee_singlecoil_train' +_VAL_DIR = 'fastmri/knee_singlecoil_val' _EVAL_SEED = 0 diff --git a/submission_runner.py b/submission_runner.py index a042bed99..06c3ad823 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -84,7 +84,7 @@ flags.DEFINE_integer('num_tuning_trials', 1, 'The number of external hyperparameter trials to run.') -flags.DEFINE_string('data_dir', '~/data', 'Dataset location.') +flags.DEFINE_string('data_dir', '/data', 'Dataset location.') flags.DEFINE_string('imagenet_v2_data_dir', None, 'Dataset location for ImageNet-v2.') diff --git a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py index 847cd07eb..a7000255e 100644 --- a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py +++ b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py @@ -284,7 +284,7 @@ def get_batch_size(workload_name): if workload_name == 'criteo1tb': return 262_144 elif workload_name == 'fastmri': - return 16 # 32 + return 32 elif workload_name == 'imagenet_resnet': return 1024 elif workload_name == 'imagenet_vit': From 6e1ab8e44914be5c8830e324683cad657f3a370c Mon Sep 17 00:00:00 2001 From: wyfEmma Date: Mon, 8 Sep 2025 03:51:34 +0000 Subject: [PATCH 4/5] change step hint --- algoperf/workloads/fastmri/workload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algoperf/workloads/fastmri/workload.py b/algoperf/workloads/fastmri/workload.py index 051749cc3..1653cc604 100644 --- a/algoperf/workloads/fastmri/workload.py +++ b/algoperf/workloads/fastmri/workload.py @@ -104,7 +104,7 @@ def eval_period_time_sec(self) -> int: @property def step_hint(self) -> int: """Approx. 
steps the baseline can do in the allowed runtime budget.""" - return 18_094 + return 36_189 def _build_input_queue(self, data_rng: spec.RandomState, From 07503bfe8faa9cbfbbf8a6737f84d68dafc337f3 Mon Sep 17 00:00:00 2001 From: "Yifan(Emma)" Date: Mon, 15 Dec 2025 19:25:57 +0000 Subject: [PATCH 5/5] fix for jax implementation of sfadamw --- algoperf/workloads/fastmri/input_pipeline.py | 4 +- .../workloads/imagenet_resnet/workload.py | 2 +- algoperf/workloads/imagenet_vit/workload.py | 2 +- .../librispeech_conformer/workload.py | 2 +- .../librispeech_jax/workload.py | 2 +- .../librispeech_pytorch/workload.py | 2 +- algoperf/workloads/ogbg/workload.py | 2 +- algoperf/workloads/wmt/workload.py | 2 +- docker/scripts/plot.py | 128 +++++++ submission_runner.py | 6 +- .../schedule_free_adamw/jax/submission.py | 362 ++++++++++-------- .../schedule_free_adamw/pytorch/submission.py | 13 +- tests/test_traindiffs.py | 4 +- 13 files changed, 350 insertions(+), 181 deletions(-) create mode 100644 docker/scripts/plot.py diff --git a/algoperf/workloads/fastmri/input_pipeline.py b/algoperf/workloads/fastmri/input_pipeline.py index 1d6ab6f22..62b3219c5 100644 --- a/algoperf/workloads/fastmri/input_pipeline.py +++ b/algoperf/workloads/fastmri/input_pipeline.py @@ -11,8 +11,8 @@ from algoperf import data_utils -_TRAIN_DIR = 'fastmri/knee_singlecoil_train' -_VAL_DIR = 'fastmri/knee_singlecoil_val' +_TRAIN_DIR = 'knee_singlecoil_train' +_VAL_DIR = 'knee_singlecoil_val' _EVAL_SEED = 0 diff --git a/algoperf/workloads/imagenet_resnet/workload.py b/algoperf/workloads/imagenet_resnet/workload.py index ef696e328..e41aee19d 100644 --- a/algoperf/workloads/imagenet_resnet/workload.py +++ b/algoperf/workloads/imagenet_resnet/workload.py @@ -145,4 +145,4 @@ def _build_input_queue( @property def step_hint(self) -> int: """Approx. steps the baseline can do in the allowed runtime budget.""" - return 195_999 + return 186_666 diff --git a/algoperf/workloads/imagenet_vit/workload.py b/algoperf/workloads/imagenet_vit/workload.py index 2a0070ba4..a94b6f197 100644 --- a/algoperf/workloads/imagenet_vit/workload.py +++ b/algoperf/workloads/imagenet_vit/workload.py @@ -121,4 +121,4 @@ def _build_dataset( @property def step_hint(self) -> int: """Approx. steps the baseline can do in the allowed runtime budget.""" - return 167_999 + return 186_666 diff --git a/algoperf/workloads/librispeech_conformer/workload.py b/algoperf/workloads/librispeech_conformer/workload.py index 791270719..e404a3d40 100644 --- a/algoperf/workloads/librispeech_conformer/workload.py +++ b/algoperf/workloads/librispeech_conformer/workload.py @@ -89,4 +89,4 @@ def eval_period_time_sec(self) -> int: @property def step_hint(self) -> int: """Approx. steps the baseline can do in the allowed runtime budget.""" - return 76_000 + return 80_000 diff --git a/algoperf/workloads/librispeech_deepspeech/librispeech_jax/workload.py b/algoperf/workloads/librispeech_deepspeech/librispeech_jax/workload.py index 3a320b0dd..5cca37362 100644 --- a/algoperf/workloads/librispeech_deepspeech/librispeech_jax/workload.py +++ b/algoperf/workloads/librispeech_deepspeech/librispeech_jax/workload.py @@ -96,7 +96,7 @@ def test_target_value(self) -> float: @property def step_hint(self) -> int: """Approx. 
steps the baseline can do in the allowed runtime budget.""" - return 38_400 + return 48_000 @property def max_allowed_runtime_sec(self) -> int: diff --git a/algoperf/workloads/librispeech_deepspeech/librispeech_pytorch/workload.py b/algoperf/workloads/librispeech_deepspeech/librispeech_pytorch/workload.py index 672f3440f..520c3905d 100644 --- a/algoperf/workloads/librispeech_deepspeech/librispeech_pytorch/workload.py +++ b/algoperf/workloads/librispeech_deepspeech/librispeech_pytorch/workload.py @@ -92,7 +92,7 @@ def test_target_value(self) -> float: @property def step_hint(self) -> int: """Approx. steps the baseline can do in the allowed runtime budget.""" - return 38_400 + return 48_000 @property def max_allowed_runtime_sec(self) -> int: diff --git a/algoperf/workloads/ogbg/workload.py b/algoperf/workloads/ogbg/workload.py index 8717e46d6..077c664ea 100644 --- a/algoperf/workloads/ogbg/workload.py +++ b/algoperf/workloads/ogbg/workload.py @@ -144,7 +144,7 @@ def loss_fn( @property def step_hint(self) -> int: """Approx. steps the baseline can do in the allowed runtime budget.""" - return 52_000 + return 80_000 @abc.abstractmethod def _normalize_eval_metrics( diff --git a/algoperf/workloads/wmt/workload.py b/algoperf/workloads/wmt/workload.py index 40e4262dd..64eb75aea 100644 --- a/algoperf/workloads/wmt/workload.py +++ b/algoperf/workloads/wmt/workload.py @@ -98,7 +98,7 @@ def eval_period_time_sec(self) -> int: @property def step_hint(self) -> int: """Approx. steps the baseline can do in the allowed runtime budget.""" - return 120_000 + return 133_333 @property def pre_ln(self) -> bool: diff --git a/docker/scripts/plot.py b/docker/scripts/plot.py new file mode 100644 index 000000000..20a46bfd5 --- /dev/null +++ b/docker/scripts/plot.py @@ -0,0 +1,128 @@ +# plot_results.py + +import pandas as pd +import matplotlib.pyplot as plt +from pathlib import Path +import collections + +# --- Configuration --- +# The base directory to start searching from. +# The script assumes it's run from the 'experiment_runs' directory. +SEARCH_DIR = Path("~/experiment_runs/tests/regression_tests/adamw").expanduser() + +# The directory where the output plots will be saved. +OUTPUT_DIR = Path("ssim_plots") + +# The columns to use for the x and y axes. +X_AXIS_COL = "global_step" +Y_AXIS_CANDIDATES = ["validation/loss", "validation/ctc_loss"] +# --------------------- + +def generate_plots(): + """ + Finds all 'measurements.csv' files, groups them by workflow, + and generates a JAX vs. PyTorch plot for each. + """ + # Create the output directory if it doesn't already exist + OUTPUT_DIR.mkdir(exist_ok=True) + print(f"šŸ“Š Plots will be saved to the '{OUTPUT_DIR}' directory.") + + # Use a dictionary to group file paths by their workflow name + # e.g., {'fastmri': [...], 'wmt': [...]} + workflow_files = collections.defaultdict(list) + + # Recursively find all 'measurements.csv' files in the search directory + for csv_path in SEARCH_DIR.rglob("measurements.csv"): + try: + # The directory name looks like 'fastmri_jax' or 'wmt_pytorch'. + # We get this from the parent of the parent of the csv file. 
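+            # (csv_path.parent is the trial directory, so .parent.parent is
+            # the '<workflow>_<framework>' directory; splitting on '_' and
+            # taking the last piece as the framework keeps multi-word
+            # workloads intact, e.g. 'librispeech_conformer_pytorch' ->
+            # ('librispeech_conformer', 'pytorch').)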
+ # e.g., .../fastmri_jax/trial_1/measurements.csv + workflow_framework_name = csv_path.parent.parent.name + + # Split the name to get the framework (last part) and workflow (everything else) + parts = workflow_framework_name.split('_') + framework = parts[-1] + workflow = '_'.join(parts[:-1]) + + # Store the path and framework for this workflow + if framework in ['jax', 'pytorch']: + workflow_files[workflow].append({'path': csv_path, 'framework': framework}) + + except IndexError: + # This handles cases where the directory name might not match the expected pattern + print(f"āš ļø Could not parse workflow/framework from path: {csv_path}") + continue + + if not workflow_files: + print("āŒ No 'measurements.csv' files found. Check the SEARCH_DIR variable and your folder structure.") + return + + print(f"\nFound {len(workflow_files)} workflows. Generating plots...") + + # Iterate through each workflow and its associated files to create a plot + for workflow, files in workflow_files.items(): + plt.style.use('seaborn-v0_8-whitegrid') + fig, ax = plt.subplots(figsize=(12, 7)) + + print(f" -> Processing workflow: '{workflow}'") + + y_axis_col_used = None # To store the name of the y-axis column for the plot labels + + # Plot data for each framework (JAX and PyTorch) on the same figure + for item in files: + try: + df = pd.read_csv(item['path']) + + y_axis_col = None + for candidate in Y_AXIS_CANDIDATES: + if candidate in df.columns: + y_axis_col = candidate + if not y_axis_col_used: + y_axis_col_used = y_axis_col # Set the label from the first file + break # Found a valid column, no need to check further + + # if item['framework'] == 'jax': + # y_axis_col = None + + # Check if the required columns exist in the CSV + if X_AXIS_COL in df.columns and y_axis_col: + + # 1. Forward-fill 'global_step' to propagate the last valid step downwards. + df[X_AXIS_COL] = df[X_AXIS_COL].ffill() + + # 2. Drop any rows where 'validation/ssim' is empty (NaN). + df_cleaned = df.dropna(subset=[y_axis_col]) + + # Plot the cleaned data + ax.plot( + df_cleaned[X_AXIS_COL], + df_cleaned[y_axis_col], + label=item['framework'].capitalize(), # e.g., 'Jax' + marker='.', + linestyle='-', + alpha=0.8 + ) + else: + print(f" - Skipping {item['path']} (missing required columns).") + + except Exception as e: + print(f" - ā—ļø Error reading {item['path']}: {e}") + + # Customize and save the plot + ax.set_title(f'Validation loss vs. 
Global Step for {workflow.replace("_", " ").title()}', fontsize=16) + ax.set_xlabel("Global Step", fontsize=12) + ax.set_ylabel("Validation loss", fontsize=12) + ax.legend(title="Framework", fontsize=10) + plt.tight_layout() + plt.yscale('log') + + # Define the output filename and save the figure + output_filename = OUTPUT_DIR / f"{workflow}_comparison.png" + plt.savefig(output_filename, dpi=150) + plt.close(fig) # Close the figure to free up memory + + print("\nāœ… All plots generated successfully!") + + +if __name__ == "__main__": + generate_plots() \ No newline at end of file diff --git a/submission_runner.py b/submission_runner.py index acbdc2c3c..ba47fa0ff 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -19,10 +19,10 @@ import importlib import itertools import json -import jax import os import struct import time +import optax from inspect import signature from types import MappingProxyType from typing import Any, Dict, Optional, Tuple @@ -861,9 +861,11 @@ def main(_): if __name__ == '__main__': + print(optax.__version__) + print("!!!!") flags.mark_flag_as_required('workload') flags.mark_flag_as_required('framework') flags.mark_flag_as_required('submission_path') flags.mark_flag_as_required('experiment_dir') flags.mark_flag_as_required('experiment_name') - app.run(main) + app.run(main) \ No newline at end of file diff --git a/tests/test_algorithms/schedule_free_adamw/jax/submission.py b/tests/test_algorithms/schedule_free_adamw/jax/submission.py index 2b00fed82..d7e3e400b 100644 --- a/tests/test_algorithms/schedule_free_adamw/jax/submission.py +++ b/tests/test_algorithms/schedule_free_adamw/jax/submission.py @@ -1,219 +1,269 @@ -"""Submission file for an Schedule Free AdamW optimizer in Jax.""" +"""Submission file for a Schedule Free AdamW optimizer in Jax.""" -import functools from typing import Dict, Iterator, List, Tuple -import optax +from functools import partial from flax import jax_utils import jax -from jax import lax import jax.numpy as jnp -from .schedule_free_optax import schedule_free_adamw +import optax +import numpy as np +from jax.sharding import NamedSharding, PartitionSpec as P + from algoperf import spec +# Ensure this import matches your file structure +from .schedule_free_optax import schedule_free_adamw _GRAD_CLIP_EPS = 1e-6 HPARAMS = { - "dropout_rate": 0.1, - "learning_rate": 0.0025, - "one_minus_beta1": 0.1, - "beta2": 0.9955159689799007, - "weight_decay": 0.08121616522670176, - "warmup_factor": 0.02, - "weight_lr_power": 2, - "label_smoothing": 0.2, - "r": 0.75, - "eps": 1e-8, + 'dropout_rate': 0.1, + 'learning_rate': 0.0025, + 'one_minus_beta1': 0.1, + 'beta2': 0.9955159689799007, + 'weight_decay': 0.08121616522670176, + 'warmup_factor': 0.02, + 'weight_lr_power': 2, + 'label_smoothing': 0.2, + 'r': 0.75, + 'eps': 1e-8, } -def init_optimizer_state(workload: spec.Workload, - model_params: spec.ParameterContainer, - model_state: spec.ModelAuxiliaryState, - hyperparameters: spec.Hyperparameters, - rng: spec.RandomState) -> spec.OptimizerState: - """Creates an AdamW optimizer and a learning rate schedule.""" +def init_optimizer_state( + workload: spec.Workload, + model_params: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + rng: spec.RandomState, +) -> spec.OptimizerState: + """Creates Schedule Free AdamW optimizer and state.""" del model_state del rng - lr=HPARAMS['learning_rate'] - betas=(1.0 - HPARAMS['one_minus_beta1'], HPARAMS['beta2']) - warmup_steps=int(HPARAMS['warmup_factor'] * 
workload.step_hint * 0.75) - weight_decay=HPARAMS['weight_decay'] - weight_lr_power=HPARAMS['weight_lr_power'] - r=HPARAMS['r'] + del hyperparameters opt_init_fn, opt_update_fn = schedule_free_adamw( learning_rate=HPARAMS['learning_rate'], warmup_steps=int(HPARAMS['warmup_factor'] * workload.step_hint * 0.75), - b1=1.0 - HPARAMS['one_minus_beta1'], b2=HPARAMS['beta2'], eps=HPARAMS['eps'], weight_decay=HPARAMS['weight_decay'], - adjusted_polyak_weight_exp=HPARAMS['r'], weight_lr_power=HPARAMS['weight_lr_power'], ) - optimizer_state = opt_init_fn(jax.device_get(jax_utils.unreplicate(model_params))) - - return jax_utils.replicate(optimizer_state), opt_update_fn - - -@functools.partial( - jax.pmap, - axis_name='batch', - in_axes=(None, None, 0, 0, 0, 0, 0, None, None), - static_broadcasted_argnums=(0, 1), - donate_argnums=(2, 3, 4)) -def pmapped_train_step(workload, - opt_update_fn, - model_state, - optimizer_state, - current_param_container, - batch, - rng, - grad_clip, - label_smoothing): + optimizer_state = opt_init_fn(model_params) + return optimizer_state, opt_update_fn + + +def train_step( + workload, + opt_update_fn, + model_state, + optimizer_state, + current_param_container, + batch, + rng, + grad_clip, + label_smoothing, + beta1 +): + + # 1. y = (1-beta1)z + beta1*x + z = optimizer_state.z + def interpolate(x, z): + z = z.astype(x.dtype) + return beta1 * x + (1.0 - beta1) * z + + params_y = jax.tree_util.tree_map(interpolate, current_param_container, z) + + # 2. Loss Function def _loss_fn(params): - """Loss function used for training.""" logits, new_model_state = workload.model_fn( params, batch, model_state, spec.ForwardPassMode.TRAIN, rng, - update_batch_norm=True) + update_batch_norm=True, + ) loss_dict = workload.loss_fn( label_batch=batch['targets'], logits_batch=logits, mask_batch=batch.get('weights'), - label_smoothing=label_smoothing) + label_smoothing=label_smoothing, + ) summed_loss = loss_dict['summed'] n_valid_examples = loss_dict['n_valid_examples'] return summed_loss, (n_valid_examples, new_model_state) + # 3. Gradients grad_fn = jax.value_and_grad(_loss_fn, has_aux=True) - (summed_loss, (n_valid_examples, new_model_state)), grad = grad_fn( - current_param_container) - # Get correct global mean loss and grad. - (summed_loss, n_valid_examples, grad) = lax.psum( - (summed_loss, n_valid_examples, grad), axis_name='batch') - loss = summed_loss / n_valid_examples - grad = jax.tree_map(lambda x: x / n_valid_examples, grad) + (summed_loss, (n_valid_examples, new_model_state)), grad = grad_fn(params_y) - grad_norm = jnp.sqrt( - sum(jnp.sum(g**2) for g in jax.tree_util.tree_leaves(grad))) + # 4. Sync + loss = summed_loss / n_valid_examples - # Extract the leaves of the pytree - leaves = jax.tree_util.tree_leaves(grad) - # Count the total number of elements in all leaves - total_size = sum(jnp.size(leaf) for leaf in leaves) + grad = jax.tree_util.tree_map(lambda x: x / n_valid_examples, grad) - jax.debug.print('GRAD NORM {}', grad_norm) - jax.debug.print('NUM PARAMS {}', total_size) + # 5. Clip + grad_norm = jnp.sqrt(sum(jnp.sum(g**2) for g in jax.tree_util.tree_leaves(grad))) if grad_clip is not None: grad_scaling_factor = grad_clip / (grad_norm + _GRAD_CLIP_EPS) grad_scaling_factor = jax.lax.clamp(min=0.0, x=grad_scaling_factor, max=1.0) - grad = jax.tree_map(lambda x: x * grad_scaling_factor, grad) + grad = jax.tree_util.tree_map(lambda x: x * grad_scaling_factor, grad) - updates, new_optimizer_state = opt_update_fn(grad, optimizer_state, - current_param_container) + # 6. 
Update + updates, new_optimizer_state = opt_update_fn( + grad, optimizer_state, current_param_container + ) updated_params = optax.apply_updates(current_param_container, updates) + return new_optimizer_state, updated_params, new_model_state, loss, grad_norm -def update_params(workload: spec.Workload, - current_param_container: spec.ParameterContainer, - current_params_types: spec.ParameterTypeTree, - model_state: spec.ModelAuxiliaryState, - hyperparameters: spec.Hyperparameters, - batch: Dict[str, spec.Tensor], - loss_type: spec.LossType, - optimizer_state: spec.OptimizerState, - eval_results: List[Tuple[int, float]], - global_step: int, - rng: spec.RandomState) -> spec.UpdateReturn: - """Return (updated_optimizer_state, updated_params, updated_model_state).""" - del current_params_types - del loss_type - del eval_results - +def update_params( + workload: spec.Workload, + current_param_container: spec.ParameterContainer, + current_params_types: spec.ParameterTypeTree, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + batch: Dict[str, spec.Tensor], + loss_type: spec.LossType, + optimizer_state: spec.OptimizerState, + eval_results: List[Tuple[int, float]], + global_step: int, + rng: spec.RandomState, + **kwargs, +) -> spec.UpdateReturn: + optimizer_state, opt_update_fn = optimizer_state - per_device_rngs = jax.random.split(rng, jax.local_device_count()) - if hasattr(hyperparameters, 'label_smoothing'): - label_smoothing = hyperparameters.label_smoothing - else: - label_smoothing = 0.0 + num_devices = jax.local_device_count() + beta1_value = 1.0 - HPARAMS['one_minus_beta1'] + + # 1. DEFINE THE MESH + # We create a mesh of all available devices named 'batch' + mesh = jax.sharding.Mesh(jax.devices(), ('batch',)) + replicated_spec = NamedSharding(mesh, P()) + data_sharding_spec = NamedSharding(mesh, P('batch')) + fsdp_sharding_spec = NamedSharding(mesh, P('batch')) + + # 2. Dynamically choose sharding based on rank + def get_sharding_spec(leaf): + # If it's an array with dimensions, shard it. + if hasattr(leaf, 'ndim') and leaf.ndim >= 1: + return fsdp_sharding_spec + # If it's a scalar (e.g., step count), replicate it. + return replicated_spec + + # 3. Create Sharding Trees matching objects + param_sharding_tree = jax.tree_util.tree_map(get_sharding_spec, current_param_container) + opt_state_sharding_tree = jax.tree_util.tree_map(get_sharding_spec, optimizer_state) + + # 3. DISTRIBUTE DATA (device_put) + # This moves data from CPU -> GPU and shards it immediately + batch = jax.device_put(batch, data_sharding_spec) + current_param_container = jax.device_put(current_param_container, param_sharding_tree) + optimizer_state = jax.device_put(optimizer_state, opt_state_sharding_tree) + + # Replicate small things + model_state = jax.device_put(model_state, replicated_spec) + rng = jax.device_put(rng, replicated_spec) + + # 4. COMPILE (JIT) + # We compile the train_step to run on this specific Sharding setup. + # out_shardings defines where the results land (keeping them sharded saves memory) + jitted_train_step = jax.jit( + train_step, + static_argnums=(0, 1), # workload, opt_update_fn + in_shardings=( + replicated_spec, # model_state + opt_state_sharding_tree, # optimizer_state (Keep Sharded!) + param_sharding_tree, # current_param_container (Keep Sharded!) 
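+          # (entries mirror train_step's non-static positional arguments in
+          # order; workload and opt_update_fn are excluded via static_argnums)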
+ data_sharding_spec, # batch + replicated_spec, # rng + replicated_spec, # grad_clip + replicated_spec, # label_smoothing + replicated_spec, # beta1 + ), + out_shardings=( + opt_state_sharding_tree, # new_optimizer_state + param_sharding_tree, # updated_params + replicated_spec, # new_model_state + replicated_spec, # loss + replicated_spec, # grad_norm + ) + ) + if hasattr(hyperparameters, 'grad_clip'): grad_clip = hyperparameters.grad_clip else: grad_clip = None - outputs = pmapped_train_step(workload, - opt_update_fn, - model_state, - optimizer_state, - current_param_container, - batch, - per_device_rngs, - grad_clip, - label_smoothing) - new_optimizer_state, new_params, new_model_state, loss, grad_norm = outputs - - # Log loss, grad_norm. - if global_step % 100 == 0 and workload.metrics_logger is not None: - workload.metrics_logger.append_scalar_metrics( - { - 'loss': loss[0], - 'grad_norm': grad_norm[0], - }, global_step) - return (new_optimizer_state, opt_update_fn), new_params, new_model_state + + if hasattr(hyperparameters, 'label_smoothing'): + label_smoothing = hyperparameters.label_smoothing + else: + label_smoothing = 0.0 + + # 5. EXECUTE + new_optimizer_state, new_params, new_model_state, loss, grad_norm = jitted_train_step( + workload, + opt_update_fn, + model_state, + optimizer_state, + current_param_container, + batch, + rng, + grad_clip, + label_smoothing, + beta1_value + ) + + # 6. RETURN STRATEGY + # we keep optimizer_state sharded on GPU to save bandwidth/memory for next step. + return ( + new_optimizer_state, + opt_update_fn + ), jax.device_get(new_params), jax.device_get(new_model_state) def get_batch_size(workload_name): - # Return the global batch size. - if workload_name == 'criteo1tb': - return 262_144 - elif workload_name == 'fastmri': - return 32 - elif workload_name == 'imagenet_resnet': - return 1024 - elif workload_name == 'imagenet_resnet_silu': - return 512 - elif workload_name == 'imagenet_resnet_gelu': - return 512 - elif workload_name == 'imagenet_vit': - return 1024 - elif workload_name == 'librispeech_conformer': - return 256 - elif workload_name == 'librispeech_deepspeech': - return 256 - elif workload_name == 'ogbg': - return 512 - elif workload_name == 'wmt': - return 128 - elif workload_name == 'mnist': - return 16 - else: - raise ValueError(f'Unsupported workload name: {workload_name}.') - - -def data_selection(workload: spec.Workload, - input_queue: Iterator[Dict[str, spec.Tensor]], - optimizer_state: spec.OptimizerState, - current_param_container: spec.ParameterContainer, - model_state: spec.ModelAuxiliaryState, - hyperparameters: spec.Hyperparameters, - global_step: int, - rng: spec.RandomState) -> Dict[str, spec.Tensor]: - """Select data from the infinitely repeating, pre-shuffled input queue. - Each element of the queue is a batch of training examples and labels. 
- """ - del workload - del optimizer_state - del current_param_container - del model_state - del hyperparameters - del global_step - del rng + if workload_name == 'criteo1tb': + return 262_144 + elif workload_name == 'fastmri': + return 16 + elif workload_name == 'imagenet_resnet': + return 1024 + elif workload_name == 'imagenet_resnet_silu': + return 512 + elif workload_name == 'imagenet_resnet_gelu': + return 512 + elif workload_name == 'imagenet_vit': + return 1024 + elif workload_name == 'librispeech_conformer': + return 256 + elif workload_name == 'librispeech_deepspeech': + return 128 + elif workload_name == 'ogbg': + return 512 + elif workload_name == 'wmt': + return 128 + elif workload_name == 'mnist': + return 16 + else: + raise ValueError(f'Unsupported workload name: {workload_name}.') + + +def data_selection( + workload: spec.Workload, + input_queue: Iterator[Dict[str, spec.Tensor]], + optimizer_state: spec.OptimizerState, + current_param_container: spec.ParameterContainer, + model_state: spec.ModelAuxiliaryState, + hyperparameters: spec.Hyperparameters, + global_step: int, + rng: spec.RandomState, +) -> Dict[str, spec.Tensor]: + del workload, optimizer_state, current_param_container, model_state, hyperparameters, global_step, rng batch = next(input_queue) - return batch \ No newline at end of file + return batch diff --git a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py index a7000255e..4fd76cde7 100644 --- a/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py +++ b/tests/test_algorithms/schedule_free_adamw/pytorch/submission.py @@ -124,16 +124,11 @@ def step(self, closure): bias_correction2 = 1 - beta2 ** (k+1) step_size = lr * math.sqrt(bias_correction2) - num_params = 0 - grad_norm = 0 for p in group['params']: if p.grad is None: continue grad = p.grad.data - - grad_norm = grad_norm + torch.sum(grad**2) - num_params = num_params + torch.numel(p) - + state = self.state[p] exp_avg_sq = state['exp_avg_sq'] @@ -158,10 +153,6 @@ def step(self, closure): p.data.lerp_(end=z, weight=ckp1) group['k'] = k+1 - grad_norm = torch.sqrt(grad_norm) - print('GRAD NORM', grad_norm) - print('NUM_PARAMS', num_params) - return loss def init_optimizer_state(workload: spec.Workload, @@ -284,7 +275,7 @@ def get_batch_size(workload_name): if workload_name == 'criteo1tb': return 262_144 elif workload_name == 'fastmri': - return 32 + return 16 elif workload_name == 'imagenet_resnet': return 1024 elif workload_name == 'imagenet_vit': diff --git a/tests/test_traindiffs.py b/tests/test_traindiffs.py index 8acfc855a..84cee4121 100644 --- a/tests/test_traindiffs.py +++ b/tests/test_traindiffs.py @@ -15,14 +15,12 @@ FLAGS = flags.FLAGS WORKLOADS = [ - 'imagenet_resnet', - 'imagenet_vit', 'wmt', - 'librispeech_conformer', 'librispeech_deepspeech', 'fastmri', 'ogbg', 'criteo1tb', + 'imagenet_resnet', ] GLOBAL_BATCH_SIZE = 16 NUM_TRAIN_STEPS = 10
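
For reference, a minimal, framework-free sketch of the schedule-free AdamW
recurrence that both the PyTorch and JAX submissions above implement
(illustrative only: the function and argument names are made up for this
sketch and do not appear in the patches; defaults roughly mirror HPARAMS):

import math

def schedule_free_adamw_sketch(grad_fn, x0, steps, lr=0.0025, beta1=0.9,
                               beta2=0.9955, eps=1e-8, weight_decay=0.0,
                               warmup_steps=0, r=0.75, weight_lr_power=2.0):
  """Scalar illustration of the x / y / z bookkeeping used above."""
  x = z = x0           # x: evaluated (averaged) point, z: Adam-style iterate
  v = 0.0              # second-moment estimate
  weight_sum = 0.0
  lr_max = 0.0
  for k in range(steps):
    # Gradient is taken at the interpolated point y, not at x or z.
    y = beta1 * x + (1.0 - beta1) * z
    g = grad_fn(y)
    # Linear warmup only; there is no decay schedule afterwards.
    sched = min(1.0, (k + 1) / warmup_steps) if warmup_steps else 1.0
    lr_k = max(lr * sched, eps)
    lr_max = max(lr_max, lr_k)
    # Adam-style update of z with a bias-corrected step size.
    v = beta2 * v + (1.0 - beta2) * g * g
    step_size = lr_k * math.sqrt(1.0 - beta2 ** (k + 1))
    z -= step_size * g / (math.sqrt(v) + eps)
    z -= step_size * weight_decay * x   # decoupled weight decay, taken at x
    # Weighted (Polyak-style) averaging drives x toward z.
    w = ((k + 1) ** r) * (lr_max ** weight_lr_power)
    weight_sum += w
    c = w / weight_sum
    x = (1.0 - c) * x + c * z
  return x

# Example: minimising f(u) = (u - 3)^2 via its gradient 2 * (u - 3).
# schedule_free_adamw_sketch(lambda u: 2.0 * (u - 3.0), x0=0.0, steps=2000)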