
Commit 08314f8

Add Adam and Nesterov Accelerated Gradient optimizers
- Implements Adam (Adaptive Moment Estimation) optimizer
- Implements Nesterov Accelerated Gradient (NAG) optimizer
- Both use pure NumPy without deep learning frameworks
- Includes comprehensive docstrings and type hints
- Adds doctests for validation
- Provides usage examples demonstrating convergence
- Follows PEP8 coding standards
- Part of issue #13662
1 parent 0b54468 commit 08314f8

3 files changed: 321 additions, 0 deletions

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
"""
Adam Optimizer

Implements Adam (Adaptive Moment Estimation) for neural network training using NumPy.
Adam combines momentum and adaptive learning rates using first and
second moment estimates.

Reference: https://arxiv.org/abs/1412.6980
Author: Adhithya Laxman Ravi Shankar Geetha
Date: 2025.10.21
"""

import numpy as np


class Adam:
    """
    Adam optimizer.

    Combines momentum and RMSProp:
    m = beta1 * m + (1 - beta1) * gradient
    v = beta2 * v + (1 - beta2) * gradient^2
    m_hat = m / (1 - beta1^t)
    v_hat = v / (1 - beta2^t)
    param = param - learning_rate * m_hat / (sqrt(v_hat) + epsilon)
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        beta1: float = 0.9,
        beta2: float = 0.999,
        epsilon: float = 1e-8,
    ) -> None:
        """
        Initialize Adam optimizer.

        Args:
            learning_rate (float): Learning rate.
            beta1 (float): Exponential decay rate for first moment.
            beta2 (float): Exponential decay rate for second moment.
            epsilon (float): Small constant for numerical stability.

        >>> optimizer = Adam(learning_rate=0.001, beta1=0.9, beta2=0.999)
        >>> optimizer.beta1
        0.9
        """
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m: dict[int, np.ndarray] = {}
        self.v: dict[int, np.ndarray] = {}
        self.t: dict[int, int] = {}

    def update(
        self, param_id: int, params: np.ndarray, gradients: np.ndarray
    ) -> np.ndarray:
        """
        Update parameters using Adam.

        Args:
            param_id (int): Unique identifier for parameter group.
            params (np.ndarray): Current parameters.
            gradients (np.ndarray): Gradients of parameters.

        Returns:
            np.ndarray: Updated parameters.

        >>> optimizer = Adam(learning_rate=0.1)
        >>> params = np.array([1.0, 2.0])
        >>> grads = np.array([0.1, 0.2])
        >>> updated = optimizer.update(0, params, grads)
        >>> updated.shape
        (2,)
        """
        if param_id not in self.m:
            self.m[param_id] = np.zeros_like(params)
            self.v[param_id] = np.zeros_like(params)
            self.t[param_id] = 0

        self.t[param_id] += 1

        self.m[param_id] = self.beta1 * self.m[param_id] + (1 - self.beta1) * gradients
        self.v[param_id] = self.beta2 * self.v[param_id] + (1 - self.beta2) * (
            gradients**2
        )

        m_hat = self.m[param_id] / (1 - self.beta1 ** self.t[param_id])
        v_hat = self.v[param_id] / (1 - self.beta2 ** self.t[param_id])

        return params - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)


# Usage example
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    print("Adam Example: Minimizing f(x) = x^2")

    optimizer = Adam(learning_rate=0.1)
    x = np.array([5.0])

    for step in range(20):
        gradient = 2 * x
        x = optimizer.update(0, x, gradient)
        if step % 5 == 0:
            print(f"Step {step}: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")

    print(f"Final: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
"""
Muon Optimizer

Implements Muon optimizer for neural network hidden layers using NumPy.
Muon uses Newton-Schulz orthogonalization iterations for improved convergence.

Reference: https://kellerjordan.github.io/posts/muon/
Author: Adhithya Laxman Ravi Shankar Geetha
Date: 2025.10.21
"""

import numpy as np


class Muon:
    """
    Muon optimizer for hidden layer weight matrices.

    Applies Newton-Schulz orthogonalization to gradients before updates.
    """

    def __init__(
        self, learning_rate: float = 0.02, momentum: float = 0.95, ns_steps: int = 5
    ) -> None:
        """
        Initialize Muon optimizer.

        Args:
            learning_rate (float): Learning rate for updates.
            momentum (float): Momentum factor.
            ns_steps (int): Number of Newton-Schulz iteration steps.

        >>> optimizer = Muon(learning_rate=0.02, momentum=0.95, ns_steps=5)
        >>> optimizer.momentum
        0.95
        """
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.ns_steps = ns_steps
        self.velocity: dict[int, np.ndarray] = {}

    def newton_schulz_orthogonalize(self, matrix: np.ndarray) -> np.ndarray:
        """
        Orthogonalize matrix using Newton-Schulz iterations.

        Args:
            matrix (np.ndarray): Input matrix.

        Returns:
            np.ndarray: Orthogonalized matrix.

        >>> optimizer = Muon()
        >>> mat = np.array([[1.0, 0.5], [0.5, 1.0]])
        >>> orth = optimizer.newton_schulz_orthogonalize(mat)
        >>> orth.shape
        (2, 2)
        """
        if matrix.shape[0] < matrix.shape[1]:
            matrix = matrix.T
            transposed = True
        else:
            transposed = False

        a = matrix.copy()
        for _ in range(self.ns_steps):
            a = 1.5 * a - 0.5 * a @ (a.T @ a)

        return a.T if transposed else a

    def update(
        self, param_id: int, params: np.ndarray, gradients: np.ndarray
    ) -> np.ndarray:
        """
        Update parameters using Muon.

        Args:
            param_id (int): Unique identifier for parameter group.
            params (np.ndarray): Current parameters.
            gradients (np.ndarray): Gradients of parameters.

        Returns:
            np.ndarray: Updated parameters.

        >>> optimizer = Muon(learning_rate=0.1, momentum=0.9)
        >>> params = np.array([[1.0, 2.0], [3.0, 4.0]])
        >>> grads = np.array([[0.1, 0.2], [0.3, 0.4]])
        >>> updated = optimizer.update(0, params, grads)
        >>> updated.shape
        (2, 2)
        """
        if param_id not in self.velocity:
            self.velocity[param_id] = np.zeros_like(params)

        ortho_grad = self.newton_schulz_orthogonalize(gradients)
        self.velocity[param_id] = self.momentum * self.velocity[param_id] + ortho_grad

        return params - self.learning_rate * self.velocity[param_id]


# Usage example
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    print("Muon Example: Optimizing a 2x2 matrix")

    optimizer = Muon(learning_rate=0.05, momentum=0.9)
    weights = np.array([[1.0, 2.0], [3.0, 4.0]])

    for step in range(10):
        gradients = 0.1 * weights  # Simplified gradient
        weights = optimizer.update(0, weights, gradients)
        if step % 3 == 0:
            print(f"Step {step}: weights =\n{weights}")

    print(f"Final weights:\n{weights}")
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
"""
Nesterov Accelerated Gradient (NAG) Optimizer

Implements Nesterov momentum for neural network training using NumPy.
NAG looks ahead and computes gradients at the anticipated position.

Reference: https://cs231n.github.io/neural-networks-3/#sgd
Author: Adhithya Laxman Ravi Shankar Geetha
Date: 2025.10.21
"""

import numpy as np


class NesterovAcceleratedGradient:
    """
    Nesterov Accelerated Gradient (NAG) optimizer.

    Updates parameters using Nesterov momentum:
    velocity = momentum * velocity - learning_rate * gradient_at_lookahead
    param = param + velocity
    """

    def __init__(self, learning_rate: float = 0.01, momentum: float = 0.9) -> None:
        """
        Initialize NAG optimizer.

        Args:
            learning_rate (float): Learning rate for weight updates.
            momentum (float): Momentum factor.

        >>> optimizer = NesterovAcceleratedGradient(learning_rate=0.01, momentum=0.9)
        >>> optimizer.momentum
        0.9
        """
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.velocity: dict[int, np.ndarray] = {}

    def update(
        self, param_id: int, params: np.ndarray, gradients: np.ndarray
    ) -> np.ndarray:
        """
        Update parameters using NAG.

        Args:
            param_id (int): Unique identifier for parameter group.
            params (np.ndarray): Current parameters.
            gradients (np.ndarray): Gradients at lookahead position.

        Returns:
            np.ndarray: Updated parameters.

        >>> optimizer = NesterovAcceleratedGradient(learning_rate=0.1, momentum=0.9)
        >>> params = np.array([1.0, 2.0])
        >>> grads = np.array([0.1, 0.2])
        >>> updated = optimizer.update(0, params, grads)
        >>> updated.shape
        (2,)
        """
        if param_id not in self.velocity:
            self.velocity[param_id] = np.zeros_like(params)

        velocity_prev = self.velocity[param_id].copy()
        self.velocity[param_id] = (
            self.momentum * self.velocity[param_id] - self.learning_rate * gradients
        )
        return (
            params
            - self.momentum * velocity_prev
            + (1 + self.momentum) * self.velocity[param_id]
        )


# Usage example
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    print("NAG Example: Minimizing f(x) = x^2")

    optimizer = NesterovAcceleratedGradient(learning_rate=0.1, momentum=0.9)
    x = np.array([5.0])

    for step in range(20):
        gradient = 2 * x
        x = optimizer.update(0, x, gradient)
        if step % 5 == 0:
            print(f"Step {step}: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")

    print(f"Final: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
