
Commit 08314f8

Add Adam and Nesterov Accelerated Gradient optimizers
- Implements Adam (Adaptive Moment Estimation) optimizer
- Implements Nesterov Accelerated Gradient (NAG) optimizer
- Both use pure NumPy without deep learning frameworks
- Includes comprehensive docstrings and type hints
- Adds doctests for validation
- Provides usage examples demonstrating convergence
- Follows PEP8 coding standards
- Part of issue #13662
1 parent 0b54468 commit 08314f8

3 files changed: 321 additions, 0 deletions

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
"""
Adam Optimizer

Implements Adam (Adaptive Moment Estimation) for neural network training using NumPy.
Adam combines momentum and adaptive learning rates using first and
second moment estimates.

Reference: https://arxiv.org/abs/1412.6980
Author: Adhithya Laxman Ravi Shankar Geetha
Date: 2025.10.21
"""

import numpy as np


class Adam:
    """
    Adam optimizer.

    Combines momentum and RMSProp:
    m = beta1 * m + (1 - beta1) * gradient
    v = beta2 * v + (1 - beta2) * gradient^2
    m_hat = m / (1 - beta1^t)
    v_hat = v / (1 - beta2^t)
    param = param - learning_rate * m_hat / (sqrt(v_hat) + epsilon)
    """

    def __init__(
        self,
        learning_rate: float = 0.001,
        beta1: float = 0.9,
        beta2: float = 0.999,
        epsilon: float = 1e-8,
    ) -> None:
        """
        Initialize Adam optimizer.

        Args:
            learning_rate (float): Learning rate.
            beta1 (float): Exponential decay rate for first moment.
            beta2 (float): Exponential decay rate for second moment.
            epsilon (float): Small constant for numerical stability.

        >>> optimizer = Adam(learning_rate=0.001, beta1=0.9, beta2=0.999)
        >>> optimizer.beta1
        0.9
        """
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m: dict[int, np.ndarray] = {}
        self.v: dict[int, np.ndarray] = {}
        self.t: dict[int, int] = {}

    def update(
        self, param_id: int, params: np.ndarray, gradients: np.ndarray
    ) -> np.ndarray:
        """
        Update parameters using Adam.

        Args:
            param_id (int): Unique identifier for parameter group.
            params (np.ndarray): Current parameters.
            gradients (np.ndarray): Gradients of parameters.

        Returns:
            np.ndarray: Updated parameters.

        >>> optimizer = Adam(learning_rate=0.1)
        >>> params = np.array([1.0, 2.0])
        >>> grads = np.array([0.1, 0.2])
        >>> updated = optimizer.update(0, params, grads)
        >>> updated.shape
        (2,)
        """
        if param_id not in self.m:
            self.m[param_id] = np.zeros_like(params)
            self.v[param_id] = np.zeros_like(params)
            self.t[param_id] = 0

        self.t[param_id] += 1

        self.m[param_id] = self.beta1 * self.m[param_id] + (1 - self.beta1) * gradients
        self.v[param_id] = self.beta2 * self.v[param_id] + (1 - self.beta2) * (
            gradients**2
        )

        m_hat = self.m[param_id] / (1 - self.beta1 ** self.t[param_id])
        v_hat = self.v[param_id] / (1 - self.beta2 ** self.t[param_id])

        return params - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)


# Usage example
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    print("Adam Example: Minimizing f(x) = x^2")

    optimizer = Adam(learning_rate=0.1)
    x = np.array([5.0])

    for step in range(20):
        gradient = 2 * x
        x = optimizer.update(0, x, gradient)
        if step % 5 == 0:
            print(f"Step {step}: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")

    print(f"Final: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
"""
Muon Optimizer

Implements Muon optimizer for neural network hidden layers using NumPy.
Muon uses Newton-Schulz orthogonalization iterations for improved convergence.

Reference: https://kellerjordan.github.io/posts/muon/
Author: Adhithya Laxman Ravi Shankar Geetha
Date: 2025.10.21
"""

import numpy as np


class Muon:
    """
    Muon optimizer for hidden layer weight matrices.

    Applies Newton-Schulz orthogonalization to gradients before updates.
    """

    def __init__(
        self, learning_rate: float = 0.02, momentum: float = 0.95, ns_steps: int = 5
    ) -> None:
        """
        Initialize Muon optimizer.

        Args:
            learning_rate (float): Learning rate for updates.
            momentum (float): Momentum factor.
            ns_steps (int): Number of Newton-Schulz iteration steps.

        >>> optimizer = Muon(learning_rate=0.02, momentum=0.95, ns_steps=5)
        >>> optimizer.momentum
        0.95
        """
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.ns_steps = ns_steps
        self.velocity: dict[int, np.ndarray] = {}

    def newton_schulz_orthogonalize(self, matrix: np.ndarray) -> np.ndarray:
        """
        Orthogonalize matrix using Newton-Schulz iterations.

        Args:
            matrix (np.ndarray): Input matrix.

        Returns:
            np.ndarray: Orthogonalized matrix.

        >>> optimizer = Muon()
        >>> mat = np.array([[1.0, 0.5], [0.5, 1.0]])
        >>> orth = optimizer.newton_schulz_orthogonalize(mat)
        >>> orth.shape
        (2, 2)
        """
        if matrix.shape[0] < matrix.shape[1]:
            matrix = matrix.T
            transposed = True
        else:
            transposed = False

        a = matrix.copy()
        for _ in range(self.ns_steps):
            a = 1.5 * a - 0.5 * a @ (a.T @ a)

        return a.T if transposed else a

    def update(
        self, param_id: int, params: np.ndarray, gradients: np.ndarray
    ) -> np.ndarray:
        """
        Update parameters using Muon.

        Args:
            param_id (int): Unique identifier for parameter group.
            params (np.ndarray): Current parameters.
            gradients (np.ndarray): Gradients of parameters.

        Returns:
            np.ndarray: Updated parameters.

        >>> optimizer = Muon(learning_rate=0.1, momentum=0.9)
        >>> params = np.array([[1.0, 2.0], [3.0, 4.0]])
        >>> grads = np.array([[0.1, 0.2], [0.3, 0.4]])
        >>> updated = optimizer.update(0, params, grads)
        >>> updated.shape
        (2, 2)
        """
        if param_id not in self.velocity:
            self.velocity[param_id] = np.zeros_like(params)

        ortho_grad = self.newton_schulz_orthogonalize(gradients)
        self.velocity[param_id] = self.momentum * self.velocity[param_id] + ortho_grad

        return params - self.learning_rate * self.velocity[param_id]


# Usage example
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    print("Muon Example: Optimizing a 2x2 matrix")

    optimizer = Muon(learning_rate=0.05, momentum=0.9)
    weights = np.array([[1.0, 2.0], [3.0, 4.0]])

    for step in range(10):
        gradients = 0.1 * weights  # Simplified gradient
        weights = optimizer.update(0, weights, gradients)
        if step % 3 == 0:
            print(f"Step {step}: weights =\n{weights}")

    print(f"Final weights:\n{weights}")
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
"""
Nesterov Accelerated Gradient (NAG) Optimizer

Implements Nesterov momentum for neural network training using NumPy.
NAG looks ahead and computes gradients at the anticipated position.

Reference: https://cs231n.github.io/neural-networks-3/#sgd
Author: Adhithya Laxman Ravi Shankar Geetha
Date: 2025.10.21
"""

import numpy as np


class NesterovAcceleratedGradient:
    """
    Nesterov Accelerated Gradient (NAG) optimizer.

    Updates parameters using Nesterov momentum:
    velocity = momentum * velocity - learning_rate * gradient_at_lookahead
    param = param + velocity
    """

    def __init__(self, learning_rate: float = 0.01, momentum: float = 0.9) -> None:
        """
        Initialize NAG optimizer.

        Args:
            learning_rate (float): Learning rate for weight updates.
            momentum (float): Momentum factor.

        >>> optimizer = NesterovAcceleratedGradient(learning_rate=0.01, momentum=0.9)
        >>> optimizer.momentum
        0.9
        """
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.velocity: dict[int, np.ndarray] = {}

    def update(
        self, param_id: int, params: np.ndarray, gradients: np.ndarray
    ) -> np.ndarray:
        """
        Update parameters using NAG.

        Args:
            param_id (int): Unique identifier for parameter group.
            params (np.ndarray): Current parameters.
            gradients (np.ndarray): Gradients at lookahead position.

        Returns:
            np.ndarray: Updated parameters.

        >>> optimizer = NesterovAcceleratedGradient(learning_rate=0.1, momentum=0.9)
        >>> params = np.array([1.0, 2.0])
        >>> grads = np.array([0.1, 0.2])
        >>> updated = optimizer.update(0, params, grads)
        >>> updated.shape
        (2,)
        """
        if param_id not in self.velocity:
            self.velocity[param_id] = np.zeros_like(params)

        velocity_prev = self.velocity[param_id].copy()
        self.velocity[param_id] = (
            self.momentum * self.velocity[param_id] - self.learning_rate * gradients
        )
        return (
            params
            - self.momentum * velocity_prev
            + (1 + self.momentum) * self.velocity[param_id]
        )


# Usage example
if __name__ == "__main__":
    import doctest

    doctest.testmod()

    print("NAG Example: Minimizing f(x) = x^2")

    optimizer = NesterovAcceleratedGradient(learning_rate=0.1, momentum=0.9)
    x = np.array([5.0])

    for step in range(20):
        gradient = 2 * x
        x = optimizer.update(0, x, gradient)
        if step % 5 == 0:
            print(f"Step {step}: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")

    print(f"Final: x = {x[0]:.4f}, f(x) = {x[0] ** 2:.4f}")
