Commit 511a6f8

Add Chapter 11 (Policy Gradient REINFORCE) with fixed advantage normalization
1 parent b41bc9b commit 511a6f8

13 files changed: +223 additions, 0 deletions

.github/workflows/ch11.yml

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
name: ch11
on:
  push:
    paths: ['ch11_policy_gradient/**', '.github/workflows/ch11.yml']
  pull_request:
    paths: ['ch11_policy_gradient/**', '.github/workflows/ch11.yml']
jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix: { python-version: ['3.9','3.10','3.11'] }
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
      - run: |
          python -m pip install -U pip
          pip install -r ch11_policy_gradient/requirements.txt
      - env: { PYTHONPATH: . }
        run: pytest -q ch11_policy_gradient/tests
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
# Chapter 11 — Policy Gradient Fundamentals (REINFORCE)
Quickstart:
```bash
pip install -r ch11_policy_gradient/requirements.txt
pytest -q ch11_policy_gradient/tests
python -m ch11_policy_gradient.scripts.run_bandit_demo
```

ch11_policy_gradient/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from .agents.reinforce import Reinforce, Trajectory
from .policies.softmax import SoftmaxPolicy
from .policies.gaussian import GaussianPolicy1D
from .envs.bandit import TwoArmedBandit
from .utils.returns import returns_to_go, standardize

__all__ = [
    "Reinforce", "Trajectory", "SoftmaxPolicy", "GaussianPolicy1D",
    "TwoArmedBandit", "returns_to_go", "standardize",
]
ch11_policy_gradient/agents/reinforce.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
from dataclasses import dataclass
import numpy as np
from typing import NamedTuple, Callable, Optional
from ..utils.returns import returns_to_go, standardize

class Trajectory(NamedTuple):
    states: list
    actions: list
    rewards: list
    logps: list

@dataclass
class Reinforce:
    gamma: float = 1.0
    alpha: float = 0.05
    normalize_adv: bool = True
    baseline_fn: Optional[Callable[[object], float]] = None
    seed: Optional[int] = None
    def __post_init__(self):
        self.rng = np.random.default_rng(self.seed)
    def run_episode_discrete(self, env, policy, feature_fn: Callable[[object], np.ndarray]):
        s = env.reset(); S, A, R, L = [], [], [], []; done = False
        while not done:
            x = feature_fn(s); a = policy.sample(x)
            logp, _ = policy.logprob_and_grad(x, a)
            ns, r, done, _ = env.step(a)
            S.append(s); A.append(a); R.append(r); L.append(logp); s = ns
        return Trajectory(S, A, R, L)
    def update_discrete(self, traj: Trajectory, policy, feature_fn: Callable[[object], np.ndarray]):
        G = returns_to_go(traj.rewards, self.gamma)
        if self.baseline_fn is not None:
            b = np.array([self.baseline_fn(s) for s in traj.states], dtype=float); adv = G - b
        else:
            adv = G.copy()
        if self.normalize_adv:
            # Only standardize when there is variability; for 1-step episodes std==0 leads to zero updates.
            if len(adv) >= 2 and np.std(adv) > 1e-8:
                adv = standardize(adv)
        total_grad = np.zeros_like(policy.theta)
        for s, a, adv_t in zip(traj.states, traj.actions, adv):
            x = feature_fn(s); _, grad = policy.logprob_and_grad(x, a)
            total_grad += adv_t * grad
        policy.theta += self.alpha * total_grad
        return {"G": G, "adv": adv}
ch11_policy_gradient/envs/bandit.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import numpy as np
from dataclasses import dataclass
from typing import Optional
@dataclass
class TwoArmedBandit:
    q_star: tuple[float, float] = (1.0, 1.5)
    seed: Optional[int] = None
    def __post_init__(self):
        self.rng = np.random.default_rng(self.seed)
    @property
    def nA(self): return 2
    def reset(self):
        return np.array([1.0], dtype=float)  # x(s)=1
    def step(self, a: int):
        assert a in (0, 1)
        r = float(self.rng.normal(self.q_star[a], 1.0))
        return None, r, True, {}
ch11_policy_gradient/examples/bandit_softmax.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import numpy as np
from ..envs.bandit import TwoArmedBandit
from ..policies.softmax import SoftmaxPolicy
from ..agents.reinforce import Reinforce

def run(episodes=200, seed=0):
    env = TwoArmedBandit(q_star=(1.0, 1.5), seed=seed)
    x = np.array([1.0], dtype=float)
    policy = SoftmaxPolicy(nA=2, d=1, seed=seed)
    algo = Reinforce(gamma=1.0, alpha=0.05, normalize_adv=True, baseline_fn=None, seed=seed)
    probs_hist = []
    class EPEnv:
        def reset(self): return x
        def step(self, a):
            _, r, done, _ = env.step(a)
            return None, r, True, {}
    for _ in range(episodes):
        traj = algo.run_episode_discrete(EPEnv(), policy, lambda s: s)
        algo.update_discrete(traj, policy, lambda s: s)
        probs_hist.append(policy.probs(x).copy())
    return np.array(probs_hist), policy.theta

if __name__ == "__main__":
    probs, theta = run()
    print("Final action probabilities:", probs[-1])
ch11_policy_gradient/policies/gaussian.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
import numpy as np
from dataclasses import dataclass
from typing import Optional
@dataclass
class GaussianPolicy1D:
    mu: float = 0.0
    log_sigma: float = 0.0
    seed: Optional[int] = None
    def __post_init__(self):
        self.rng = np.random.default_rng(self.seed)
    @property
    def sigma(self) -> float:
        return float(np.exp(self.log_sigma))
    def sample(self, _x=None) -> float:
        return float(self.rng.normal(self.mu, self.sigma))
    def logprob_and_grad(self, a: float, _x=None):
        sigma2 = self.sigma ** 2
        logp = -0.5 * ((a - self.mu) ** 2 / sigma2 + np.log(2 * np.pi * sigma2))
        dmu = (a - self.mu) / sigma2
        dlogs = ((a - self.mu) ** 2) / sigma2 - 1.0
        return float(logp), np.array([dmu, dlogs], dtype=float)
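GaussianPolicy1D exposes the score with respect to (mu, log_sigma) rather than a theta matrix, so it does not plug directly into Reinforce.update_discrete. A hand-rolled one-step REINFORCE update on a toy continuous bandit might look like the sketch below; the reward r(a) = -(a - 2)^2 is an illustrative choice, not part of the chapter:

```python
from ch11_policy_gradient.policies.gaussian import GaussianPolicy1D

# Toy sketch: score-function (REINFORCE) ascent on E[r] with r(a) = -(a - 2)^2.
# Only the mean is updated here; log_sigma is left fixed to keep the example simple.
policy = GaussianPolicy1D(mu=0.0, log_sigma=0.0, seed=0)
alpha = 0.01
for _ in range(1000):
    a = policy.sample()
    r = -(a - 2.0) ** 2
    _, (dmu, _dlogs) = policy.logprob_and_grad(a)
    policy.mu += alpha * r * dmu  # stochastic gradient ascent via the log-likelihood trick
print("learned mu:", round(policy.mu, 2))  # should drift toward 2, up to sampling noise
```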
ch11_policy_gradient/policies/softmax.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import numpy as np
from dataclasses import dataclass
from typing import Optional
@dataclass
class SoftmaxPolicy:
    nA: int
    d: int
    theta: Optional[np.ndarray] = None
    seed: Optional[int] = None
    def __post_init__(self):
        if self.theta is None:
            self.theta = np.zeros((self.nA, self.d), dtype=float)
        self.rng = np.random.default_rng(self.seed)
    def prefs(self, x: np.ndarray) -> np.ndarray:
        return self.theta @ x
    def probs(self, x: np.ndarray) -> np.ndarray:
        h = self.prefs(x); h -= np.max(h)
        e = np.exp(h); return e / e.sum()
    def sample(self, x: np.ndarray) -> int:
        p = self.probs(x); return int(self.rng.choice(self.nA, p=p))
    def logprob_and_grad(self, x: np.ndarray, a: int):
        p = self.probs(x); logp = float(np.log(p[a] + 1e-12))
        grad = -np.outer(p, x); grad[a, :] += x
        return logp, grad
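A quick way to validate the analytic gradient returned by `logprob_and_grad` is a central finite-difference check over theta. A small sketch of that idea (the setup values are illustrative):

```python
import numpy as np
from ch11_policy_gradient.policies.softmax import SoftmaxPolicy

# Compare the analytic grad of log pi(a|x) with a central finite-difference estimate.
policy = SoftmaxPolicy(nA=2, d=1, seed=0)
x, a, eps = np.array([1.0]), 1, 1e-6
_, grad = policy.logprob_and_grad(x, a)

numeric = np.zeros_like(policy.theta)
for i in range(policy.theta.shape[0]):
    for j in range(policy.theta.shape[1]):
        policy.theta[i, j] += eps
        lp_plus, _ = policy.logprob_and_grad(x, a)
        policy.theta[i, j] -= 2 * eps
        lp_minus, _ = policy.logprob_and_grad(x, a)
        policy.theta[i, j] += eps  # restore the original parameter value
        numeric[i, j] = (lp_plus - lp_minus) / (2 * eps)

print(np.max(np.abs(grad - numeric)))  # should be roughly 1e-6 or smaller
```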
ch11_policy_gradient/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
numpy>=1.21
pytest>=7.0
ch11_policy_gradient/scripts/run_bandit_demo.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
from ch11_policy_gradient.examples.bandit_softmax import run
if __name__ == "__main__":
    probs, theta = run(episodes=300, seed=42)
    print("Last 5 probs:\n", probs[-5:])
    print("Final probs:", probs[-1].tolist())

0 commit comments
