Commit 511a6f8

Add Chapter 11 (Policy Gradient REINFORCE) with fixed advantage normalization
1 parent b41bc9b commit 511a6f8

13 files changed: +223 additions, 0 deletions

.github/workflows/ch11.yml

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
name: ch11
on:
  push:
    paths: ['ch11_policy_gradient/**', '.github/workflows/ch11.yml']
  pull_request:
    paths: ['ch11_policy_gradient/**', '.github/workflows/ch11.yml']
jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix: { python-version: ['3.9','3.10','3.11'] }
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
      - run: |
          python -m pip install -U pip
          pip install -r ch11_policy_gradient/requirements.txt
      - env: { PYTHONPATH: . }
        run: pytest -q ch11_policy_gradient/tests
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
# Chapter 11 — Policy Gradient Fundamentals (REINFORCE)
Quickstart:
```bash
pip install -r ch11_policy_gradient/requirements.txt
pytest -q ch11_policy_gradient/tests
python -m ch11_policy_gradient.scripts.run_bandit_demo
```

ch11_policy_gradient/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from .agents.reinforce import Reinforce, Trajectory
from .policies.softmax import SoftmaxPolicy
from .policies.gaussian import GaussianPolicy1D
from .envs.bandit import TwoArmedBandit
from .utils.returns import returns_to_go, standardize

__all__ = [
    "Reinforce", "Trajectory", "SoftmaxPolicy", "GaussianPolicy1D",
    "TwoArmedBandit", "returns_to_go", "standardize",
]
ch11_policy_gradient/agents/reinforce.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
from dataclasses import dataclass
import numpy as np
from typing import NamedTuple, Callable, Optional
from ..utils.returns import returns_to_go, standardize

class Trajectory(NamedTuple):
    states: list
    actions: list
    rewards: list
    logps: list

@dataclass
class Reinforce:
    gamma: float = 1.0
    alpha: float = 0.05
    normalize_adv: bool = True
    baseline_fn: Optional[Callable[[object], float]] = None
    seed: Optional[int] = None
    def __post_init__(self):
        self.rng = np.random.default_rng(self.seed)
    def run_episode_discrete(self, env, policy, feature_fn: Callable[[object], np.ndarray]):
        s = env.reset(); S, A, R, L = [], [], [], []; done = False
        while not done:
            x = feature_fn(s); a = policy.sample(x)
            logp, _ = policy.logprob_and_grad(x, a)
            ns, r, done, _ = env.step(a)
            S.append(s); A.append(a); R.append(r); L.append(logp); s = ns
        return Trajectory(S, A, R, L)
    def update_discrete(self, traj: Trajectory, policy, feature_fn: Callable[[object], np.ndarray]):
        G = returns_to_go(traj.rewards, self.gamma)
        if self.baseline_fn is not None:
            b = np.array([self.baseline_fn(s) for s in traj.states], dtype=float); adv = G - b
        else:
            adv = G.copy()
        if self.normalize_adv:
            # Only standardize when there is variability; for 1-step episodes std==0 leads to zero updates.
            if len(adv) >= 2 and np.std(adv) > 1e-8:
                adv = standardize(adv)
        total_grad = np.zeros_like(policy.theta)
        for s, a, adv_t in zip(traj.states, traj.actions, adv):
            x = feature_fn(s); _, grad = policy.logprob_and_grad(x, a)
            total_grad += adv_t * grad
        policy.theta += self.alpha * total_grad
        return {"G": G, "adv": adv}
ch11_policy_gradient/envs/bandit.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import numpy as np
from dataclasses import dataclass
from typing import Optional
@dataclass
class TwoArmedBandit:
    q_star: tuple[float, float] = (1.0, 1.5)
    seed: Optional[int] = None
    def __post_init__(self):
        self.rng = np.random.default_rng(self.seed)
    @property
    def nA(self): return 2
    def reset(self):
        return np.array([1.0], dtype=float)  # x(s)=1
    def step(self, a: int):
        assert a in (0, 1)
        r = float(self.rng.normal(self.q_star[a], 1.0))
        return None, r, True, {}
ch11_policy_gradient/examples/bandit_softmax.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import numpy as np
from ..envs.bandit import TwoArmedBandit
from ..policies.softmax import SoftmaxPolicy
from ..agents.reinforce import Reinforce

def run(episodes=200, seed=0):
    env = TwoArmedBandit(q_star=(1.0, 1.5), seed=seed)
    x = np.array([1.0], dtype=float)
    policy = SoftmaxPolicy(nA=2, d=1, seed=seed)
    algo = Reinforce(gamma=1.0, alpha=0.05, normalize_adv=True, baseline_fn=None, seed=seed)
    probs_hist = []
    class EPEnv:
        def reset(self): return x
        def step(self, a):
            _, r, done, _ = env.step(a)
            return None, r, True, {}
    for _ in range(episodes):
        traj = algo.run_episode_discrete(EPEnv(), policy, lambda s: s)
        algo.update_discrete(traj, policy, lambda s: s)
        probs_hist.append(policy.probs(x).copy())
    return np.array(probs_hist), policy.theta

if __name__ == "__main__":
    probs, theta = run()
    print("Final action probabilities:", probs[-1])
ch11_policy_gradient/policies/gaussian.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
import numpy as np
from dataclasses import dataclass
from typing import Optional
@dataclass
class GaussianPolicy1D:
    mu: float = 0.0
    log_sigma: float = 0.0
    seed: Optional[int] = None
    def __post_init__(self):
        self.rng = np.random.default_rng(self.seed)
    @property
    def sigma(self) -> float:
        return float(np.exp(self.log_sigma))
    def sample(self, _x=None) -> float:
        return float(self.rng.normal(self.mu, self.sigma))
    def logprob_and_grad(self, a: float, _x=None):
        sigma2 = self.sigma ** 2
        logp = -0.5 * ((a - self.mu) ** 2 / sigma2 + np.log(2 * np.pi * sigma2))
        dmu = (a - self.mu) / sigma2
        dlogs = ((a - self.mu) ** 2) / sigma2 - 1.0
        return float(logp), np.array([dmu, dlogs], dtype=float)
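GaussianPolicy1D exposes the score with respect to (mu, log_sigma) rather than a theta matrix, so it does not plug directly into Reinforce.update_discrete. A hand-rolled one-step REINFORCE update on a toy continuous bandit might look like the sketch below; the reward r(a) = -(a - 2)^2 is an illustrative choice, not part of the chapter:

```python
from ch11_policy_gradient.policies.gaussian import GaussianPolicy1D

# Toy sketch: score-function (REINFORCE) ascent on E[r] with r(a) = -(a - 2)^2.
# Only the mean is updated here; log_sigma is left fixed to keep the example simple.
policy = GaussianPolicy1D(mu=0.0, log_sigma=0.0, seed=0)
alpha = 0.01
for _ in range(1000):
    a = policy.sample()
    r = -(a - 2.0) ** 2
    _, (dmu, _dlogs) = policy.logprob_and_grad(a)
    policy.mu += alpha * r * dmu  # stochastic gradient ascent via the log-likelihood trick
print("learned mu:", round(policy.mu, 2))  # should drift toward 2, up to sampling noise
```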
ch11_policy_gradient/policies/softmax.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import numpy as np
from dataclasses import dataclass
from typing import Optional
@dataclass
class SoftmaxPolicy:
    nA: int
    d: int
    theta: Optional[np.ndarray] = None
    seed: Optional[int] = None
    def __post_init__(self):
        if self.theta is None:
            self.theta = np.zeros((self.nA, self.d), dtype=float)
        self.rng = np.random.default_rng(self.seed)
    def prefs(self, x: np.ndarray) -> np.ndarray:
        return self.theta @ x
    def probs(self, x: np.ndarray) -> np.ndarray:
        h = self.prefs(x); h -= np.max(h)
        e = np.exp(h); return e / e.sum()
    def sample(self, x: np.ndarray) -> int:
        p = self.probs(x); return int(self.rng.choice(self.nA, p=p))
    def logprob_and_grad(self, x: np.ndarray, a: int):
        p = self.probs(x); logp = float(np.log(p[a] + 1e-12))
        grad = -np.outer(p, x); grad[a, :] += x
        return logp, grad
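A quick way to validate the analytic gradient returned by `logprob_and_grad` is a central finite-difference check over theta. A small sketch of that idea (the setup values are illustrative):

```python
import numpy as np
from ch11_policy_gradient.policies.softmax import SoftmaxPolicy

# Compare the analytic grad of log pi(a|x) with a central finite-difference estimate.
policy = SoftmaxPolicy(nA=2, d=1, seed=0)
x, a, eps = np.array([1.0]), 1, 1e-6
_, grad = policy.logprob_and_grad(x, a)

numeric = np.zeros_like(policy.theta)
for i in range(policy.theta.shape[0]):
    for j in range(policy.theta.shape[1]):
        policy.theta[i, j] += eps
        lp_plus, _ = policy.logprob_and_grad(x, a)
        policy.theta[i, j] -= 2 * eps
        lp_minus, _ = policy.logprob_and_grad(x, a)
        policy.theta[i, j] += eps  # restore the original parameter value
        numeric[i, j] = (lp_plus - lp_minus) / (2 * eps)

print(np.max(np.abs(grad - numeric)))  # should be roughly 1e-6 or smaller
```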
ch11_policy_gradient/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
numpy>=1.21
pytest>=7.0
ch11_policy_gradient/scripts/run_bandit_demo.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
from ch11_policy_gradient.examples.bandit_softmax import run
if __name__ == "__main__":
    probs, theta = run(episodes=300, seed=42)
    print("Last 5 probs:\n", probs[-5:])
    print("Final probs:", probs[-1].tolist())

0 commit comments
