from dataclasses import dataclass
import numpy as np
from typing import NamedTuple, Callable, Optional
from ..utils.returns import returns_to_go, standardize
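
# Note: the two helpers above are defined elsewhere in the repo and are not shown
# in this diff. From how they are used below, the assumed contract is:
#   returns_to_go(rewards, gamma) -> np.ndarray of discounted reward-to-go,
#       G_t = sum_{k >= t} gamma**(k - t) * r_k
#   standardize(x) -> (x - x.mean()) / x.std()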

class Trajectory(NamedTuple):
    """One episode of experience: per-step states, actions, rewards, and action log-probabilities."""
    states: list
    actions: list
    rewards: list
    logps: list
| 11 | + |
| 12 | +@dataclass |
| 13 | +class Reinforce: |
| 14 | + gamma: float = 1.0 |
| 15 | + alpha: float = 0.05 |
| 16 | + normalize_adv: bool = True |
| 17 | + baseline_fn: Optional[Callable[[object], float]] = None |
| 18 | + seed: int | None = None |
| 19 | + def __post_init__(self): |
| 20 | + self.rng = np.random.default_rng(self.seed) |

    def run_episode_discrete(self, env, policy, feature_fn: Callable[[object], np.ndarray]):
        """Roll out one episode with the current policy and record it as a Trajectory."""
        s = env.reset()
        S, A, R, L = [], [], [], []
        done = False
        while not done:
            x = feature_fn(s)
            a = policy.sample(x)
            logp, _ = policy.logprob_and_grad(x, a)
            ns, r, done, _ = env.step(a)
            S.append(s); A.append(a); R.append(r); L.append(logp)
            s = ns
        return Trajectory(S, A, R, L)

    def update_discrete(self, traj: Trajectory, policy, feature_fn: Callable[[object], np.ndarray]):
        """One gradient-ascent step on theta using sum_t adv_t * grad_theta log pi(a_t | s_t)."""
        G = returns_to_go(traj.rewards, self.gamma)
        if self.baseline_fn is not None:
            b = np.array([self.baseline_fn(s) for s in traj.states], dtype=float)
            adv = G - b
        else:
            adv = G.copy()
        if self.normalize_adv:
            # Only standardize when there is variability; for 1-step episodes std == 0 would
            # otherwise turn every advantage (and hence the update) into zero.
            if len(adv) >= 2 and np.std(adv) > 1e-8:
                adv = standardize(adv)
        # Accumulate the policy gradient over the episode, then take one ascent step.
        total_grad = np.zeros_like(policy.theta)
        for s, a, adv_t in zip(traj.states, traj.actions, adv):
            x = feature_fn(s)
            _, grad = policy.logprob_and_grad(x, a)
            total_grad += adv_t * grad
        policy.theta += self.alpha * total_grad
        return {"G": G, "adv": adv}
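
# Usage sketch (not part of this diff): a minimal linear-softmax policy illustrating
# the interface Reinforce relies on -- a `theta` array, `sample(x)`, and
# `logprob_and_grad(x, a)` returning (log pi(a|x), d log pi / d theta with the same
# shape as theta) -- together with an old-Gym-style env whose step() returns a
# 4-tuple. Class and parameter names here are illustrative assumptions.
class LinearSoftmaxPolicy:
    def __init__(self, n_features: int, n_actions: int, seed: Optional[int] = None):
        self.theta = np.zeros((n_actions, n_features))
        self.rng = np.random.default_rng(seed)

    def _probs(self, x: np.ndarray) -> np.ndarray:
        logits = self.theta @ x
        logits -= logits.max()          # shift logits for numerical stability
        p = np.exp(logits)
        return p / p.sum()

    def sample(self, x: np.ndarray) -> int:
        return int(self.rng.choice(len(self.theta), p=self._probs(x)))

    def logprob_and_grad(self, x: np.ndarray, a: int):
        p = self._probs(x)
        grad = -np.outer(p, x)          # d log pi(a|x) / d theta_b = (1[b == a] - p_b) * x
        grad[a] += x
        return float(np.log(p[a])), grad

# agent = Reinforce(gamma=0.99, alpha=0.01, seed=0)
# policy = LinearSoftmaxPolicy(n_features=4, n_actions=2, seed=0)
# featurize = lambda s: np.asarray(s, dtype=float)
# for _ in range(500):
#     traj = agent.run_episode_discrete(env, policy, feature_fn=featurize)
#     agent.update_discrete(traj, policy, feature_fn=featurize)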