from __future__ import annotations

import math
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm

# Seed both RNGs up front so every sampling/training cell below is repeatable.
torch.manual_seed(0)
np.random.seed(0)
# CPU device handle (NOTE(review): not referenced by any code visible in this file).
DEVICE = torch.device("cpu")
# Tensors created from Python floats default to single precision.
torch.set_default_dtype(torch.float32)
print("torch:", torch.__version__)

torch: 2.11.0

def plot_samples(samples, ax=None, *, title=None, color="C0", s=6, alpha=0.4,
                 xlim=(-5, 5), ylim=(-5, 5), c=None, cmap=None, label=None):
    """Draw a cloud of 2D points as a scatter plot and return the axes.

    ``samples`` is a tensor whose first two columns are used as x and y.
    When ``c`` (per-point values) is given, points are colored through
    ``cmap``; otherwise every point gets the single ``color``. A new 4x4
    figure is created when no ``ax`` is supplied.
    """
    if ax is None:
        ax = plt.subplots(figsize=(4, 4))[1]

    xy = samples.detach().cpu().numpy()
    common = dict(s=s, alpha=alpha, label=label)
    if c is None:
        ax.scatter(xy[:, 0], xy[:, 1], color=color, **common)
    else:
        ax.scatter(xy[:, 0], xy[:, 1], c=c, cmap=cmap, **common)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_aspect("equal")
    if title:
        ax.set_title(title)
    return ax


def plot_density(density_fn, ax=None, *, title=None, n_grid=80,
                 xlim=(-5, 5), ylim=(-5, 5), cmap="viridis"):
    """Render `density_fn` as a heatmap over a regular 2D grid.

    `density_fn` is called once with an [n_grid * n_grid, 2] tensor of grid
    points and must return one scalar per point. Returns the axes drawn on;
    a new 4x4 figure is created when `ax` is None.
    """
    if ax is None:
        ax = plt.subplots(figsize=(4, 4))[1]

    xs = torch.linspace(xlim[0], xlim[1], n_grid)
    ys = torch.linspace(ylim[0], ylim[1], n_grid)
    grid_x, grid_y = torch.meshgrid(xs, ys, indexing="ij")
    grid_pts = torch.stack([grid_x.flatten(), grid_y.flatten()], dim=-1)

    values = density_fn(grid_pts).detach().cpu()
    image = values.reshape(n_grid, n_grid).numpy()
    # The grid is indexed [x, y]; imshow wants rows = y, hence the transpose.
    ax.imshow(image.T, origin="lower", extent=(*xlim, *ylim), cmap=cmap)
    ax.set_aspect("equal")
    if title:
        ax.set_title(title)
    return ax


def kde_density(grid_pts, samples, h=0.15):
    """Evaluate an unnormalized Gaussian KDE of `samples` at `grid_pts`.

    For each grid point, averages exp(-||g - s||^2 / (2 h^2)) over all
    samples. No normalizing constant is applied; callers re-normalize.
    Builds the full [n_grid_pts, n_samples, dim] difference tensor.
    """
    diff = grid_pts.unsqueeze(1) - samples.unsqueeze(0)
    sq_dist = diff.pow(2).sum(dim=-1)
    return torch.exp(-0.5 * sq_dist / h**2).mean(dim=-1)

def sample_ring(n: int, modes: int = 8, radius: float = 4.0, sigma: float = 0.15,
                return_labels: bool = False):
    """Draw n points from a ring of isotropic Gaussians (the target distribution).

    Each point picks one of ``modes`` centers, equally spaced on a circle of
    the given ``radius``, uniformly at random and adds N(0, sigma^2 I) noise.

    Returns a [n, 2] tensor, or (samples, mode_idx) if ``return_labels=True``.
    """
    labels = torch.randint(0, modes, (n,))
    theta = (2 * math.pi / modes) * labels
    mode_centers = radius * torch.stack([torch.cos(theta), torch.sin(theta)], dim=-1)
    noise = torch.randn(n, 2)
    points = mode_centers + sigma * noise
    return (points, labels) if return_labels else points


def sample_source(n: int) -> torch.Tensor:
    """Return an [n, 2] tensor of draws from the source, a standard 2D Gaussian."""
    shape = (n, 2)
    return torch.randn(shape)


# Visualize source and target side by side so we know what we're transporting.
# We color the target by mode index so the 8 clusters are visually distinct.
# The ring samples are drawn first, then the source samples (inside the first
# plot_samples call), so the RNG is consumed in that order.
ring_samples, ring_labels = sample_ring(3000, return_labels=True)
fig, axes = plt.subplots(1, 2, figsize=(9, 4.5))
plot_samples(sample_source(3000), axes[0],
             title="source: standard Gaussian N(0, I)", color="0.25")
plot_samples(ring_samples, axes[1],
             title="target: 8-mode Gaussian ring",
             c=ring_labels.numpy(), cmap="tab10")
# Same limits as plot_samples' defaults, applied explicitly to both panels.
for ax in axes:
    ax.set_xlim(-5, 5); ax.set_ylim(-5, 5)
plt.tight_layout(); plt.show()

class VelocityNet(nn.Module):
    """Time-conditioned MLP v_theta(x, t) used as the 2D flow-matching velocity.

    Inputs:
        x: [B, 2] spatial location.
        t: [B] time in [0, 1].
    Output:
        [B, 2] predicted velocity.

    The network is three hidden Linear+SiLU layers of width ``hidden`` followed
    by a linear read-out, fed with x concatenated to a sinusoidal embedding of t.
    """

    def __init__(self, hidden: int = 128, n_freqs: int = 8):
        super().__init__()
        # Non-trainable frequencies 2^k * pi, k = 0..n_freqs-1, for the time embedding.
        octaves = torch.arange(n_freqs).float()
        self.register_buffer("freqs", 2 ** octaves * math.pi)

        # Input = 2 coordinates + sin and cos per frequency.
        widths = [2 + 2 * n_freqs, hidden, hidden, hidden]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.SiLU())
        layers.append(nn.Linear(hidden, 2))
        self.net = nn.Sequential(*layers)

    def time_emb(self, t: torch.Tensor) -> torch.Tensor:
        """Map t of shape [B] to [B, 2*n_freqs]: all sines first, then all cosines."""
        phase = t.unsqueeze(1) * self.freqs.unsqueeze(0)    # [B, n_freqs]
        return torch.cat([phase.sin(), phase.cos()], dim=-1)

    def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        features = torch.cat([x, self.time_emb(t)], dim=-1)
        return self.net(features)


def num_params(m: nn.Module) -> int:
    """Count the scalar entries across all parameters of `m`."""
    total = 0
    for param in m.parameters():
        total += param.numel()
    return total


# Throwaway instance with default sizes, built only to report the parameter count.
# Constructing it draws from the torch RNG for weight initialization.
_demo_net = VelocityNet()
print(f"VelocityNet has {num_params(_demo_net):,} parameters.")

VelocityNet has 35,714 parameters.

def pretrain_flow_matching(
    model: VelocityNet,
    *,
    steps: int = 5000,
    batch_size: int = 256,
    lr: float = 1e-3,
) -> list[float]:
    """Fit `model` to the ring target with the flow-matching regression loss.

    Each step draws a data batch x0 from `sample_ring`, Gaussian noise eps and
    uniform times t, forms x_t = (1 - t) x0 + t eps, and regresses the model's
    output at (x_t, t) onto eps - x0 with a mean-squared error. Optimized with
    Adam in place.

    Returns the per-step loss values.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    history: list[float] = []
    progress = tqdm(range(steps), desc="pretrain", mininterval=2.0)
    for it in progress:
        data = sample_ring(batch_size)
        noise = torch.randn_like(data)
        t = torch.rand(batch_size)
        w = t[:, None]
        x_t = (1 - w) * data + w * noise
        velocity = noise - data
        residual = model(x_t, t) - velocity
        loss = (residual ** 2).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        history.append(loss_value)
        if it % 500 == 0:
            progress.set_postfix(loss=f"{loss_value:.3f}")
    return history

# Train the reference velocity field on the ring, then switch it to eval mode.
v_ref = VelocityNet()
pretrain_losses = pretrain_flow_matching(v_ref, steps=5000)
v_ref.eval()
for p in v_ref.parameters():
    p.requires_grad_(False)  # freeze it -- this is our reference

pretrain: 100%|██████████| 5000/5000 [00:04<00:00, 1180.43it/s, loss=3.420]

@torch.no_grad()
def euler_sample(model: VelocityNet, n: int, n_steps: int = 50) -> torch.Tensor:
    """Draw n samples by Euler-integrating the velocity field from t=1 down to t=0.

    Initial points come from `sample_source(n)`; each of the `n_steps` equal
    steps evaluates the model at the current time and moves against the
    predicted velocity. Runs without gradient tracking.
    """
    x = sample_source(n)
    dt = 1.0 / n_steps
    for k in range(n_steps):
        t_now = torch.full((n,), 1.0 - k * dt)
        velocity = model(x, t_now)
        x = x - velocity * dt
    return x


# Visualize what the pretrained model has learned: target distribution, and source -> learned target.
# ref_samples is reused by later cells as the pretrained baseline.
ref_samples = euler_sample(v_ref, 3000)
# Fresh source draws for display only; euler_sample drew its own starting points
# internally, so these are not the points that were transported into ref_samples.
src_samples = sample_source(3000)
gt_samples, gt_labels = sample_ring(3000, return_labels=True)

fig, axes = plt.subplots(1, 2, figsize=(10, 4.5))
plot_samples(gt_samples, axes[0], title="p_data (ground truth)",
             c=gt_labels.numpy(), cmap="tab10")
# Source and generated samples are overlaid on the same (right) panel.
plot_samples(src_samples, axes[1], color="navy", label="source (t=1)")
plot_samples(ref_samples, axes[1], color="teal", label="target (t=0)",
             title="source N(0, I)  ->  target")
axes[1].legend(loc="upper right", fontsize=8, frameon=False, markerscale=2)
plt.tight_layout(); plt.show()

def reward(x: torch.Tensor, center=(4.0, 0.0), scale: float = 2.5) -> torch.Tensor:
    """Gaussian-bump reward exp(-||x - center||^2 / (2 scale^2)) over the last axis.

    Values lie in (0, 1] and equal 1 exactly at `center`. The center is
    materialized with the dtype and device of `x`.
    """
    center_t = torch.as_tensor(center, dtype=x.dtype, device=x.device)
    sq_dist = (x - center_t).pow(2).sum(dim=-1)
    return torch.exp(-0.5 * sq_dist / scale**2)


BETA = 5.0  # reward scaling we will use throughout (a single dial)

# A big sample bag from the reference model to build a smooth KDE of p_ref.
# The density helpers below read this module-level tensor directly.
ref_big = euler_sample(v_ref, 5000)


def p_ref_density(grid_pts):
    """KDE of the reference sample bag `ref_big` at `grid_pts`, scaled to sum to 1."""
    weights = kde_density(grid_pts, ref_big, h=0.15)
    total = weights.sum() + 1e-12  # epsilon guards an all-zero KDE
    return weights / total


def p_target_density(grid_pts, beta=BETA):
    """Reward-tilted reference density p_ref * exp(beta * r) on `grid_pts`, scaled to sum to 1.

    p_ref is the KDE of the module-level sample bag `ref_big`; r is `reward`.
    """
    base = kde_density(grid_pts, ref_big, h=0.15)
    tilt = torch.exp(beta * reward(grid_pts))
    unnorm = base * tilt
    return unnorm / (unnorm.sum() + 1e-12)


# Three panels: reference density, the reward landscape, and the tilted target.
fig, axes = plt.subplots(1, 3, figsize=(14, 4.5))
plot_density(p_ref_density, axes[0], title="p_ref (pretrained)")
plot_density(reward,        axes[1], title=f"reward r(x)  (we use beta={BETA})")
plot_density(p_target_density, axes[2],
             title=fr"p_target $\propto$ p_ref $\cdot$ exp({BETA} r)")
plt.tight_layout(); plt.show()

@dataclass
class RAMConfig:
    """Hyper-parameters consumed by `train_ram`.

    Attributes:
        outer_steps: number of optimizer steps.
        group_size: endpoints sampled per step (G).
        k_targets: (t, eps) draws reused per endpoint (K).
        sample_steps: Euler steps used when sampling endpoints.
        lr: Adam learning rate.
        beta: reward scale (the single dial).
        use_advantage: subtract the group-mean reward as a baseline.
        scale_advantage: additionally divide by the group std
            (only consulted when `use_advantage` is set).
        log_every: progress-bar refresh period, in steps.
    """

    outer_steps: int = 600
    group_size: int = 32
    k_targets: int = 4
    sample_steps: int = 25
    lr: float = 1e-4
    beta: float = 5.0
    use_advantage: bool = False
    scale_advantage: bool = False
    log_every: int = 50


def train_ram(
    model: VelocityNet,
    model_ref: VelocityNet,
    reward_fn,
    cfg: RAMConfig,
) -> dict:
    """Train `model` with RAM. `model_ref` is the frozen pretrained reference.

    Per outer step: sample `cfg.group_size` endpoints from the current model,
    score them with `reward_fn`, turn the scores into a per-endpoint signal
    (raw `beta * r`, or a group-centered / std-scaled advantage), reuse each
    endpoint for `cfg.k_targets` random (t, eps) pairs, and regress the model's
    velocity onto a stop-gradient target built from the reference velocity and
    the signal-weighted residual. `model` is updated in place with Adam.

    Args:
        model: velocity network being trained.
        model_ref: reference velocity network; only evaluated, never updated.
        reward_fn: callable mapping a [G, 2] tensor of endpoints to [G] rewards.
        cfg: hyper-parameters, see `RAMConfig`.

    Returns:
        A dict with keys ``"mean_reward"`` and ``"loss"``, each a list holding
        one float per outer step.
    """
    opt = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    mean_reward_hist, loss_hist = [], []
    pbar = tqdm(range(cfg.outer_steps), desc=f"RAM (beta={cfg.beta})", mininterval=2.0)
    last_step = cfg.outer_steps - 1
    for step in pbar:

        # 1) Sample G endpoints on-policy (no gradient flows through sampling).
        with torch.no_grad():
            x0 = euler_sample(model, cfg.group_size, n_steps=cfg.sample_steps)

        # 2) Score them with the reward.
        raw_reward = reward_fn(x0)                  # [G]; the actual r(x_0)
        mean_reward_hist.append(raw_reward.mean().item())

        # 3) Build the "signal" that enters the loss.
        if cfg.use_advantage:
            advantage = raw_reward - raw_reward.mean()
            if cfg.scale_advantage:
                advantage = advantage / (raw_reward.std(correction=0) + 1e-4)
            signal = cfg.beta * advantage
        else:
            signal = cfg.beta * raw_reward          # the raw-reward loss

        # 4) Reuse each endpoint for K (t, eps) draws.
        x0_rep     = x0.repeat_interleave(cfg.k_targets, dim=0)        # [G*K, 2]
        signal_rep = signal.repeat_interleave(cfg.k_targets, dim=0)    # [G*K]
        B = x0_rep.shape[0]
        eps = torch.randn_like(x0_rep)
        t   = torch.rand(B)
        xt  = (1 - t[:, None]) * x0_rep + t[:, None] * eps

        # 5) One forward pass through the trainable model. Its detached value
        #    doubles as the stop-gradient term in the target, which avoids a
        #    second, identical forward pass.
        v_pred = model(xt, t)

        # 6) Build the RAM target (stop-gradient on everything inside).
        with torch.no_grad():
            v_ref_xt = model_ref(xt, t)
            v_theta_sg = v_pred.detach()            # current model, no grad
            pretrain_target = eps - x0_rep
            target = v_ref_xt + signal_rep[:, None] * (pretrain_target - v_theta_sg)

        # 7) MSE between the model's prediction and the target.
        loss = ((v_pred - target) ** 2).mean()
        opt.zero_grad(); loss.backward(); opt.step()
        loss_hist.append(loss.item())

        # Also refresh on the final step so the finished bar reports end-of-run
        # metrics instead of whatever the last periodic refresh captured.
        if step % cfg.log_every == 0 or step == last_step:
            pbar.set_postfix(loss=f"{loss.item():.3f}",
                             mean_r=f"{mean_reward_hist[-1]:.3f}")
    return {"mean_reward": mean_reward_hist, "loss": loss_hist}

# Initialize the model as a copy of the pretrained reference.
# A new VelocityNet is built and the reference weights are loaded into it,
# rather than aliasing v_ref, whose parameters were frozen above.
v_theta = VelocityNet()
v_theta.load_state_dict(v_ref.state_dict())

# Main RAM run with the raw beta * r signal (advantage options left at defaults).
history = train_ram(v_theta, v_ref, reward, RAMConfig(
    outer_steps=600,
    group_size=32,
    k_targets=4,
    beta=BETA,
    lr=1e-4,
))

RAM (beta=5.0): 100%|██████████| 600/600 [00:01<00:00, 437.55it/s, loss=17.484, mean_r=0.689]

# Training curves of the main run: per-step group mean reward (left) and loss (right).
fig, axes = plt.subplots(1, 2, figsize=(11, 4))
axes[0].plot(history["mean_reward"])
axes[0].set_xlabel("outer step"); axes[0].set_ylabel("mean reward over group")
axes[0].set_title("RAM training reward")

axes[1].plot(history["loss"])
axes[1].set_xlabel("outer step"); axes[1].set_ylabel("RAM loss")
axes[1].set_title("RAM loss (note: grows with reward; not a failure mode)")
# Log scale on the loss axis.
axes[1].set_yscale("log")
plt.tight_layout(); plt.show()

# Compare mean reward of 3000 samples from the RAM-trained model against the
# pretrained samples (ref_samples) drawn earlier.
v_theta.eval()
ram_samples = euler_sample(v_theta, 3000)
ram_reward = reward(ram_samples).mean().item()
ref_reward = reward(ref_samples).mean().item()
# The trailing "(x...)" is the ratio RAM-trained / pretrained.
print(f"mean reward — pretrained: {ref_reward:.3f}  |  RAM-trained: {ram_reward:.3f}  "
      f"(x{ram_reward/ref_reward:.1f})")


def p_ram_density(grid_pts):
    """KDE of the RAM-trained sample bag `ram_samples` at `grid_pts`, scaled to sum to 1."""
    weights = kde_density(grid_pts, ram_samples, h=0.15)
    total = weights.sum() + 1e-12  # epsilon guards an all-zero KDE
    return weights / total


# Four panels, left to right: reference density, reward, analytic tilted
# target, and the KDE of the RAM-trained samples.
fig, axes = plt.subplots(1, 4, figsize=(18, 4.5))
plot_density(p_ref_density,    axes[0], title="p_ref (pretrained)")
plot_density(reward,           axes[1], title="reward r(x)")
plot_density(p_target_density, axes[2], title=fr"analytic $p_{{\rm target}} \propto p_{{\rm ref}} \cdot e^{{{BETA} r}}$")
plot_density(p_ram_density,    axes[3], title=f"RAM-trained  (mean r = {ram_reward:.2f})")
plt.tight_layout(); plt.show()

mean reward — pretrained: 0.244  |  RAM-trained: 0.673  (x2.8)

# Sanity check: with beta=0 the signal term vanishes, so the RAM target reduces
# to the reference velocity and a model initialized at the reference has
# nothing to learn.
v_theta_zero = VelocityNet()
v_theta_zero.load_state_dict(v_ref.state_dict())
hist_zero = train_ram(v_theta_zero, v_ref, reward, RAMConfig(
    outer_steps=200, beta=0.0, lr=1e-4, log_every=200,
))
zero_samples = euler_sample(v_theta_zero, 3000)

print(f"max loss observed (should be ~0):     {max(hist_zero['loss']):.2e}")
# The two means come from independent sample bags, so they need not match exactly.
print(f"mean reward of beta=0 RAM samples:    {reward(zero_samples).mean():.4f}")
print(f"mean reward of pretrained samples:    {reward(ref_samples).mean():.4f}")

fig, axes = plt.subplots(1, 2, figsize=(9, 4))
plot_samples(ref_samples,  axes[0], title="pretrained",     color="teal")
plot_samples(zero_samples, axes[1], title="beta=0 RAM (should match)", color="C2")
plt.tight_layout(); plt.show()

RAM (beta=0.0): 100%|██████████| 200/200 [00:00<00:00, 435.23it/s, loss=0.000, mean_r=0.242]

max loss observed (should be ~0):     0.00e+00
mean reward of beta=0 RAM samples:    0.2420
mean reward of pretrained samples:    0.2436

# Sweep the reward scale. Every run is re-seeded identically and starts from
# the reference weights, so beta is the only thing that differs between runs.
sweep_results = []
for beta in [2.0, 5.0, 8.0]:
    torch.manual_seed(7)
    v_sw = VelocityNet()
    v_sw.load_state_dict(v_ref.state_dict())
    h = train_ram(v_sw, v_ref, reward, RAMConfig(
        outer_steps=400, beta=beta, lr=1e-4, log_every=400,
    ))
    samp = euler_sample(v_sw, 3000)
    # Store (beta, samples, mean reward of those samples).
    sweep_results.append((beta, samp, reward(samp).mean().item()))

fig, axes = plt.subplots(2, 3, figsize=(13, 8))
for col, (beta, samp, mean_r) in enumerate(sweep_results):
    # Top row: analytic target.
    # The default argument binds this iteration's beta into the closure.
    def make_target(b=beta):
        def f(grid):
            p = kde_density(grid, ref_big, h=0.15) * torch.exp(b * reward(grid))
            return p / (p.sum() + 1e-12)
        return f
    plot_density(make_target(beta), axes[0, col],
                 title=fr"analytic $p_{{\rm target}}$, $\beta = {beta}$")
    # Bottom row: KDE density of the RAM-trained samples (same form as top
    # row, so the tilting is directly readable from peak sharpness).
    def make_ram_density(s=samp):
        def f(grid):
            p = kde_density(grid, s, h=0.15)
            return p / (p.sum() + 1e-12)
        return f
    plot_density(make_ram_density(samp), axes[1, col],
                 title=fr"RAM-trained, $\beta = {beta}$, mean $r$ = {mean_r:.2f}")
plt.tight_layout(); plt.show()

RAM (beta=2.0): 100%|██████████| 400/400 [00:00<00:00, 439.91it/s, loss=2.047, mean_r=0.288]

RAM (beta=5.0): 100%|██████████| 400/400 [00:00<00:00, 436.08it/s, loss=12.791, mean_r=0.288]

RAM (beta=8.0): 100%|██████████| 400/400 [00:00<00:00, 438.79it/s, loss=32.745, mean_r=0.288]

# Vary K, the number of (t, eps) draws reused per endpoint, with the same seed
# and starting weights for every run. Only the mean-reward curve is kept.
k_results = {}
for K in [1, 4, 16]:
    torch.manual_seed(11)
    v_k = VelocityNet()
    v_k.load_state_dict(v_ref.state_dict())
    h = train_ram(v_k, v_ref, reward, RAMConfig(
        outer_steps=300, group_size=32, k_targets=K, beta=BETA, lr=1e-4,
        log_every=300,
    ))
    k_results[K] = h["mean_reward"]

plt.figure(figsize=(8, 4))
for K, mr in k_results.items():
    # Smooth a little for legibility.
    # 20-step moving average; mode="valid" drops the first 19 steps from the curve.
    mr_smooth = np.convolve(mr, np.ones(20)/20, mode="valid")
    plt.plot(mr_smooth, label=f"K = {K}")
plt.xlabel("outer step (each step = one ODE rollout)")
plt.ylabel("mean reward over group (smoothed)")
plt.title("Targets per endpoint K: more reuse = more signal per sampling cost")
plt.legend()
plt.tight_layout(); plt.show()

RAM (beta=5.0): 100%|██████████| 300/300 [00:00<00:00, 487.26it/s, loss=11.599, mean_r=0.227]

RAM (beta=5.0): 100%|██████████| 300/300 [00:00<00:00, 435.67it/s, loss=28.244, mean_r=0.227]

RAM (beta=5.0): 100%|██████████| 300/300 [00:01<00:00, 257.27it/s, loss=15.991, mean_r=0.227]

# Compare the three reward signals train_ram supports.
# Each entry is (legend label, use_advantage, scale_advantage).
adv_variants = [
    ("raw $\\beta r$",            False, False),
    ("group-mean centered",       True,  False),
    ("group-mean + std-scaled",   True,  True),
]
adv_results = {}
for tag, use_adv, scale_adv in adv_variants:
    # Same seed and starting weights for every variant.
    torch.manual_seed(21)
    v_a = VelocityNet()
    v_a.load_state_dict(v_ref.state_dict())
    h = train_ram(v_a, v_ref, reward, RAMConfig(
        outer_steps=400, group_size=32, k_targets=4,
        beta=BETA, lr=1e-4, log_every=400,
        use_advantage=use_adv, scale_advantage=scale_adv,
    ))
    adv_results[tag] = h["mean_reward"]

plt.figure(figsize=(8, 4))
for tag, mr in adv_results.items():
    # 20-step moving average, as in the K sweep.
    mr_smooth = np.convolve(mr, np.ones(20)/20, mode="valid")
    plt.plot(mr_smooth, label=tag)
plt.xlabel("outer step")
plt.ylabel("mean reward over group (smoothed)")
plt.title(f"Reward signals at $\\beta = {BETA}$")
plt.legend()
plt.tight_layout(); plt.show()

RAM (beta=5.0): 100%|██████████| 400/400 [00:00<00:00, 441.91it/s, loss=8.657, mean_r=0.206]

RAM (beta=5.0): 100%|██████████| 400/400 [00:00<00:00, 441.90it/s, loss=6.677, mean_r=0.206]

RAM (beta=5.0): 100%|██████████| 400/400 [00:00<00:00, 436.31it/s, loss=64.702, mean_r=0.206]

# Same setup as §6 (single reward at (4, 0), beta=5, G=32, K=4) but with
# std-scaled advantage and pushed to 1000 outer steps. Snapshot the
# 3000-sample reward at intermediate steps.
# NOTE: this is a hand-rolled copy of the train_ram step rather than a call to
# it. Two details differ from train_ram: endpoints use euler_sample's default
# 50 Euler steps (RAMConfig.sample_steps defaults to 25), and the std is
# stabilized with clamp_min(1e-3) instead of `+ 1e-4`.
torch.manual_seed(3)
v_long = VelocityNet()
v_long.load_state_dict(v_ref.state_dict())

opt_long = torch.optim.Adam(v_long.parameters(), lr=1e-4)
mean_r_hist = []
snap_steps = [100, 200, 300, 400, 500, 700, 900, 1000]
snap_rewards = []
# Steps are counted from 1 so the snapshot steps line up with "steps completed".
for step in range(1, 1001):
    with torch.no_grad():
        x0 = euler_sample(v_long, 32)
    r = reward(x0)
    adv = (r - r.mean()) / r.std(correction=0).clamp_min(1e-3)  # the §7.4 variant
    # Reuse each endpoint for 4 (t, eps) draws.
    x0_rep = x0.repeat_interleave(4, 0)
    eps = torch.randn_like(x0_rep)
    t = torch.rand(x0_rep.shape[0])
    xt = (1 - t[:, None]) * x0_rep + t[:, None] * eps
    with torch.no_grad():
        v_ref_xt = v_ref(xt, t)
        v_sg = v_long(xt, t)
        pretrain_target = eps - x0_rep
    sig = (BETA * adv).repeat_interleave(4, 0)
    # Every input here was produced without gradient tracking, so the target
    # carries no graph even though it is assembled outside the no_grad block.
    target = v_ref_xt + sig[:, None] * (pretrain_target - v_sg)
    loss = ((v_long(xt, t) - target) ** 2).mean()
    opt_long.zero_grad(); loss.backward(); opt_long.step()
    mean_r_hist.append(r.mean().item())
    if step in snap_steps:
        # Snapshot evaluation draws 3000 fresh samples, which advances the RNG
        # used by the following training steps.
        with torch.no_grad():
            snap_rewards.append((step, reward(euler_sample(v_long, 3000)).mean().item()))

print(f"{'step':>5} | reward on 3000 ODE samples")
for s, r in snap_rewards:
    print(f"{s:5d} | {r:.3f}")

 step | reward on 3000 ODE samples
  100 | 0.404
  200 | 0.591
  300 | 0.712
  400 | 0.677
  500 | 0.666
  700 | 0.469
  900 | 0.026
 1000 | 0.006

# Plot the long run: raw per-batch reward, its rolling mean, and the
# 3000-sample snapshot rewards.
mr = np.asarray(mean_r_hist)
W = 30  # rolling-mean window, in steps
mr_smooth = np.convolve(mr, np.ones(W) / W, mode="valid")

fig, ax = plt.subplots(figsize=(8.5, 4.2))
ax.plot(mr, color="C0", alpha=0.18, lw=0.7, label="per-batch reward")
# mode="valid" yields len(mr) - W + 1 points; plot each at the index of the
# last step in its window.
ax.plot(np.arange(W - 1, len(mr)), mr_smooth,
        color="C0", lw=2.2, label=f"rolling mean ({W})")
xs, ys = zip(*snap_rewards)
ax.plot(xs, ys, "o", color="C3", markersize=7,
        label="reward on 3000 ODE samples")
# Shaded step ranges are hard-coded annotations, not computed from the data.
ax.axvspan(250, 500, color="green",  alpha=0.10, label="safe plateau")
ax.axvspan(700, 1000, color="red",   alpha=0.10, label="collapsing")
ax.set_xlabel("outer step")
ax.set_ylabel("mean reward")
ax.set_title(r"RAM with std-scaled advantage, 1000 steps: climb $\to$ plateau $\to$ collapse")
ax.legend(loc="lower left", fontsize=9)
ax.set_ylim(-0.05, 1.0)
plt.tight_layout(); plt.show()

def min_norm_simplex(grads, n_iters=20):
    """Find simplex weights alpha minimizing ||sum_j alpha_j g_j||^2 by Frank-Wolfe.

    `grads` is a list of J equally-shaped 1-D tensors. Starting from uniform
    weights, each iteration moves toward the simplex vertex with the smallest
    gradient component, using the exact line-search step clipped to [0, 1].
    Returns a [J] tensor of weights.
    """
    stacked = torch.stack(grads)             # [J, P]
    gram = stacked @ stacked.T               # [J, J] pairwise inner products
    k = stacked.shape[0]
    like = dict(device=stacked.device, dtype=stacked.dtype)
    weights = torch.full((k,), 1.0 / k, **like)
    for _ in range(n_iters):
        best = int((gram @ weights).argmin())
        vertex = torch.zeros(k, **like)
        vertex[best] = 1.0
        direction = vertex - weights
        gram_dir = gram @ direction
        numer = -(weights @ gram_dir)
        # Clamp keeps the division finite when weights already sit on the vertex.
        denom = (direction @ gram_dir).clamp(min=1e-12)
        step = float((numer / denom).clamp(0.0, 1.0))
        weights = weights + step * direction
    return weights


def flat_grad(loss, params, retain_graph):
    """Differentiate `loss` w.r.t. `params` and return the gradients as one 1-D tensor.

    Gradients are concatenated in the order of `params`. `retain_graph` is
    forwarded to `torch.autograd.grad`.
    """
    pieces = torch.autograd.grad(loss, params, retain_graph=retain_graph)
    return torch.cat([piece.reshape(-1) for piece in pieces])


@dataclass
class MGDAConfig:
    """Hyper-parameters consumed by `train_mgda_ram`.

    Attributes:
        outer_steps: number of optimizer steps.
        group_size: endpoints sampled per step.
        k_targets: (t, eps) draws reused per endpoint.
        sample_steps: Euler steps used when sampling endpoints.
        lr: Adam learning rate.
        beta: reward scale.
        alpha_ema: EMA smoothing factor on the simplex weights.
        use_advantage: if False, signal = beta * r (raw, like §6).
        scale_advantage: if True (with use_advantage), std-scale the advantage.
        log_every: progress-bar refresh period, in steps.
    """

    outer_steps: int = 500
    group_size: int = 32
    k_targets: int = 4
    sample_steps: int = 25
    lr: float = 1e-4
    beta: float = 5.0
    alpha_ema: float = 0.9
    use_advantage: bool = False
    scale_advantage: bool = False
    log_every: int = 50


def train_mgda_ram(model, model_ref, reward_fns, cfg: MGDAConfig) -> dict:
    """Multi-reward RAM via MGDA. `reward_fns` is a list of J reward callables.

    Per outer step: sample one shared group of endpoints from `model`, score it
    under every reward, build one RAM loss per reward on a shared (t, eps)
    batch, and take the J per-reward gradients. The gradients are normalized to
    unit length, combined with EMA-smoothed min-norm simplex weights, rescaled
    by the mean original gradient norm, and applied with Adam. `model` is
    updated in place; `model_ref` is only evaluated.

    Args:
        model: velocity network being trained.
        model_ref: reference velocity network.
        reward_fns: non-empty list of callables mapping [G, 2] endpoints to [G] rewards.
        cfg: hyper-parameters, see `MGDAConfig`.

    Returns:
        A dict with ``"per_reward"`` (J lists of per-step group mean rewards)
        and ``"alpha"`` (per-step list of the J smoothed simplex weights).

    Raises:
        ValueError: if `reward_fns` is empty.
    """
    J = len(reward_fns)
    if J == 0:
        raise ValueError("reward_fns must contain at least one reward callable")
    params = list(model.parameters())
    opt = torch.optim.Adam(params, lr=cfg.lr)
    per_reward_hist = [[] for _ in range(J)]
    alpha_hist = []
    alpha_ema = torch.full((J,), 1.0 / J)
    pbar = tqdm(range(cfg.outer_steps), desc="MGDA-RAM", mininterval=2.0)
    last_step = cfg.outer_steps - 1
    for step in pbar:

        # Shared rollout: one set of G endpoints, scored under each reward.
        with torch.no_grad():
            x0 = euler_sample(model, cfg.group_size, n_steps=cfg.sample_steps)
        per_reward = [r(x0) for r in reward_fns]
        for j, r_vals in enumerate(per_reward):
            per_reward_hist[j].append(r_vals.mean().item())

        # Shared (t, eps) batch and one forward pass through v_theta.
        x0_rep = x0.repeat_interleave(cfg.k_targets, dim=0)
        eps = torch.randn_like(x0_rep)
        t = torch.rand(x0_rep.shape[0])
        xt = (1 - t[:, None]) * x0_rep + t[:, None] * eps
        v_pred = model(xt, t)            # one forward, graph reused by J backwards
        with torch.no_grad():
            v_ref_xt = model_ref(xt, t)
            # Stop-gradient copy of the same prediction; avoids a second,
            # identical forward pass through the model.
            v_theta_sg = v_pred.detach()
            pretrain_target = eps - x0_rep

        # J per-reward losses → J per-reward gradients.
        grads = []
        for j, r_vals in enumerate(per_reward):
            if cfg.use_advantage:
                adv = r_vals - r_vals.mean()
                if cfg.scale_advantage:
                    adv = adv / r_vals.std(correction=0).clamp_min(1e-3)
                sig = (cfg.beta * adv).repeat_interleave(cfg.k_targets, dim=0)
            else:
                sig = (cfg.beta * r_vals).repeat_interleave(cfg.k_targets, dim=0)
            target_j = v_ref_xt + sig[:, None] * (pretrain_target - v_theta_sg)
            loss_j = ((v_pred - target_j) ** 2).mean()
            # Keep the graph alive for all but the last backward pass.
            grads.append(flat_grad(loss_j, params, retain_graph=(j < J - 1)))

        # MARBLE recipe: normalize each gradient to unit norm before solving for
        # alpha (so alpha reflects directional conflict, not magnitude disparities),
        # then rescale the combined direction by the mean original norm to keep
        # Adam's natural step size unchanged.
        norms = torch.stack([g.norm() for g in grads]).clamp_min(1e-8)
        grads_unit = [g / n for g, n in zip(grads, norms)]

        # MGDA on the simplex of unit-norm gradients, then EMA smooth.
        alpha_inst = min_norm_simplex(grads_unit).detach()
        alpha_ema = cfg.alpha_ema * alpha_ema + (1 - cfg.alpha_ema) * alpha_inst
        alpha_hist.append(alpha_ema.tolist())
        combined_unit = (alpha_ema[:, None] * torch.stack(grads_unit)).sum(0)
        combined = combined_unit * norms.mean()

        # Assign back as .grad and step. The flat vector is sliced in the same
        # parameter order that flat_grad used to build it.
        opt.zero_grad()
        offset = 0
        for p in params:
            n = p.numel()
            p.grad = combined[offset:offset + n].view_as(p).clone()
            offset += n
        opt.step()

        # Also refresh on the final step so the finished bar reports end-of-run
        # metrics instead of whatever the last periodic refresh captured.
        if step % cfg.log_every == 0 or step == last_step:
            pbar.set_postfix(
                **{f"r{j}": f"{per_reward_hist[j][-1]:.3f}" for j in range(J)},
                alpha=f"[{','.join(f'{a:.2f}' for a in alpha_ema.tolist())}]",
            )
    return {"per_reward": per_reward_hist, "alpha": alpha_hist}

def make_radial_reward(center, scale=2.5):
    """Build a Gaussian-bump reward r(x) = exp(-||x - center||^2 / (2 scale^2)).

    Args:
        center: coordinates of the reward peak (e.g. an (x, y) pair).
        scale: width of the bump.

    Returns:
        A callable mapping a [..., 2] tensor to rewards in (0, 1] over the
        last axis. The center is captured once, at construction time, and is
        moved to the dtype and device of each input so the callable is not
        tied to CPU/float32 inputs.
    """
    # Snapshot the center now so later mutation of `center` cannot change the reward.
    center_t = torch.tensor(center, dtype=torch.float32)

    def radial_reward(x: torch.Tensor) -> torch.Tensor:
        c = center_t.to(device=x.device, dtype=x.dtype)
        return torch.exp(-0.5 * ((x - c) ** 2).sum(-1) / scale**2)

    return radial_reward


# Radius at which make_setup places the two reward centers.
R = 4.0
# (setup name, half-spread angle in degrees) for each two-reward experiment.
SETUPS = [
    ("conflicting",     90),
    ("partial overlap", 30),
    ("aligned",          5),
]


def make_setup(phi_deg, scale=2.5):
    """Build the two-reward setup with half-spread `phi_deg` degrees.

    The two reward centers sit on the circle of radius `R` at angles
    +phi and -phi.

    Args:
        phi_deg: half-spread angle in degrees.
        scale: width of both radial rewards; also used for `cross_r` so the
            two always agree.

    Returns:
        (centers, rewards, cross_r) where `centers` is a list of two (x, y)
        tuples, `rewards` the matching radial reward callables, and `cross_r`
        the value of one reward evaluated at the other reward's center.
    """
    phi = math.radians(phi_deg)
    centers = [(R * math.cos(+phi), R * math.sin(+phi)),
               (R * math.cos(-phi), R * math.sin(-phi))]
    rewards = [make_radial_reward(c, scale=scale) for c in centers]
    # The centers are 2 R sin(phi) apart (a chord of the circle).
    cross_r = math.exp(-(2 * R * math.sin(phi))**2 / (2 * scale**2))
    return centers, rewards, cross_r


# Visualize the three reward geometries side-by-side. Each panel shows
# the SUM of the two rewards so both peaks are visible at once.
fig, axes = plt.subplots(1, 3, figsize=(13, 4.2))
for ax, (setup_name, phi_deg) in zip(axes, SETUPS):
    centers, rewards, cross_r = make_setup(phi_deg)
    # Default argument binds this iteration's reward pair into the lambda.
    r_sum = lambda x, rs=rewards: rs[0](x) + rs[1](x)
    plot_density(r_sum, ax,
                 title=f"{setup_name}  ($\\phi=\\pm{phi_deg}°$, cross-r$\\approx${cross_r:.3f})")
    # Mark and label the two reward centers.
    for nm, (cx, cy) in zip(["A", "B"], centers):
        ax.plot(cx, cy, marker="*", markersize=14,
                color="white", markeredgecolor="black")
        ax.text(cx + 0.2, cy + 0.25, nm, color="white", fontsize=11,
                bbox=dict(facecolor="black", alpha=0.5, pad=1))
plt.tight_layout(); plt.show()

# Settings for the multi-reward comparison. BETA is re-assigned here to the
# same value it was given earlier in the file.
BETA = 5.0
STEPS = 1500  # outer steps for every single- and multi-reward run below
reward_names = ["A", "B"]


def fresh_copy_of_ref():
    """Return a new VelocityNet carrying the weights of the module-level `v_ref`.

    A fresh network is constructed (which draws from the torch RNG for its
    initial weights) and then overwritten with `v_ref`'s state dict.
    """
    clone = VelocityNet()
    clone.load_state_dict(v_ref.state_dict())
    return clone


# results[setup_name] = dict(centers, rewards, cross_r, models, scores, samples)
# For every setup, train three models from the reference weights: one
# single-reward RAM model per reward, plus one MGDA model on both rewards.
results = {}
for setup_name, phi_deg in SETUPS:
    centers, rewards, cross_r = make_setup(phi_deg)
    models = {}
    for nm, rf in zip(reward_names, rewards):
        # Identical seed before every run, so runs differ only in their reward(s).
        torch.manual_seed(31)
        m = fresh_copy_of_ref()
        train_ram(m, v_ref, rf, RAMConfig(
            outer_steps=STEPS, group_size=32, k_targets=4,
            beta=BETA, lr=1e-4, log_every=STEPS,
        ))
        models[f"single-{nm}"] = m
    torch.manual_seed(31)
    mm = fresh_copy_of_ref()
    train_mgda_ram(mm, v_ref, rewards, MGDAConfig(
        outer_steps=STEPS, group_size=32, k_targets=4,
        beta=BETA, lr=1e-4, alpha_ema=0.9, log_every=STEPS,
    ))
    models["multi-MGDA"] = mm

    # Evaluate every model on both rewards using a 3000-sample bag.
    # scores[i, j] = mean of reward j over the samples of model i, with models
    # in dict insertion order (single-A, single-B, multi-MGDA).
    scores = np.zeros((len(models), len(rewards)))
    samples = {}
    for i, (name, m) in enumerate(models.items()):
        m.eval()
        s = euler_sample(m, 3000)
        samples[name] = s
        for j, rf in enumerate(rewards):
            scores[i, j] = rf(s).mean().item()
    results[setup_name] = dict(centers=centers, rewards=rewards, cross_r=cross_r,
                               models=models, scores=scores, samples=samples)

RAM (beta=5.0): 100%|██████████| 1500/1500 [00:03<00:00, 418.89it/s, loss=9.554, mean_r=0.229]

RAM (beta=5.0): 100%|██████████| 1500/1500 [00:03<00:00, 434.09it/s, loss=21.375, mean_r=0.251]

MGDA-RAM: 100%|██████████| 1500/1500 [00:05<00:00, 290.90it/s, alpha=[0.50,0.50], r0=0.229, r1=0.251]

RAM (beta=5.0): 100%|██████████| 1500/1500 [00:03<00:00, 440.38it/s, loss=13.735, mean_r=0.247]

RAM (beta=5.0): 100%|██████████| 1500/1500 [00:03<00:00, 429.96it/s, loss=26.599, mean_r=0.337]

# Single table: 9 rows × {rA, rB, min, avg} columns.
# "min" and "avg" are taken across the two rewards for each model.
print(f"{'setup':18s} {'cross_r':>8s}  {'model':12s}  "
      f"{'r_A':>6s}   {'r_B':>6s}   {'min':>6s}    {'avg':>6s}")
print("-" * 70)
for setup_name, _ in SETUPS:
    r = results[setup_name]
    # Iterating the models dict yields names in the same order as the score rows.
    for i, name in enumerate(r["models"]):
        row = r["scores"][i]
        print(f"  {setup_name:16s} {r['cross_r']:8.3f}  {name:12s}  "
              f"{row[0]:6.3f}   {row[1]:6.3f}   {row.min():6.3f}    {row.mean():6.3f}")
    # Blank line between setups.
    print()

setup               cross_r  model            r_A      r_B      min       avg
----------------------------------------------------------------------
  conflicting         0.006  single-A       0.796    0.034    0.034     0.415
  conflicting         0.006  single-B       0.036    0.780    0.036     0.408
  conflicting         0.006  multi-MGDA     0.358    0.364    0.358     0.361

  partial overlap     0.278  single-A       0.704    0.322    0.322     0.513
  partial overlap     0.278  single-B       0.314    0.736    0.314     0.525
  partial overlap     0.278  multi-MGDA     0.535    0.550    0.535     0.542

  aligned             0.962  single-A       0.775    0.765    0.765     0.770
  aligned             0.962  single-B       0.769    0.776    0.769     0.773
  aligned             0.962  multi-MGDA     0.785    0.783    0.783     0.784

# Per-reward bar chart: 1 row × 3 setups. Shared y-axis so the three
# panels can be compared at a glance.
# Model names are read from the first setup; every setup uses the same keys.
model_names = list(results[SETUPS[0][0]]["models"].keys())
x = np.arange(len(model_names))
width = 0.35

fig, axes = plt.subplots(1, 3, figsize=(15, 4.5), sharey=True)
for ax, (setup_name, _) in zip(axes, SETUPS):
    r = results[setup_name]
    sc = r["scores"]
    # Two bars per model: r_A to the left of the tick, r_B to the right.
    ax.bar(x - width / 2, sc[:, 0], width, label=r"$r_A$", color="C0")
    ax.bar(x + width / 2, sc[:, 1], width, label=r"$r_B$", color="C1")
    # Numeric label just above each bar.
    for xi, v in zip(x - width / 2, sc[:, 0]):
        ax.text(xi, v + 0.015, f"{v:.2f}", ha="center", fontsize=8)
    for xi, v in zip(x + width / 2, sc[:, 1]):
        ax.text(xi, v + 0.015, f"{v:.2f}", ha="center", fontsize=8)
    ax.set_xticks(x); ax.set_xticklabels(model_names, rotation=10)
    ax.set_ylim(0, 0.92)
    ax.set_title(f"{setup_name}  (cross-r $\\approx$ {r['cross_r']:.3f})")
axes[0].set_ylabel("mean reward over 3000 samples")
axes[0].legend(loc="upper right")
plt.tight_layout(); plt.show()

# 3 × 3 grid of endpoint density heatmaps: rows = setups, cols = models.
fig, axes = plt.subplots(3, 3, figsize=(11.5, 11.5))
for row, (setup_name, _) in enumerate(SETUPS):
    r = results[setup_name]
    for col, (name, samples) in enumerate(r["samples"].items()):
        ax = axes[row, col]
        # Default argument binds this panel's samples into the density closure.
        def density(g, s=samples):
            p = kde_density(g, s, h=0.15)
            return p / (p.sum() + 1e-12)
        plot_density(density, ax, title=f"{setup_name} — {name}")
        # Mark and label the two reward centers on every panel.
        for (cx, cy), nm in zip(r["centers"], ["A", "B"]):
            ax.plot(cx, cy, marker="*", markersize=12,
                    color="white", markeredgecolor="black")
            ax.text(cx + 0.2, cy + 0.25, nm, color="white", fontsize=9,
                    bbox=dict(facecolor="black", alpha=0.5, pad=1))
plt.tight_layout(); plt.show()

symbol	meaning	how we touch it in code
$p_{\text{ref}}$	What the pretrained model actually produces (close to the ring, but imperfect). This is what RAM regularizes against.	`euler_sample(v_ref, n)` — draw samples
$p_{\text{target}}$	$\propto p_{\text{ref}} \cdot \exp(\beta\, r)$. The KL-optimal post-trained density we're aiming for.	evaluate the density on a 2D grid for plotting; no direct sampler — building one is what RAM is for

setup	$\phi$	peak separation	cross-reward $\approx$
conflicting	$90°$	$180°$	$0.006$
partial overlap	$30°$	$60°$	$0.28$
aligned	$5°$	$10°$	$0.96$

Reinforce Adjoint Matching (RAM) Explained¶

0. Setup¶

1. Two stages of generative learning¶

2. Stage 1: flow-matching pretraining on a 2D toy¶

Flow matching, in three steps¶

Velocity model¶

The training loop¶

Sampling from the pretrained model¶

3. The RL post-training problem¶

4. Building RAM in three pieces¶

4.1 Adjoint matching: the math of the optimal correction¶

4.2 REINFORCE: gradients of expectations without gradients of the integrand¶

4.3 Assembling the RAM loss¶

🎯 Finally, the RAM loss¶

5. Implementation¶

6. Watching RAM tilt the ring¶

7. Ablation studies¶

7.1 The reference anchor — $\beta = 0$ should be a no-op¶

7.2 Tilting strength — bigger $\beta$ tilts harder¶

7.3 Reuse — more $(t, \varepsilon)$ draws per endpoint help¶

7.4 Group-relative advantage — variance reduction¶

7.5 The other side of the advantage signal — over-training collapse¶

8. RAM with multiple rewards¶

8.1 MGDA: per-reward gradients + Frank-Wolfe combiner¶

8.2 Three reward setups: from conflicting to aligned¶

8.3 Train all 9 models and compare¶

8.4 Takeaway¶

9. Recap¶

Why RAM is a promising direction for diffusion RL¶