from __future__ import annotations

import math
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import Markdown, display

# Fix the global PyTorch and NumPy seeds used by the sampling code below.
torch.manual_seed(0)
np.random.seed(0)
# Newly created floating-point tensors default to float32.
torch.set_default_dtype(torch.float32)

# The tqdm loops in this file are created with ``disable=not SHOW_PROGRESS``.
SHOW_PROGRESS = False


def display_markdown_table(headers, rows):
    """Show ``headers`` and ``rows`` as a Markdown pipe table via IPython.

    Args:
        headers: Column titles; joined as-is, so they must already be strings.
        rows: Iterable of rows; every cell is converted with ``str``.
    """

    def as_row(cells):
        return "| " + " | ".join(cells) + " |"

    table = [as_row(headers), as_row(["---"] * len(headers))]
    for row in rows:
        table.append(as_row([str(cell) for cell in row]))
    display(Markdown("\n".join(table)))


# Report the installed PyTorch version.
print("torch:", torch.__version__)

torch: 2.11.0

def ring_centers(modes: int = 8, radius: float = 4.0):
    """Return the ``(modes, 2)`` tensor of mode centres on a circle.

    Centre ``k`` lies at angle ``k * (2 * pi / modes)`` on a circle of the
    given ``radius`` around the origin.
    """
    step = 2 * math.pi / modes
    theta = torch.arange(modes) * step
    xs = radius * torch.cos(theta)
    ys = radius * torch.sin(theta)
    return torch.stack([xs, ys], dim=-1)


def sample_ring(
    n: int,
    *,
    modes: int = 8,
    radius: float = 4.0,
    sigma: float = 0.15,
    return_labels: bool = False,
):
    """Draw ``n`` points from a mixture of Gaussians centred on the ring.

    A mode index is drawn uniformly for each point, then isotropic Gaussian
    noise with standard deviation ``sigma`` is added to that mode's centre.

    Returns:
        The ``(n, 2)`` samples, or ``(samples, labels)`` when
        ``return_labels`` is true, where ``labels`` holds the mode indices.
    """
    # Labels are drawn before the noise; keep this order so seeded runs match.
    labels = torch.randint(0, modes, (n,))
    mode_centers = ring_centers(modes, radius)[labels]
    samples = mode_centers + sigma * torch.randn(n, 2)
    return (samples, labels) if return_labels else samples


def sample_source(n: int) -> torch.Tensor:
    """Draw ``n`` points from the 2-D standard normal source distribution."""
    shape = (n, 2)
    return torch.randn(shape)


def plot_samples(
    samples,
    ax=None,
    *,
    title=None,
    color="C0",
    s=6,
    alpha=0.45,
    c=None,
    cmap=None,
    xlim=(-5, 5),
    ylim=(-5, 5),
):
    """Scatter-plot 2-D ``samples`` on ``ax`` and return the axes.

    A new 4x4-inch figure is created when ``ax`` is None. When ``c`` is given
    the points are coloured per-point through ``c``/``cmap``; otherwise the
    single ``color`` is used. Axis limits are set from ``xlim``/``ylim`` with
    an equal aspect ratio; ``title`` is applied only when truthy.
    """
    if ax is None:
        ax = plt.subplots(figsize=(4, 4))[1]

    xy = samples.detach().cpu()
    colour_kwargs = {"color": color} if c is None else {"c": c, "cmap": cmap}
    ax.scatter(xy[:, 0], xy[:, 1], s=s, alpha=alpha, **colour_kwargs)

    ax.set_xlim(*xlim)
    ax.set_ylim(*ylim)
    ax.set_aspect("equal")
    if title:
        ax.set_title(title)
    return ax


def make_grid(n_grid=75, xlim=(-5, 5), ylim=(-5, 5)):
    """Build an ``n_grid`` x ``n_grid`` evaluation grid.

    Returns:
        ``(pts, X, Y)`` where ``X`` and ``Y`` are the ``"ij"``-indexed
        meshgrid tensors and ``pts`` is their flattened ``(n_grid**2, 2)``
        stack of ``(x, y)`` coordinates.
    """
    x_axis = torch.linspace(xlim[0], xlim[1], n_grid)
    y_axis = torch.linspace(ylim[0], ylim[1], n_grid)
    X, Y = torch.meshgrid(x_axis, y_axis, indexing="ij")
    flat_xy = [X.reshape(-1), Y.reshape(-1)]
    return torch.stack(flat_xy, dim=-1), X, Y


def kde_density(grid_pts, samples, h=0.16):
    """Evaluate an unnormalised Gaussian kernel density at ``grid_pts``.

    For each grid point, returns the mean over ``samples`` of
    ``exp(-0.5 * squared_distance / h**2)``; no normalising constant is
    applied.
    """
    offsets = grid_pts.unsqueeze(1) - samples.unsqueeze(0)
    sq_dist = offsets.pow(2).sum(dim=-1)
    kernel = torch.exp(-0.5 * sq_dist / h**2)
    return kernel.mean(dim=-1)


def normalize_density(z):
    """Scale ``z`` so its entries sum to (almost exactly) one.

    A ``1e-12`` offset in the denominator avoids division by zero when ``z``
    is all zeros.
    """
    total = z.sum() + 1e-12
    return z / total


def plot_density_from_samples(samples, ax=None, *, title=None, n_grid=75):
    """Draw a KDE heat-map of ``samples`` over the square [-5, 5] x [-5, 5].

    The density is evaluated on the default ``make_grid`` window with
    ``kde_density``, normalised to sum to one, and shown with ``imshow``.
    A new 4x4-inch figure is created when ``ax`` is None. Returns the axes.
    """
    if ax is None:
        ax = plt.subplots(figsize=(4, 4))[1]

    grid_pts = make_grid(n_grid=n_grid)[0]
    density = normalize_density(kde_density(grid_pts, samples))
    image = density.reshape(n_grid, n_grid)
    # The grid is "ij"-indexed (x along rows), so transpose for imshow.
    ax.imshow(image.T.cpu().numpy(), origin="lower", extent=(-5, 5, -5, 5), cmap="viridis")

    ax.set_xlim(-5, 5)
    ax.set_ylim(-5, 5)
    ax.set_aspect("equal")
    if title:
        ax.set_title(title)
    return ax


def sliced_wasserstein(samples_a, samples_b, *, n_projections=128, seed=0):
    """Estimate the sliced 2-Wasserstein distance between two sample sets.

    Both sets are truncated to their common length ``n`` (the first ``n`` rows
    are kept), projected onto ``n_projections`` random unit directions, and
    sorted per direction. The result is the root-mean-square difference of
    the sorted projections.

    Args:
        samples_a: ``(N_a, D)`` tensor of samples.
        samples_b: ``(N_b, D)`` tensor of samples with the same ``D``.
        n_projections: Number of random projection directions.
        seed: Seed of the private generator used to draw the directions, so
            repeated calls with the same seed use the same directions.

    Returns:
        The distance as a Python float.
    """
    n = min(len(samples_a), len(samples_b))
    a = samples_a[:n]
    b = samples_b[:n]

    # Draw directions in the data's own dimensionality rather than assuming
    # 2-D; for 2-D inputs this yields the same directions as before.
    dim = a.shape[-1]
    gen = torch.Generator().manual_seed(seed)
    directions = torch.randn(n_projections, dim, generator=gen)
    directions = directions / directions.norm(dim=1, keepdim=True).clamp_min(1e-12)
    # Match the samples' dtype/device so the projections below do not fail
    # for e.g. float64 inputs.
    directions = directions.to(a)

    pa = torch.sort(a @ directions.T, dim=0).values
    pb = torch.sort(b @ directions.T, dim=0).values
    return torch.sqrt(((pa - pb) ** 2).mean()).item()


def assign_ring_modes(x):
    """Return, for each row of ``x``, the index of the nearest ring centre.

    Uses the default ``ring_centers()`` configuration, moved to the dtype and
    device of ``x``.
    """
    centers = ring_centers().to(x)
    offsets = x.unsqueeze(1) - centers.unsqueeze(0)
    sq_dist = offsets.pow(2).sum(dim=-1)
    return torch.argmin(sq_dist, dim=-1)


def mode_js(samples_a, samples_b, eps=1e-8):
    """Jensen-Shannon divergence between the mode histograms of two sample sets.

    Each sample is assigned to its nearest ring mode; the two 8-bin
    histograms are normalised and compared. Inside the KL terms both
    arguments are smoothed with ``eps`` so empty bins do not produce
    infinities. Returns a Python float.
    """

    def mode_histogram(samples):
        counts = torch.bincount(assign_ring_modes(samples), minlength=8).float()
        return counts / counts.sum()

    def smoothed(p):
        return (p + eps) / (p.sum() + eps * p.numel())

    def kl(p, q):
        p, q = smoothed(p), smoothed(q)
        return (p * (p.log() - q.log())).sum()

    pa = mode_histogram(samples_a)
    pb = mode_histogram(samples_b)
    mixture = 0.5 * (pa + pb)
    divergence = 0.5 * kl(pa, mixture) + 0.5 * kl(pb, mixture)
    return divergence.item()

# Draw 3000 labelled ring samples and 3000 source-noise samples for the
# overview figure. The ring batch is drawn first; the order fixes which
# random numbers each batch receives.
ring_samples, ring_labels = sample_ring(3000, return_labels=True)
noise_samples = sample_source(3000)

# Left panel: source noise. Right panel: ring data coloured by mode label.
fig, axes = plt.subplots(1, 2, figsize=(8.5, 4))
plot_samples(noise_samples, axes[0], title="source: Gaussian noise", color="0.25")
plot_samples(
    ring_samples,
    axes[1],
    title="data: 8-mode ring",
    c=ring_labels.numpy(),
    cmap="tab10",
)
plt.tight_layout()
plt.show()

class TimeEmbedding(nn.Module):
    """Sinusoidal features of a batch of scalar times.

    The frequencies ``pi * 2**k`` for ``k = 0 .. n_freqs - 1`` are stored in
    the (non-trainable) buffer ``freqs``.
    """

    def __init__(self, n_freqs: int = 8):
        super().__init__()
        exponents = torch.arange(n_freqs).float()
        self.register_buffer("freqs", 2 ** exponents * math.pi)

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Map times of shape ``(B,)`` to ``(B, 2 * n_freqs)`` sin/cos features."""
        phase = t.unsqueeze(1) * self.freqs.unsqueeze(0)
        return torch.cat([phase.sin(), phase.cos()], dim=-1)


class VelocityNet(nn.Module):
    """MLP velocity field ``v(x, t)`` for 2-D points.

    The input is the point concatenated with its sinusoidal time embedding;
    the network has two hidden SiLU layers of width ``hidden`` and a 2-D
    output.
    """

    def __init__(self, hidden: int = 192, n_freqs: int = 8):
        super().__init__()
        self.time_emb = TimeEmbedding(n_freqs)
        # 2 coordinates plus one sin and one cos feature per frequency.
        in_dim = 2 + 2 * n_freqs
        layers = [
            nn.Linear(in_dim, hidden),
            nn.SiLU(),
            nn.Linear(hidden, hidden),
            nn.SiLU(),
            nn.Linear(hidden, 2),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        """Return the predicted velocity for points ``x`` at times ``t``."""
        features = torch.cat([x, self.time_emb(t)], dim=-1)
        return self.net(features)


def num_params(model: nn.Module) -> int:
    """Count the scalar entries across all parameters of ``model``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total


def train_flow_matching(
    model: VelocityNet,
    *,
    steps: int = 8000,
    batch_size: int = 256,
    lr: float = 1e-3,
):
    """Train ``model`` in place with the flow-matching regression loss.

    Each step draws ring data ``x0``, source noise ``eps`` and a uniform time
    ``t``, forms the linear interpolant ``(1 - t) * x0 + t * eps`` and
    regresses the model output onto the constant velocity ``eps - x0``
    with Adam.

    Returns:
        List of per-step loss values (Python floats).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    history = []
    progress = tqdm(range(steps), desc="teacher flow matching", disable=not SHOW_PROGRESS, leave=False)
    for _ in progress:
        # Draw order (data, noise, time) determines the random stream.
        data = sample_ring(batch_size)
        noise = sample_source(batch_size)
        t = torch.rand(batch_size)

        t_col = t[:, None]
        x_t = (1 - t_col) * data + t_col * noise
        velocity_target = noise - data

        residual = model(x_t, t) - velocity_target
        loss = (residual ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        history.append(loss.item())
    return history


@torch.no_grad()
def euler_sample_velocity(
    model: VelocityNet,
    start: torch.Tensor,
    *,
    n_steps: int = 250,
):
    """Integrate the velocity field from t=1 down to t=0 with explicit Euler.

    ``model`` is switched to eval mode. Starting from a copy of ``start``,
    each of the ``n_steps`` uniform steps evaluates the velocity at the
    current time ``1 - k / n_steps`` and moves the points by ``-v * dt``.

    Returns:
        The integrated points; ``start`` itself is not modified.
    """
    model.eval()
    state = start.clone()
    batch = state.shape[0]
    dt = 1.0 / n_steps
    step_times = [1.0 - k * dt for k in range(n_steps)]
    for t_value in step_times:
        t_now = torch.full((batch,), t_value)
        state = state - model(state, t_now) * dt
    return state


# Train the velocity-field teacher with the default settings, then freeze it:
# eval mode and no gradients on any parameter.
v_teacher = VelocityNet()
teacher_losses = train_flow_matching(v_teacher)
v_teacher.eval()
for p in v_teacher.parameters():
    p.requires_grad_(False)

# Summary table: model size and number of recorded training steps.
display_markdown_table(
    ["Quantity", "Value"],
    [
        ["Velocity model parameters", f"{num_params(v_teacher):,}"],
        ["Training steps", f"{len(teacher_losses):,}"],
    ],
)

# Two-panel figure; the raw per-step loss goes on the left panel first.
fig, axes = plt.subplots(1, 2, figsize=(10, 3.6))
axes[0].plot(teacher_losses, color="0.8", alpha=0.35, label="raw")

def moving_average(values, window=100):
    """Return the ``window``-point running mean of ``values``.

    Only fully overlapping windows are kept (``mode="valid"``), so the result
    has ``len(values) - window + 1`` entries. Inputs shorter than ``window``
    are returned unsmoothed as a NumPy array.
    """
    series = np.asarray(values)
    if len(series) >= window:
        weights = np.ones(window) / window
        return np.convolve(series, weights, mode="valid")
    return series


# Overlay the smoothed loss on the left panel and use a log-scaled y axis.
axes[0].plot(moving_average(teacher_losses, window=100), color="C0", label="moving average")
axes[0].set_title("teacher loss (100-point moving average)")
axes[0].set_xlabel("step")
axes[0].set_ylabel("MSE")
axes[0].set_yscale("log")
axes[0].legend()

# Shared evaluation batches used by all metric tables below.
EVAL_NOISE = sample_source(5000)
TRUE_RING_EVAL = sample_ring(5000)
# Seed passed to sliced_wasserstein so every metric uses the same directions.
SLICED_W_SEED = 77
teacher_samples_250 = euler_sample_velocity(v_teacher, EVAL_NOISE, n_steps=250)

# Right panel: the first 3000 teacher samples from the 250-step sampler.
plot_samples(teacher_samples_250[:3000], axes[1], title="teacher samples (250 Euler steps)", color="teal")
plt.tight_layout()
plt.show()

display_markdown_table(
    ["Check", "Value"],
    [
        ["Sliced W: teacher vs true ring", f"{sliced_wasserstein(teacher_samples_250, TRUE_RING_EVAL, seed=SLICED_W_SEED):.4f}"],
    ],
)

# Re-run the Euler sampler from the same noise with fewer steps and score
# each result against the true ring samples.
teacher_eval_rows = []
for nfe in [250, 8, 4, 2]:
    samples = euler_sample_velocity(v_teacher, EVAL_NOISE, n_steps=nfe)
    sw_ring = sliced_wasserstein(samples, TRUE_RING_EVAL, seed=SLICED_W_SEED)
    teacher_eval_rows.append([
        f"Euler velocity ({nfe} NFE)",
        nfe,
        f"{sw_ring:.4f}",
    ])

display_markdown_table(
    [
        "Sampler",
        "NFE",
        "Sliced W vs true ring",
    ],
    teacher_eval_rows,
)

# Visual check: one panel per step count, each from a fresh 2500-point noise batch.
fig, axes = plt.subplots(1, 4, figsize=(14, 3.6))
for ax, nfe in zip(axes, [250, 8, 4, 2]):
    samples = euler_sample_velocity(v_teacher, sample_source(2500), n_steps=nfe)
    plot_samples(samples, ax, title=f"Euler velocity\n{nfe} NFE", color="teal")
plt.tight_layout()
plt.show()

class FlowMapNet(nn.Module):
    """Two-time flow map ``F(x, s, t) = x + (t - s) * V(x, s, t)``.

    ``V`` (``average_velocity``) is an MLP over the point and the sinusoidal
    embeddings of both the start time ``s`` and the end time ``t``. By
    construction ``F(x, s, s) == x``.
    """

    def __init__(self, hidden: int = 192, n_freqs: int = 8):
        super().__init__()
        self.time_emb = TimeEmbedding(n_freqs)
        # 2 coordinates plus sin/cos features for each of the two times.
        in_dim = 2 + 4 * n_freqs
        layers = [
            nn.Linear(in_dim, hidden),
            nn.SiLU(),
            nn.Linear(hidden, hidden),
            nn.SiLU(),
            nn.Linear(hidden, 2),
        ]
        self.net = nn.Sequential(*layers)

    def average_velocity(self, x: torch.Tensor, s: torch.Tensor, t: torch.Tensor):
        """Return the network output ``V(x, s, t)``."""
        features = torch.cat([x, self.time_emb(s), self.time_emb(t)], dim=-1)
        return self.net(features)

    def forward(self, x: torch.Tensor, s: torch.Tensor, t: torch.Tensor):
        """Move points ``x`` from time ``s`` to time ``t``."""
        elapsed = (t - s)[:, None]
        return x + elapsed * self.average_velocity(x, s, t)


@torch.no_grad()
def sample_flow_map(model: FlowMapNet, start: torch.Tensor, *, n_steps: int):
    """Push ``start`` from t=1 to t=0 with ``n_steps`` flow-map jumps.

    The interval [1, 0] is split uniformly; each jump calls
    ``model(x, s, t)`` with the current and next time broadcast over the
    batch. ``start`` is cloned, not modified.
    """
    points = start.clone()
    batch = points.shape[0]
    grid = torch.linspace(1.0, 0.0, n_steps + 1).tolist()
    for t_from, t_to in zip(grid[:-1], grid[1:]):
        s = torch.full((batch,), t_from)
        t = torch.full((batch,), t_to)
        points = model(points, s, t)
    return points


# Report the size of a default FlowMapNet; a throwaway instance is built
# here just to count its parameters.
display_markdown_table(
    ["Quantity", "Value"],
    [["Flow-map parameters", f"{num_params(FlowMapNet()):,}"]],
)

def update_ema_model(ema_model: FlowMapNet, model: FlowMapNet, decay: float = 0.995):
    """Blend ``model``'s parameters into ``ema_model`` in place.

    Each EMA parameter becomes ``decay * ema + (1 - decay) * param``.
    Parameters are paired by iteration order; buffers are not touched.
    """
    blend = 1 - decay
    with torch.no_grad():
        for averaged, live in zip(ema_model.parameters(), model.parameters()):
            averaged.mul_(decay)
            averaged.add_(live, alpha=blend)


def make_ema_model(model: FlowMapNet) -> FlowMapNet:
    """Create a frozen, eval-mode copy of ``model`` to serve as EMA target.

    The copy is a default-constructed ``FlowMapNet`` loaded with ``model``'s
    state dict, so ``model`` must use the default architecture for the load
    to succeed. All parameters of the copy have gradients disabled.
    """
    frozen = FlowMapNet()
    frozen.load_state_dict(model.state_dict())
    frozen.eval()
    frozen.requires_grad_(False)
    return frozen


def sample_data_interpolant_batch(batch_size: int = 512):
    """Sample interpolant points with an ordered time triple ``t <= m <= s``.

    Draws ring data ``x0`` and noise ``eps``, a start time ``s`` in
    [0.05, 1), an end time ``t`` in [0, s), and an intermediate time ``m``
    placed 20%-80% of the way from ``t`` to ``s``. ``x_s`` is the linear
    interpolant ``(1 - s) * x0 + s * eps``.

    Returns:
        ``(x0, eps, x_s, s, m, t)``.
    """
    # The draw order below fixes the random stream; do not reorder.
    x0 = sample_ring(batch_size)
    eps = sample_source(batch_size)
    s = 0.05 + 0.95 * torch.rand(batch_size)
    t = s * torch.rand(batch_size)
    fraction = 0.2 + 0.6 * torch.rand(batch_size)
    m = t + fraction * (s - t)

    s_col = s[:, None]
    x_s = (1 - s_col) * x0 + s_col * eps
    return x0, eps, x_s, s, m, t


def diagonal_teacher_loss(model: FlowMapNet, batch_size: int = 512):
    """MSE between the map's diagonal velocity and the teacher velocity.

    Compares ``model.average_velocity(x_s, s, s)`` with the module-level
    ``v_teacher`` evaluated (without gradients) at the same points and times.
    """
    batch = sample_data_interpolant_batch(batch_size)
    x_s, s = batch[2], batch[3]
    with torch.no_grad():
        teacher_velocity = v_teacher(x_s, s)
    diagonal_velocity = model.average_velocity(x_s, s, s)
    return ((diagonal_velocity - teacher_velocity) ** 2).mean()


def route_a_semigroup_loss(model: FlowMapNet, target_model: FlowMapNet, batch_size: int = 512):
    """Semigroup loss: one jump ``s -> t`` should match two jumps ``s -> m -> t``.

    The two-jump target comes from ``target_model`` with gradients disabled.
    """
    _, _, x_s, s, m, t = sample_data_interpolant_batch(batch_size)
    with torch.no_grad():
        midpoint = target_model(x_s, s, m)
        two_jump = target_model(midpoint, m, t)
    one_jump = model(x_s, s, t)
    return ((one_jump - two_jump) ** 2).mean()


def route_a_lmd_loss(model: FlowMapNet, target_model: FlowMapNet, batch_size: int = 512):
    """LMD loss: target-map jump to ``m`` followed by one teacher Euler step.

    The target is ``x_m + (t - m) * v_teacher(x_m, m)`` with
    ``x_m = target_model(x_s, s, m)``, computed without gradients; the
    module-level ``v_teacher`` supplies the velocity.
    """
    _, _, x_s, s, m, t = sample_data_interpolant_batch(batch_size)
    with torch.no_grad():
        x_m = target_model(x_s, s, m)
        teacher_step = (t - m)[:, None] * v_teacher(x_m, m)
        endpoint_target = x_m + teacher_step
    prediction = model(x_s, s, t)
    return ((prediction - endpoint_target) ** 2).mean()


def route_a_emd_loss(model: FlowMapNet, target_model: FlowMapNet, batch_size: int = 512):
    """EMD loss: one teacher Euler step to ``m`` followed by the target map.

    The target is ``target_model(x_m, m, t)`` with
    ``x_m = x_s + (m - s) * v_teacher(x_s, s)``, computed without gradients;
    the module-level ``v_teacher`` supplies the velocity.
    """
    _, _, x_s, s, m, t = sample_data_interpolant_batch(batch_size)
    with torch.no_grad():
        teacher_step = (m - s)[:, None] * v_teacher(x_s, s)
        x_m = x_s + teacher_step
        endpoint_target = target_model(x_m, m, t)
    prediction = model(x_s, s, t)
    return ((prediction - endpoint_target) ** 2).mean()


def train_route_a(
    kind: str,
    *,
    steps: int = 6000,
    batch_size: int = 512,
    diag_fraction: float = 0.3,
):
    """Distil the teacher into a new ``FlowMapNet`` with a Route A objective.

    Each step combines the diagonal teacher loss (weight ``diag_fraction``)
    with the consistency loss selected by ``kind`` (weight
    ``1 - diag_fraction``); the batch is split between the two terms in the
    same proportion. An EMA copy of the model provides the consistency
    targets and is updated after every optimiser step.

    Args:
        kind: ``"semigroup"``, ``"lmd"`` or ``"emd"``.

    Returns:
        The trained model in eval mode.

    Raises:
        ValueError: On the first training step if ``kind`` is not recognised.
    """
    consistency_losses = {
        "semigroup": route_a_semigroup_loss,
        "lmd": route_a_lmd_loss,
        "emd": route_a_emd_loss,
    }

    model = FlowMapNet()
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    target_model = make_ema_model(model)

    diag_bs = max(1, int(batch_size * diag_fraction))
    offdiag_bs = batch_size - diag_bs

    for _ in tqdm(range(steps), desc=f"Route A {kind}", disable=not SHOW_PROGRESS, leave=False):
        diagonal_term = diag_fraction * diagonal_teacher_loss(model, batch_size=diag_bs)

        if kind not in consistency_losses:
            raise ValueError(kind)
        consistency = consistency_losses[kind](model, target_model, batch_size=offdiag_bs)

        loss = diagonal_term + (1 - diag_fraction) * consistency
        opt.zero_grad()
        loss.backward()
        opt.step()
        update_ema_model(target_model, model)

    model.eval()
    return model


# Seed re-applied before every distillation run so each method starts from
# the same random state.
METHOD_SEED = 41


torch.manual_seed(METHOD_SEED)
route_a_semigroup = train_route_a("semigroup")
torch.manual_seed(METHOD_SEED)
route_a_lmd = train_route_a("lmd")
torch.manual_seed(METHOD_SEED)
route_a_emd = train_route_a("emd")

# Static summary of the three Route A objectives.
display_markdown_table(
    ["Route A method", "Uses full teacher trajectories?", "Training signal"],
    [
        ["Semigroup", "no", "EMA target: two jumps match one jump"],
        ["LMD", "no", "EMA target + one local teacher step at predicted endpoint"],
        ["EMD", "no", "teacher-moved source + EMA target map"],
    ],
)

def diagonal_data_loss(model: FlowMapNet, batch_size: int = 256):
    """Flow-matching loss on the diagonal ``s == t``, using data only.

    Regresses ``model.average_velocity(x_s, s, s)`` onto the interpolant
    velocity ``eps - x0``.
    """
    x0, eps, x_s, s, _, _ = sample_data_interpolant_batch(batch_size)
    diagonal_velocity = model.average_velocity(x_s, s, s)
    residual = diagonal_velocity - (eps - x0)
    return (residual ** 2).mean()


def route_b_psd_loss(model: FlowMapNet, target_model: FlowMapNet, batch_size: int = 256):
    """Self-distilled semigroup loss: one jump should equal two target jumps.

    The target ``s -> m -> t`` is produced by ``target_model`` without
    gradients and compared with ``model``'s direct ``s -> t`` jump.
    """
    _, _, x_s, s, m, t = sample_data_interpolant_batch(batch_size)
    with torch.no_grad():
        midpoint = target_model(x_s, s, m)
        two_jump = target_model(midpoint, m, t)
    one_jump = model(x_s, s, t)
    return ((one_jump - two_jump) ** 2).mean()


def route_b_lsd_loss(model: FlowMapNet, target_model: FlowMapNet, batch_size: int = 128):
    """Lagrangian self-distillation loss.

    Matches the derivative of ``F(x_s, s, t)`` with respect to the end time
    ``t`` to the target model's diagonal velocity at the predicted endpoint.
    """
    _, _, x_s, s, _, t = sample_data_interpolant_batch(batch_size)

    # Lagrangian view: hold (x_s, s) fixed and treat the map as a function of
    # the end time only; a unit tangent in t yields dF/dt.
    def endpoint_at(t_end):
        return model(x_s, s, t_end)

    unit_tangent = torch.ones_like(t)
    endpoint, time_derivative = torch.func.jvp(endpoint_at, (t,), (unit_tangent,))

    with torch.no_grad():
        # On the diagonal, average velocity V(x, t, t) acts like the local velocity.
        velocity_target = target_model.average_velocity(endpoint.detach(), t, t)
    return ((time_derivative - velocity_target) ** 2).mean()


def route_b_esd_loss(model: FlowMapNet, target_model: FlowMapNet, batch_size: int = 128):
    """Eulerian self-distillation loss.

    Penalises the directional derivative of ``F(x_s, s, t)`` along
    ``(v, 1)`` in ``(x_s, s)``, where ``v`` is the target model's diagonal
    velocity at ``(x_s, s)``.
    """
    _, _, x_s, s, _, t = sample_data_interpolant_batch(batch_size)
    with torch.no_grad():
        drift = target_model.average_velocity(x_s, s, s)

    # Eulerian view: hold the end time fixed and vary the source (x_s, s).
    # Nudging it by (v, 1) gives ∇_x F · v + ∂_s F, which should be zero for a
    # consistent map.
    def endpoint_from(x_start, s_start):
        return model(x_start, s_start, t)

    primals = (x_s, s)
    tangents = (drift, torch.ones_like(s))
    _, transport_residual = torch.func.jvp(endpoint_from, primals, tangents)
    return (transport_residual ** 2).mean()


def train_route_b(
    kind: str,
    *,
    steps: int = 6000,
    batch_size: int = 512,
    diag_fraction: float = 0.3,
):
    """Train a new ``FlowMapNet`` from data with a Route B objective.

    Each step combines the diagonal data loss (weight ``diag_fraction``) with
    the self-distillation loss selected by ``kind`` (weight
    ``1 - diag_fraction``). The JVP-based losses use a quarter of the
    off-diagonal batch (at least 64) and an extra scale factor: 0.02 for
    ``"lsd"`` and 0.1 for ``"esd"``. An EMA copy of the model provides the
    targets and is updated after every optimiser step.

    Args:
        kind: ``"psd"``, ``"lsd"`` or ``"esd"``.

    Returns:
        The trained model in eval mode.

    Raises:
        ValueError: On the first training step if ``kind`` is not recognised.
    """
    model = FlowMapNet()
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    target_model = make_ema_model(model)

    diag_bs = max(1, int(batch_size * diag_fraction))
    offdiag_bs = batch_size - diag_bs
    jvp_bs = max(64, offdiag_bs // 4)

    # kind -> (loss function, extra scale or None, batch size)
    recipes = {
        "psd": (route_b_psd_loss, None, offdiag_bs),
        "lsd": (route_b_lsd_loss, 0.02, jvp_bs),
        "esd": (route_b_esd_loss, 0.1, jvp_bs),
    }

    for _ in tqdm(range(steps), desc=f"Route B {kind}", disable=not SHOW_PROGRESS, leave=False):
        diagonal_term = diag_fraction * diagonal_data_loss(model, batch_size=diag_bs)

        if kind not in recipes:
            raise ValueError(kind)
        loss_fn, scale, loss_bs = recipes[kind]
        consistency = loss_fn(model, target_model, batch_size=loss_bs)
        if scale is not None:
            consistency = scale * consistency

        loss = diagonal_term + (1 - diag_fraction) * consistency
        opt.zero_grad()
        loss.backward()
        opt.step()
        update_ema_model(target_model, model)

    model.eval()
    return model


# Train the three Route B variants, re-seeding before each run so they
# start from the same random state.
torch.manual_seed(METHOD_SEED)
route_b_psd = train_route_b("psd")
torch.manual_seed(METHOD_SEED)
route_b_lsd = train_route_b("lsd")
torch.manual_seed(METHOD_SEED)
route_b_esd = train_route_b("esd")

# Static summary of the three Route B objectives.
display_markdown_table(
    ["Route B method", "Separate teacher?", "Key mechanism"],
    [
        ["PSD", "no", "semigroup / progressive self-distillation"],
        ["LSD", "no", "Lagrangian JVP: $\\partial_t F$"],
        ["ESD", "no", "Eulerian JVP: $\\partial_s F + \\nabla_xFv$"],
    ],
)

# Every method is evaluated from the same noise batch against the same
# true-ring samples.
comparison_noise = EVAL_NOISE
comparison_ring = TRUE_RING_EVAL
nfe_values = [1, 2, 4, 8]

# (display name, sampler taking a step count, plot colour) per method.
method_specs = [
    ("Coarse Euler", lambda nfe: euler_sample_velocity(v_teacher, comparison_noise, n_steps=nfe), "C3"),
    ("Route A: Semigroup", lambda nfe: sample_flow_map(route_a_semigroup, comparison_noise, n_steps=nfe), "C1"),
    ("Route A: LMD", lambda nfe: sample_flow_map(route_a_lmd, comparison_noise, n_steps=nfe), "C5"),
    ("Route A: EMD", lambda nfe: sample_flow_map(route_a_emd, comparison_noise, n_steps=nfe), "C6"),
    ("Route B: PSD", lambda nfe: sample_flow_map(route_b_psd, comparison_noise, n_steps=nfe), "C0"),
    ("Route B: LSD", lambda nfe: sample_flow_map(route_b_lsd, comparison_noise, n_steps=nfe), "C2"),
    ("Route B: ESD", lambda nfe: sample_flow_map(route_b_esd, comparison_noise, n_steps=nfe), "C4"),
]

# One metrics record per (method, step count) pair.
comparison_metrics = []
for method_name, sampler, color in method_specs:
    for nfe in nfe_values:
        samples = sampler(nfe)
        sw = sliced_wasserstein(samples, comparison_ring, seed=SLICED_W_SEED)
        comparison_metrics.append({
            "method": method_name,
            "nfe": nfe,
            "sw": sw,
            "color": color,
        })

# Lowest distance per step count, used to bold the winner in each column.
best_sw_by_nfe = {
    nfe: min(row["sw"] for row in comparison_metrics if row["nfe"] == nfe)
    for nfe in nfe_values
}

comparison_rows = []
for method_name, _, _ in method_specs:
    row = [method_name]
    for nfe in nfe_values:
        metric = next(r for r in comparison_metrics if r["method"] == method_name and r["nfe"] == nfe)
        sw_text = f"{metric['sw']:.4f}"
        # Tolerance comparison so the best entry (and exact ties) are bolded.
        if abs(metric["sw"] - best_sw_by_nfe[nfe]) < 1e-8:
            sw_text = f"**{sw_text}**"
        row.append(sw_text)
    comparison_rows.append(row)

display_markdown_table(
    ["Method"] + [f"{nfe} NFE" for nfe in nfe_values],
    comparison_rows,
)

# Line plot of distance versus step count, one line per method.
fig, ax = plt.subplots(figsize=(8.5, 4.8))
for method_name, _, color in method_specs:
    rows = [r for r in comparison_metrics if r["method"] == method_name]
    rows = sorted(rows, key=lambda r: r["nfe"])
    ax.plot(
        [r["nfe"] for r in rows],
        [r["sw"] for r in rows],
        marker="o",
        label=method_name,
        color=color,
    )

ax.set_title("Sample quality vs NFE")
ax.set_xlabel("NFE")
ax.set_ylabel("Sliced W vs true ring")
ax.set_xticks(nfe_values)
ax.grid(alpha=0.25)
ax.legend(fontsize=8, ncol=2)
plt.tight_layout()
plt.show()

# One 1200-point noise batch shared by every panel of the sample grid.
plot_noise = sample_source(1200)
# (row label, sampler taking a step count, plot colour) per method.
grid_method_specs = [
    ("Coarse Euler", lambda nfe: euler_sample_velocity(v_teacher, plot_noise, n_steps=nfe), "C3"),
    ("A: Semigroup", lambda nfe: sample_flow_map(route_a_semigroup, plot_noise, n_steps=nfe), "C1"),
    ("A: LMD", lambda nfe: sample_flow_map(route_a_lmd, plot_noise, n_steps=nfe), "C5"),
    ("A: EMD", lambda nfe: sample_flow_map(route_a_emd, plot_noise, n_steps=nfe), "C6"),
    ("B: PSD", lambda nfe: sample_flow_map(route_b_psd, plot_noise, n_steps=nfe), "C0"),
    ("B: LSD", lambda nfe: sample_flow_map(route_b_lsd, plot_noise, n_steps=nfe), "C2"),
    ("B: ESD", lambda nfe: sample_flow_map(route_b_esd, plot_noise, n_steps=nfe), "C4"),
]

# Rows are methods, columns are step counts.
fig, axes = plt.subplots(len(grid_method_specs), len(nfe_values), figsize=(12, 18))
for row, (method_name, sampler, color) in enumerate(grid_method_specs):
    for col, nfe in enumerate(nfe_values):
        samples = sampler(nfe)
        # Every panel carries the same step-count title; the previous
        # row-dependent conditional had two identical branches.
        title = f"{nfe} NFE"
        plot_samples(samples, axes[row, col], title=title, color=color, s=4, alpha=0.35)
        if col == 0:
            # Label each row with its method name on the left-most panel.
            axes[row, col].set_ylabel(method_name, fontsize=10)

plt.tight_layout()
plt.show()

def reward(x: torch.Tensor, center=(4.0, 0.0), scale: float = 2.5):
    """Gaussian-bump reward in (0, 1] that peaks at ``center``.

    Returns ``exp(-0.5 * ||x - center||^2 / scale^2)`` for each row of ``x``.
    """
    peak = torch.as_tensor(center, dtype=x.dtype, device=x.device)
    sq_dist = (x - peak).pow(2).sum(dim=-1)
    return torch.exp(-0.5 * sq_dist / scale**2)


# Strength of the reward tilt: the code below weights samples by
# exp(TILT_BETA * reward(x)).
TILT_BETA = 4.0


@torch.no_grad()
def make_velocity_reference_endpoint_pool(ref_velocity: VelocityNet, n: int = 8000, n_steps: int = 250):
    """Generate ``n`` reference samples by Euler-integrating ``ref_velocity``.

    Fresh source noise is drawn and integrated with ``n_steps`` Euler steps.
    """
    noise = sample_source(n)
    return euler_sample_velocity(ref_velocity, noise, n_steps=n_steps)


@torch.no_grad()
def sample_target_from_reference_pool(pool: torch.Tensor, n: int, beta: float = TILT_BETA, seed: int = 0):
    """Resample ``n`` points from ``pool`` with reward-tilted weights.

    Each pool point is drawn (with replacement) with probability
    proportional to ``exp(beta * reward(point))``, using a private generator
    seeded with ``seed``.
    """
    unnormalized = torch.exp(beta * reward(pool))
    probs = unnormalized / unnormalized.sum()
    rng = torch.Generator().manual_seed(seed)
    picks = torch.multinomial(probs, n, replacement=True, generator=rng)
    return pool[picks]


def sample_ref_interpolant_batch(endpoint_pool: torch.Tensor, batch_size: int = 512):
    """Sample interpolant points whose data endpoint comes from ``endpoint_pool``.

    Endpoints ``x0`` are picked uniformly with replacement from the pool and
    paired with fresh noise ``eps``. Times satisfy ``t <= m <= s`` with ``s``
    in [0.05, 1), ``t`` in [0, s) and ``m`` 20%-80% of the way from ``t`` to
    ``s``.

    Returns:
        ``(x0, eps, x_s, xdot, s, m, t)`` where ``x_s`` is the linear
        interpolant ``(1 - s) * x0 + s * eps`` and ``xdot = eps - x0``.
    """
    # The draw order below fixes the random stream; do not reorder.
    picks = torch.randint(len(endpoint_pool), (batch_size,))
    x0 = endpoint_pool[picks]
    eps = sample_source(batch_size)
    s = 0.05 + 0.95 * torch.rand(batch_size)
    t = s * torch.rand(batch_size)
    fraction = 0.2 + 0.6 * torch.rand(batch_size)
    m = t + fraction * (s - t)

    s_col = s[:, None]
    x_s = (1 - s_col) * x0 + s_col * eps
    xdot = eps - x0
    return x0, eps, x_s, xdot, s, m, t


def itm_diagonal_loss(model: FlowMapNet, ref_velocity: VelocityNet, endpoint_pool: torch.Tensor, batch_size: int = 512):
    """Diagonal loss for the reward-tilted map.

    Regresses ``model.average_velocity(x_s, s, s)`` onto the gradient-free
    target ``v_ref + expm1(TILT_BETA * reward(x0)) * (xdot - v_sg)``, where
    ``v_ref`` is the reference velocity and ``v_sg`` is the model's own
    diagonal velocity re-evaluated without gradients.
    """
    x0, _, x_s, xdot, s, _, _ = sample_ref_interpolant_batch(endpoint_pool, batch_size)
    prediction = model.average_velocity(x_s, s, s)
    with torch.no_grad():
        v_ref = ref_velocity(x_s, s)
        # Stop-gradient copy of the model's current diagonal velocity.
        v_sg = model.average_velocity(x_s, s, s)
        tilt = torch.expm1(TILT_BETA * reward(x0))
        correction = tilt[:, None] * (xdot - v_sg)
        target = v_ref + correction
    return ((prediction - target) ** 2).mean()


def semigroup_loss_on_ref_pool(model: FlowMapNet, target_model: FlowMapNet, endpoint_pool: torch.Tensor, batch_size: int = 512):
    """Semigroup loss evaluated on interpolants built from ``endpoint_pool``.

    ``model``'s direct ``s -> t`` jump is matched to ``target_model``'s
    gradient-free two-jump path ``s -> m -> t``.
    """
    batch = sample_ref_interpolant_batch(endpoint_pool, batch_size)
    x_s, s, m, t = batch[2], batch[4], batch[5], batch[6]
    with torch.no_grad():
        midpoint = target_model(x_s, s, m)
        two_jump = target_model(midpoint, m, t)
    one_jump = model(x_s, s, t)
    return ((one_jump - two_jump) ** 2).mean()


def train_reward_tilted_flow_map(ref_velocity: VelocityNet, steps: int = 5000, batch_size: int = 512):
    """Train a reward-tilted ``FlowMapNet`` against a frozen reference velocity.

    ``ref_velocity``'s parameters have gradients disabled, a pool of
    reference endpoints is generated once, and each step minimises an equal
    mix of the ITM diagonal loss and the semigroup loss on that pool. The EMA
    target is updated after every optimiser step.

    Returns:
        ``(model, losses, endpoint_pool)``: the trained model in eval mode,
        the per-step loss values, and the reference endpoint pool.
    """
    model = FlowMapNet()
    target_model = make_ema_model(model)
    ref_velocity.requires_grad_(False)

    endpoint_pool = make_velocity_reference_endpoint_pool(ref_velocity)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    history = []
    progress = tqdm(range(steps), desc="reward-tilted flow map", disable=not SHOW_PROGRESS, leave=False)
    for _ in progress:
        itm_term = itm_diagonal_loss(model, ref_velocity, endpoint_pool, batch_size)
        semigroup_term = semigroup_loss_on_ref_pool(model, target_model, endpoint_pool, batch_size)
        loss = 0.5 * itm_term + 0.5 * semigroup_term

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        update_ema_model(target_model, model)
        history.append(loss.item())

    model.eval()
    return model, history, endpoint_pool


# Train the reward-tilted flow map from a fixed seed, using the frozen
# teacher as reference velocity.
torch.manual_seed(314)
tilted_flow_map, tilted_losses, reference_endpoint_pool = train_reward_tilted_flow_map(v_teacher)

# Evaluation sets: resampled tilted target, untilted teacher samples, and
# flow-map samples at several step counts, all from the shared noise batch.
target_tilt_samples = sample_target_from_reference_pool(reference_endpoint_pool, 5000, seed=2026)
base_samples = euler_sample_velocity(v_teacher, EVAL_NOISE, n_steps=250)
tilted_samples_by_nfe = {
    nfe: sample_flow_map(tilted_flow_map, EVAL_NOISE, n_steps=nfe)
    for nfe in [1, 2, 4, 8]
}


def evaluate_reward_sampler(name: str, samples: torch.Tensor):
    """Score ``samples`` by mean reward and distance to the tilted target.

    Reads the module-level ``target_tilt_samples`` and ``SLICED_W_SEED``.

    Returns:
        Dict with keys ``"name"``, ``"mean_reward"`` and
        ``"sliced_w_target"``.
    """
    avg_reward = reward(samples).mean().item()
    distance = sliced_wasserstein(samples, target_tilt_samples, seed=SLICED_W_SEED)
    return {"name": name, "mean_reward": avg_reward, "sliced_w_target": distance}


# Baseline rows: the untilted teacher and the resampled tilted target
# (the latter is scored against itself).
reward_evals = [
    {"name": "base velocity teacher", "nfe": "250", **evaluate_reward_sampler("base velocity teacher", base_samples)},
    {"name": "analytic tilted target", "nfe": "-", **evaluate_reward_sampler("analytic tilted target", target_tilt_samples)},
]

# One row per flow-map step count; "nfe" is stored as a string for display.
flow_map_reward_evals = []
for nfe, samples in tilted_samples_by_nfe.items():
    flow_map_reward_evals.append({
        "name": "one-stage ITM + semigroup flow map",
        "nfe": str(nfe),
        **evaluate_reward_sampler("one-stage ITM + semigroup flow map", samples),
    })
best_flow_map_sw = min(e["sliced_w_target"] for e in flow_map_reward_evals)
reward_evals.extend(flow_map_reward_evals)

# Bold only the best distance among the flow-map rows; baselines are never bolded.
display_markdown_table(
    ["Sampler", "NFE", "Mean reward ↑", "Sliced W to tilted target ↓"],
    [
        [
            e["name"],
            e["nfe"],
            f"{e['mean_reward']:.3f}",
            (
                f"**{e['sliced_w_target']:.4f}**"
                if e["name"].startswith("one-stage") and abs(e["sliced_w_target"] - best_flow_map_sw) < 1e-8
                else f"{e['sliced_w_target']:.4f}"
            ),
        ]
        for e in reward_evals
    ],
)

def plot_reward_heatmap(ax, n_grid: int = 80):
    """Draw the reward function as a heat-map on ``ax`` and return ``ax``.

    The reward is evaluated on the default ``make_grid`` window and shown
    over the square [-5, 5] x [-5, 5].
    """
    grid_pts = make_grid(n_grid=n_grid)[0]
    image = reward(grid_pts).reshape(n_grid, n_grid)
    # The grid is "ij"-indexed (x along rows), so transpose for imshow.
    ax.imshow(image.T.cpu().numpy(), origin="lower", extent=(-5, 5, -5, 5), cmap="magma")
    ax.set_xlim(-5, 5)
    ax.set_ylim(-5, 5)
    ax.set_aspect("equal")
    ax.set_title("reward region $r(x)$")
    return ax


# Pick the step count whose samples are closest to the tilted target.
best_nfe = min(
    tilted_samples_by_nfe,
    key=lambda nfe: sliced_wasserstein(tilted_samples_by_nfe[nfe], target_tilt_samples, seed=SLICED_W_SEED),
)
best_tilted_samples = tilted_samples_by_nfe[best_nfe]
# Evaluation rows store "nfe" as a string, hence the str() in the lookup.
best_eval = next(e for e in flow_map_reward_evals if e["nfe"] == str(best_nfe))

# Panels: reward map, untilted teacher, tilted target, best tilted flow map.
# reward_evals[0] and reward_evals[1] are the teacher and target rows.
fig, axes = plt.subplots(1, 4, figsize=(15.5, 3.8))
plot_reward_heatmap(axes[0])
plot_samples(base_samples[:2500], axes[1], title=f"base velocity teacher\nmean r={reward_evals[0]['mean_reward']:.2f}", color="C1")
plot_samples(target_tilt_samples[:2500], axes[2], title=f"tilted target\nmean r={reward_evals[1]['mean_reward']:.2f}", color="C3")
plot_samples(best_tilted_samples[:2500], axes[3], title=f"best tilted map ({best_nfe} NFE)\nmean r={best_eval['mean_reward']:.2f}", color="C2")
plt.tight_layout()
plt.show()

# One panel per step count for the tilted flow map.
fig, axes = plt.subplots(1, 4, figsize=(14, 3.6))
for ax, nfe in zip(axes, [1, 2, 4, 8]):
    samples = tilted_samples_by_nfe[nfe]
    eval_row = next(e for e in flow_map_reward_evals if e["nfe"] == str(nfe))
    plot_samples(samples[:2000], ax, title=f"tilted map\n{nfe} NFE, r={eval_row['mean_reward']:.2f}", color="C2")
plt.tight_layout()
plt.show()

Sampler	NFE	Sliced W vs true ring
Euler velocity (250 NFE)	250	0.2533
Euler velocity (8 NFE)	8	0.3226
Euler velocity (4 NFE)	4	0.3256
Euler velocity (2 NFE)	2	0.9366

Method	1 NFE	2 NFE	4 NFE	8 NFE
Coarse Euler	2.7306	0.9366	0.3256	0.3226
Route A: Semigroup	0.2838	0.2390	0.2371	0.2571
Route A: LMD	0.5596	0.3746	0.3312	0.3391
Route A: EMD	0.9457	0.3703	0.2554	0.2576
Route B: PSD	0.4244	0.2938	0.3231	0.3110
Route B: LSD	0.3695	0.3060	0.3651	0.3800
Route B: ESD	0.4797	0.3304	0.3487	0.3581

Sampler	NFE	Mean reward ↑	Sliced W to tilted target ↓
base velocity teacher	250	0.274	3.0995
analytic tilted target	-	0.829	0.0000
one-stage ITM + semigroup flow map	1	0.719	0.8203
one-stage ITM + semigroup flow map	2	0.700	0.9119
one-stage ITM + semigroup flow map	4	0.783	0.4831
one-stage ITM + semigroup flow map	8	0.815	0.2662

Flow Map 101

0. Setup

1. Start with a velocity field

2. A flow map predicts the integral, not the tangent

3. The three consistency ideas

1. Semigroup / compositional consistency

2. Lagrangian consistency via LMD

3. Eulerian consistency via EMD

4. Route A: pretrained teacher $\rightarrow$ flow map

5. Route B: direct/self-distilled flow map

6. Comparison

7. Extension: one-stage reward-tilted flow maps

8. Recap

Route A method	Uses full teacher trajectories?	Training signal
Semigroup	no	EMA target: two jumps match one jump
LMD	no	EMA target + one local teacher step at predicted endpoint
EMD	no	teacher-moved source + EMA target map

Route B method	Separate teacher?	Key mechanism
PSD	no	semigroup / progressive self-distillation
LSD	no	Lagrangian JVP: $\partial_t F$
ESD	no	Eulerian JVP: $\partial_s F + \nabla_xFv$

Quantity	Value
Velocity model parameters	41,090
Training steps	8,000