From ed8ed95be4a36fa88fad7ec835b1218faddf5fd5 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 13:43:50 +0100 Subject: [PATCH 01/27] Add benchmarking suite to linopy to track peformance --- .gitignore | 1 + benchmarks/README.md | 53 ++++++++++++ benchmarks/__init__.py | 1 + benchmarks/compare.py | 95 +++++++++++++++++++++ benchmarks/models/__init__.py | 34 ++++++++ benchmarks/models/basic.py | 21 +++++ benchmarks/models/knapsack.py | 26 ++++++ benchmarks/models/large_expr.py | 37 +++++++++ benchmarks/models/pypsa_scigrid.py | 26 ++++++ benchmarks/models/sparse.py | 61 ++++++++++++++ benchmarks/run.py | 128 +++++++++++++++++++++++++++++ benchmarks/runners/__init__.py | 21 +++++ benchmarks/runners/build.py | 57 +++++++++++++ benchmarks/runners/lp_write.py | 60 ++++++++++++++ benchmarks/runners/memory.py | 70 ++++++++++++++++ justfile | 34 ++++++++ 16 files changed, 725 insertions(+) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/compare.py create mode 100644 benchmarks/models/__init__.py create mode 100644 benchmarks/models/basic.py create mode 100644 benchmarks/models/knapsack.py create mode 100644 benchmarks/models/large_expr.py create mode 100644 benchmarks/models/pypsa_scigrid.py create mode 100644 benchmarks/models/sparse.py create mode 100644 benchmarks/run.py create mode 100644 benchmarks/runners/__init__.py create mode 100644 benchmarks/runners/build.py create mode 100644 benchmarks/runners/lp_write.py create mode 100644 benchmarks/runners/memory.py create mode 100644 justfile diff --git a/.gitignore b/.gitignore index 7b962a6b..263d41fe 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ ENV/ env.bak/ venv.bak/ +benchmarks/results/ benchmark/*.pdf benchmark/benchmarks benchmark/.snakemake diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..bdacd115 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,53 @@ +# Benchmarks + +Modular benchmark framework for linopy. All commands use [`just`](https://github.com/casey/just). + +## Quick Start + +```bash +# Install just (macOS) +brew install just + +# List available models and phases +just bench-list + +# Quick smoke test +just bench-quick + +# Full benchmark suite +just bench label="my-branch" + +# Single phase +just bench-build label="my-branch" +just bench-memory label="my-branch" +just bench-write label="my-branch" + +# Single model + phase +just bench-model basic build label="my-branch" + +# Compare two runs +just bench-compare benchmarks/results/old_basic_build.json benchmarks/results/new_basic_build.json +``` + +## Models + +| Name | Description | +|------|-------------| +| `basic` | 2×N² vars/cons — simple dense model | +| `knapsack` | N binary variables — integer programming | +| `pypsa_scigrid` | Real power system from PyPSA SciGrid-DE | +| `sparse` | Sparse ring network — exercises alignment | +| `large_expr` | Many-term expressions — stress test | + +## Phases + +| Name | Description | +|------|-------------| +| `build` | Model construction speed (time) | +| `memory` | Peak memory via tracemalloc | +| `lp_write` | LP file writing speed | + +## Output + +Results are saved as JSON files in `benchmarks/results/` (gitignored). 
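+Each file records the run label, model, phase and one entry per model size. For example, a `build` run on the `basic` model with the default label `dev` ends up in `dev_basic_build.json` and can be inspected like this (field names follow `benchmarks/runners/build.py`; the path is illustrative):
+
+```python
+import json
+
+with open("benchmarks/results/dev_basic_build.json") as f:
+    data = json.load(f)
+
+for run in data["runs"]:
+    print(run["params"], run["build_time_median_s"])
+```
+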
+Pattern: `{label}_{model}_{phase}.json` diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 00000000..b2d71789 --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Linopy benchmark framework.""" diff --git a/benchmarks/compare.py b/benchmarks/compare.py new file mode 100644 index 00000000..4dbe56ec --- /dev/null +++ b/benchmarks/compare.py @@ -0,0 +1,95 @@ +"""Compare two benchmark result JSON files and produce a plot.""" + +from __future__ import annotations + +import json +from pathlib import Path + + +def compare(old_path: str, new_path: str) -> None: + """Load two result JSONs and produce a comparison PNG.""" + import matplotlib.pyplot as plt + + with open(old_path) as f: + old = json.load(f) + with open(new_path) as f: + new = json.load(f) + + old_label = old.get("label", Path(old_path).stem) + new_label = new.get("label", Path(new_path).stem) + phase = old.get("phase", "unknown") + model_name = old.get("model", "unknown") + + old_runs = old.get("runs", []) + new_runs = new.get("runs", []) + + if not old_runs or not new_runs: + print("No runs to compare.") + return + + # Find the primary metric based on phase + metric_keys = { + "build": "build_time_median_s", + "memory": "peak_memory_median_mb", + "lp_write": "write_time_median_s", + } + metric = metric_keys.get(phase) + if metric is None: + # Try to auto-detect + for key in old_runs[0]: + if "median" in key: + metric = key + break + if metric is None: + print("Cannot determine metric to plot.") + return + + c_old, c_new = "#1b9e77", "#d95f02" + + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + # Panel 1: Absolute values + ax = axes[0] + x_old = list(range(len(old_runs))) + x_new = list(range(len(new_runs))) + y_old = [r.get(metric, 0) for r in old_runs] + y_new = [r.get(metric, 0) for r in new_runs] + labels_old = [str(r.get("params", {})) for r in old_runs] + + ax.plot(x_old, y_old, "o-", color=c_old, label=old_label, linewidth=2, markersize=8) + ax.plot( + x_new, y_new, "s--", color=c_new, label=new_label, linewidth=2, markersize=8 + ) + ax.set_xticks(x_old) + ax.set_xticklabels(labels_old, rotation=45, ha="right", fontsize=7) + ax.set_ylabel(metric) + ax.set_title(f"{model_name} / {phase}: {metric}") + ax.legend() + ax.grid(True, alpha=0.3) + + # Panel 2: Ratio (new / old) + ax = axes[1] + n_compare = min(len(old_runs), len(new_runs)) + ratios = [] + for i in range(n_compare): + vo = old_runs[i].get(metric, 0) + vn = new_runs[i].get(metric, 0) + ratios.append(vn / vo if vo > 0 else float("nan")) + + ax.bar(range(n_compare), ratios, color=c_new, alpha=0.7) + ax.axhline(1.0, color="k", linestyle="--", linewidth=1.5, alpha=0.6) + ax.set_xticks(range(n_compare)) + ax.set_xticklabels(labels_old[:n_compare], rotation=45, ha="right", fontsize=7) + ax.set_ylabel(f"Ratio ({new_label} / {old_label})") + ax.set_title("Relative performance") + ax.grid(True, alpha=0.3) + + fig.suptitle( + f"Benchmark Comparison: {model_name} / {phase}", fontsize=13, fontweight="bold" + ) + fig.tight_layout() + + out_png = Path(old_path).parent / f"compare_{model_name}_{phase}.png" + plt.savefig(out_png, dpi=150) + print(f"Saved: {out_png}") + plt.close() diff --git a/benchmarks/models/__init__.py b/benchmarks/models/__init__.py new file mode 100644 index 00000000..c9ce1393 --- /dev/null +++ b/benchmarks/models/__init__.py @@ -0,0 +1,34 @@ +"""Model registry for benchmarks.""" + +from __future__ import annotations + +import importlib +import pkgutil +from types import ModuleType + +_MODELS: dict[str, ModuleType] 
= {} + + +def _discover() -> None: + """Auto-discover model modules in this package.""" + if _MODELS: + return + package = importlib.import_module("benchmarks.models") + for info in pkgutil.iter_modules(package.__path__): + if info.name.startswith("_"): + continue + mod = importlib.import_module(f"benchmarks.models.{info.name}") + if hasattr(mod, "build") and hasattr(mod, "SIZES"): + _MODELS[info.name] = mod + + +def get_model(name: str) -> ModuleType: + """Return a model module by name.""" + _discover() + return _MODELS[name] + + +def list_models() -> list[str]: + """Return sorted list of available model names.""" + _discover() + return sorted(_MODELS) diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py new file mode 100644 index 00000000..f96abcec --- /dev/null +++ b/benchmarks/models/basic.py @@ -0,0 +1,21 @@ +"""Basic benchmark model: 2*N^2 variables and constraints.""" + +from __future__ import annotations + +import linopy + +LABEL = "basic N={n}" +SIZES = [{"n": n} for n in [5, 10, 25, 50, 100, 200, 500]] +QUICK_SIZES = [{"n": n} for n in [5, 10, 25]] +DESCRIPTION = "2*N^2 vars/cons — simple dense model" + + +def build(n: int) -> linopy.Model: + """Build a basic N×N model.""" + m = linopy.Model() + x = m.add_variables(coords=[range(n), range(n)], dims=["i", "j"], name="x") + y = m.add_variables(coords=[range(n), range(n)], dims=["i", "j"], name="y") + m.add_constraints(x + y <= 10, name="upper") + m.add_constraints(x - y >= -5, name="lower") + m.add_objective(x.sum() + 2 * y.sum()) + return m diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py new file mode 100644 index 00000000..38c3c189 --- /dev/null +++ b/benchmarks/models/knapsack.py @@ -0,0 +1,26 @@ +"""Knapsack benchmark model: N binary variables.""" + +from __future__ import annotations + +import numpy as np + +import linopy + +LABEL = "knapsack N={n}" +SIZES = [{"n": n} for n in [10, 50, 100, 500, 1000]] +QUICK_SIZES = [{"n": n} for n in [10, 50]] +DESCRIPTION = "N binary variables — integer programming stress test" + + +def build(n: int) -> linopy.Model: + """Build a knapsack model with N items.""" + rng = np.random.default_rng(42) + weights = rng.integers(1, 100, size=n) + values = rng.integers(1, 100, size=n) + capacity = int(weights.sum() * 0.5) + + m = linopy.Model() + x = m.add_variables(coords=[range(n)], dims=["item"], binary=True, name="x") + m.add_constraints((x * weights).sum() <= capacity, name="capacity") + m.add_objective(-(x * values).sum()) + return m diff --git a/benchmarks/models/large_expr.py b/benchmarks/models/large_expr.py new file mode 100644 index 00000000..b537e541 --- /dev/null +++ b/benchmarks/models/large_expr.py @@ -0,0 +1,37 @@ +"""Large expression benchmark: many-term expression stress test.""" + +from __future__ import annotations + +import linopy + +LABEL = "large_expr N={n_constraints} K={terms_per_constraint}" +SIZES = [ + {"n_constraints": 100, "terms_per_constraint": 10}, + {"n_constraints": 500, "terms_per_constraint": 50}, + {"n_constraints": 1000, "terms_per_constraint": 100}, +] +QUICK_SIZES = [ + {"n_constraints": 100, "terms_per_constraint": 10}, +] +DESCRIPTION = "N constraints each summing K variables — expression building stress test" + + +def build(n_constraints: int, terms_per_constraint: int) -> linopy.Model: + """Build a model with many-term expressions.""" + m = linopy.Model() + + # Create variables: one per (constraint, term) + x = m.add_variables( + lower=0, + coords=[range(n_constraints), range(terms_per_constraint)], + 
dims=["constraint", "term"], + name="x", + ) + + # Each constraint sums all terms for that constraint index + expr = x.sum("term") + m.add_constraints(expr <= 1, name="sum_limit") + + # Objective: sum everything + m.add_objective(x.sum()) + return m diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py new file mode 100644 index 00000000..64ab9754 --- /dev/null +++ b/benchmarks/models/pypsa_scigrid.py @@ -0,0 +1,26 @@ +"""PyPSA SciGrid-DE benchmark model.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import linopy + +LABEL = "pypsa snapshots={snapshots}" +SIZES = [{"snapshots": s} for s in [10, 50, 100, 200]] +QUICK_SIZES = [{"snapshots": s} for s in [10, 50]] +DESCRIPTION = "Real power system model from PyPSA SciGrid-DE" + + +def build(snapshots: int = 100) -> linopy.Model | None: + """Build PyPSA SciGrid model. Returns None if pypsa not installed.""" + try: + import pypsa + except ImportError: + return None + + n = pypsa.examples.scigrid_de() + n.set_snapshots(n.snapshots[:snapshots]) + n.optimize.create_model() + return n.model diff --git a/benchmarks/models/sparse.py b/benchmarks/models/sparse.py new file mode 100644 index 00000000..e73e6a54 --- /dev/null +++ b/benchmarks/models/sparse.py @@ -0,0 +1,61 @@ +"""Sparse topology benchmark: ring network with bus balance constraints.""" + +from __future__ import annotations + +import numpy as np + +import linopy + +LABEL = "sparse N={n_buses} T={n_time}" +SIZES = [ + {"n_buses": 20, "n_time": 24}, + {"n_buses": 50, "n_time": 50}, + {"n_buses": 100, "n_time": 100}, + {"n_buses": 200, "n_time": 200}, +] +QUICK_SIZES = [ + {"n_buses": 20, "n_time": 24}, + {"n_buses": 50, "n_time": 50}, +] +DESCRIPTION = "Sparse ring network — exercises outer-join alignment" + + +def build(n_buses: int, n_time: int) -> linopy.Model: + """ + Build a ring-topology network model. + + N buses connected in a ring, each with generation and demand. + Flow variables on each line connect adjacent buses. 
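+    For example, with n_buses=4 line 3 carries flow from bus 3 back to bus 0,
+    so every bus has exactly one incoming and one outgoing line.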
+ """ + m = linopy.Model() + + buses = range(n_buses) + time = range(n_time) + # Ring topology: line i connects bus i to bus (i+1) % n_buses + n_lines = n_buses + lines = range(n_lines) + + gen = m.add_variables( + lower=0, coords=[buses, time], dims=["bus", "time"], name="gen" + ) + flow = m.add_variables(coords=[lines, time], dims=["line", "time"], name="flow") + + # Flow capacity + m.add_constraints(flow <= 100, name="flow_upper") + m.add_constraints(flow >= -100, name="flow_lower") + + # Bus balance: gen[b] + inflow - outflow = demand[b] + rng = np.random.default_rng(42) + demand = rng.uniform(10, 50, size=(n_buses, n_time)) + + for b in buses: + # Lines into bus b: line (b-1) % n_buses flows into b + # Lines out of bus b: line b flows out of b + line_in = (b - 1) % n_buses + line_out = b + balance = gen.sel(bus=b) + flow.sel(line=line_in) - flow.sel(line=line_out) + m.add_constraints(balance == demand[b], name=f"balance_{b}") + + # Generation cost (sum over time first, then weight by bus cost) + m.add_objective(gen.sum("time")) + return m diff --git a/benchmarks/run.py b/benchmarks/run.py new file mode 100644 index 00000000..fb9046e9 --- /dev/null +++ b/benchmarks/run.py @@ -0,0 +1,128 @@ +"""Benchmark orchestrator — main entry point for running benchmarks.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from benchmarks.models import get_model, list_models +from benchmarks.runners import get_runner, list_phases + + +def run_single( + model_name: str, + phase: str, + label: str = "dev", + iterations: int = 30, + quick: bool = False, + output_dir: str = "benchmarks/results", +) -> dict: + """Run one model x one phase, save JSON, return results.""" + model_mod = get_model(model_name) + runner = get_runner(phase) + sizes = ( + model_mod.QUICK_SIZES + if quick and hasattr(model_mod, "QUICK_SIZES") + else model_mod.SIZES + ) + + results = { + "label": label, + "model": model_name, + "phase": phase, + "runs": [], + } + + for kwargs in sizes: + desc = model_mod.LABEL.format(**kwargs) + print(f" {desc} ... 
", end="", flush=True) + res = runner.run( + label=label, + builder=model_mod.build, + builder_args=kwargs, + iterations=iterations, + ) + if res is None: + print("skipped") + continue + results["runs"].append(res) + # Print a compact summary + summary_parts = [] + for key, val in res.items(): + if key in ("phase", "label", "params", "iterations"): + continue + if isinstance(val, float): + summary_parts.append(f"{key}={val:.3f}") + elif isinstance(val, int): + summary_parts.append(f"{key}={val}") + print(", ".join(summary_parts)) + + # Save + out_path = Path(output_dir) + out_path.mkdir(parents=True, exist_ok=True) + filename = out_path / f"{label}_{model_name}_{phase}.json" + with open(filename, "w") as f: + json.dump(results, f, indent=2) + print(f" Saved: {filename}") + return results + + +def run_phase( + phase: str, + label: str = "dev", + iterations: int = 30, + quick: bool = False, + output_dir: str = "benchmarks/results", +) -> list[dict]: + """Run all models for one phase.""" + all_results = [] + for model_name in list_models(): + print(f"\n[{phase}] Model: {model_name}") + res = run_single( + model_name, + phase, + label=label, + iterations=iterations, + quick=quick, + output_dir=output_dir, + ) + all_results.append(res) + return all_results + + +def run_all( + label: str = "dev", + iterations: int = 30, + quick: bool = False, + output_dir: str = "benchmarks/results", +) -> list[dict]: + """Run all phases x all models.""" + all_results = [] + for phase in list_phases(): + print(f"\n{'=' * 60}") + print(f"Phase: {phase}") + print(f"{'=' * 60}") + results = run_phase( + phase, + label=label, + iterations=iterations, + quick=quick, + output_dir=output_dir, + ) + all_results.extend(results) + return all_results + + +def list_available() -> None: + """Print available models and phases.""" + print("Models:") + for name in list_models(): + mod = get_model(name) + desc = getattr(mod, "DESCRIPTION", "") + print(f" {name:20s} {desc}") + + print("\nPhases:") + for phase in list_phases(): + runner = get_runner(phase) + doc = (runner.run.__doc__ or "").strip().split("\n")[0] + print(f" {phase:20s} {doc}") diff --git a/benchmarks/runners/__init__.py b/benchmarks/runners/__init__.py new file mode 100644 index 00000000..ae3530e3 --- /dev/null +++ b/benchmarks/runners/__init__.py @@ -0,0 +1,21 @@ +"""Runner registry for benchmarks.""" + +from __future__ import annotations + +from benchmarks.runners import build, lp_write, memory + +_RUNNERS = { + "build": build, + "memory": memory, + "lp_write": lp_write, +} + + +def get_runner(phase: str): + """Return a runner module by phase name.""" + return _RUNNERS[phase] + + +def list_phases() -> list[str]: + """Return sorted list of available phase names.""" + return sorted(_RUNNERS) diff --git a/benchmarks/runners/build.py b/benchmarks/runners/build.py new file mode 100644 index 00000000..e9436565 --- /dev/null +++ b/benchmarks/runners/build.py @@ -0,0 +1,57 @@ +"""Build runner: measures model construction speed.""" + +from __future__ import annotations + +import gc +import time + +import numpy as np + +PHASE = "build" + + +def run( + label: str, + builder, + builder_args: dict, + iterations: int = 30, + **kwargs, +) -> dict | None: + """ + Time model construction over multiple iterations. + + Returns dict with median, q25, q75 build times and model stats. 
+ """ + # Warmup + model = builder(**builder_args) + if model is None: + return None + del model + gc.collect() + + times = [] + nvars = 0 + ncons = 0 + + for _ in range(iterations): + gc.collect() + t0 = time.perf_counter() + model = builder(**builder_args) + elapsed = time.perf_counter() - t0 + times.append(elapsed) + nvars = int(getattr(model, "nvars", 0)) + ncons = int(getattr(model, "ncons", 0)) + del model + + times_arr = np.array(times) + return { + "phase": PHASE, + "label": label, + "params": builder_args, + "iterations": iterations, + "build_time_median_s": float(np.median(times_arr)), + "build_time_q25_s": float(np.percentile(times_arr, 25)), + "build_time_q75_s": float(np.percentile(times_arr, 75)), + "nvars": nvars, + "ncons": ncons, + } diff --git a/benchmarks/runners/lp_write.py b/benchmarks/runners/lp_write.py new file mode 100644 index 00000000..d0ee745f --- /dev/null +++ b/benchmarks/runners/lp_write.py @@ -0,0 +1,60 @@ +"""LP write runner: measures LP file writing speed.""" + +from __future__ import annotations + +import gc +import tempfile +import time +from pathlib import Path + +import numpy as np + +PHASE = "lp_write" + + +def run( + label: str, + builder, + builder_args: dict, + iterations: int = 10, + **kwargs, +) -> dict | None: + """ + Time LP file writing over multiple iterations. + + Builds the model once, then times repeated LP file writes. + Returns dict with median, q25, q75 write times. + """ + model = builder(**builder_args) + if model is None: + return None + + nvars = int(getattr(model, "nvars", 0)) + ncons = int(getattr(model, "ncons", 0)) + + times = [] + with tempfile.TemporaryDirectory() as tmpdir: + lp_path = Path(tmpdir) / "model.lp" + + # Warmup + model.to_file(lp_path) + + for _ in range(iterations): + gc.collect() + t0 = time.perf_counter() + model.to_file(lp_path) + elapsed = time.perf_counter() - t0 + times.append(elapsed) + + times_arr = np.array(times) + return { + "phase": PHASE, + "label": label, + "params": builder_args, + "iterations": iterations, + "write_time_median_s": float(np.median(times_arr)), + "write_time_q25_s": float(np.percentile(times_arr, 25)), + "write_time_q75_s": float(np.percentile(times_arr, 75)), + "nvars": nvars, + "ncons": ncons, + } diff --git a/benchmarks/runners/memory.py b/benchmarks/runners/memory.py new file mode 100644 index 00000000..15526684 --- /dev/null +++ b/benchmarks/runners/memory.py @@ -0,0 +1,70 @@ +"""Memory runner: measures peak memory during model construction.""" + +from __future__ import annotations + +import gc +import tracemalloc + +import numpy as np + +PHASE = "memory" + + +def run( + label: str, + builder, + builder_args: dict, + iterations: int = 5, + **kwargs, +) -> dict | None: + """ + Measure peak memory via tracemalloc over multiple iterations. + + Uses fewer iterations by default since memory measurement is slower. + Returns dict with median/max peak memory and model stats. 
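+    Peak values are reported in MB (bytes / 1e6) as seen by tracemalloc.
+
+    Example of calling the runner directly, with illustrative arguments::
+
+        from benchmarks.models import basic
+        from benchmarks.runners import memory
+
+        stats = memory.run(label="dev", builder=basic.build,
+                           builder_args={"n": 50}, iterations=3)
+        print(stats["peak_memory_median_mb"])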
+ """ + # Warmup + model = builder(**builder_args) + if model is None: + return None + del model + gc.collect() + + peaks = [] + nvars = 0 + ncons = 0 + + for _ in range(iterations): + gc.collect() + if tracemalloc.is_tracing(): + tracemalloc.stop() + tracemalloc.start() + tracemalloc.reset_peak() + + model = builder(**builder_args) + + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + if model is None: + continue + + nvars = int(getattr(model, "nvars", 0)) + ncons = int(getattr(model, "ncons", 0)) + peaks.append(peak / 1e6) # bytes to MB + del model + + if not peaks: + return None + + peaks_arr = np.array(peaks) + return { + "phase": PHASE, + "label": label, + "params": builder_args, + "iterations": iterations, + "peak_memory_median_mb": float(np.median(peaks_arr)), + "peak_memory_max_mb": float(np.max(peaks_arr)), + "nvars": nvars, + "ncons": ncons, + } diff --git a/justfile b/justfile new file mode 100644 index 00000000..1f06e3c6 --- /dev/null +++ b/justfile @@ -0,0 +1,34 @@ +default_iterations := "30" +results_dir := "benchmarks/results" + +# Run all phases for all models +bench label="dev" iterations=default_iterations: + python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" + +# Benchmark build phase only +bench-build label="dev" iterations=default_iterations: + python -c "from benchmarks.run import run_phase; run_phase('build', label='{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" + +# Benchmark memory phase only +bench-memory label="dev": + python -c "from benchmarks.run import run_phase; run_phase('memory', label='{{label}}', output_dir='{{results_dir}}')" + +# Benchmark LP write phase only +bench-write label="dev" iterations=default_iterations: + python -c "from benchmarks.run import run_phase; run_phase('lp_write', label='{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" + +# Run a single model + phase +bench-model model phase="memory" label="dev" iterations=default_iterations: + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" + +# Quick smoke test (small sizes, few iterations) +bench-quick label="dev": + python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" + +# Compare two result JSON files +bench-compare old new: + python -c "from benchmarks.compare import compare; compare('{{old}}', '{{new}}')" + +# List available models and phases +bench-list: + python -c "from benchmarks.run import list_available; list_available()" From 6884ec2e63a0890aad3624d4fb85daf39d8ff7c0 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 13:46:17 +0100 Subject: [PATCH 02/27] Exclude from build --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 52d5e3d5..e6e0ac7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,7 +99,7 @@ version_scheme = "no-guess-dev" [tool.pytest.ini_options] testpaths = ["test"] -norecursedirs = ["dev-scripts", "doc", "examples", "benchmark"] +norecursedirs = ["dev-scripts", "doc", "examples", "benchmark", "benchmarks"] markers = [ "gpu: marks tests as requiring GPU hardware (deselect with '-m \"not gpu\"')", ] @@ -112,7 +112,7 @@ omit = ["test/*"] exclude_also = ["if TYPE_CHECKING:"] [tool.mypy] -exclude = ['dev/*', 'examples/*', 
'benchmark/*', 'doc/*'] +exclude = ['dev/*', 'examples/*', 'benchmark/*', 'benchmarks/*', 'doc/*'] ignore_missing_imports = true no_implicit_optional = true warn_unused_ignores = true From 629f9dcf0b2309117d9d81e1b4c7eaf8a545bfd4 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 13:49:32 +0100 Subject: [PATCH 03/27] Improve plotting --- benchmarks/compare.py | 231 +++++++++++++++++++++++++++++++----------- justfile | 6 +- 2 files changed, 173 insertions(+), 64 deletions(-) diff --git a/benchmarks/compare.py b/benchmarks/compare.py index 4dbe56ec..ba5d5228 100644 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -1,95 +1,204 @@ -"""Compare two benchmark result JSON files and produce a plot.""" +"""Compare benchmark results across branches and produce plots.""" from __future__ import annotations import json from pathlib import Path +# Primary metric per phase +METRIC_KEYS = { + "build": "build_time_median_s", + "memory": "peak_memory_median_mb", + "lp_write": "write_time_median_s", +} + +# IQR band keys per phase (lower, upper) +IQR_KEYS = { + "build": ("build_time_q25_s", "build_time_q75_s"), + "memory": None, + "lp_write": ("write_time_q25_s", "write_time_q75_s"), +} + +COLORS = ["#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02"] +MARKERS = ["o", "s", "D", "^", "v", "P"] + + +def _load(path: str) -> dict: + with open(path) as f: + data = json.load(f) + data.setdefault("label", Path(path).stem) + return data + + +def _detect_metric(phase: str, runs: list[dict]) -> str | None: + metric = METRIC_KEYS.get(phase) + if metric and runs and metric in runs[0]: + return metric + # Fallback: first key containing "median" + if runs: + for key in runs[0]: + if "median" in key: + return key + return None + + +def _size_label(params: dict) -> str: + """Short human-readable label from params dict.""" + parts = [f"{k}={v}" for k, v in params.items()] + return ", ".join(parts) -def compare(old_path: str, new_path: str) -> None: - """Load two result JSONs and produce a comparison PNG.""" - import matplotlib.pyplot as plt - with open(old_path) as f: - old = json.load(f) - with open(new_path) as f: - new = json.load(f) +def _x_value(params: dict) -> float: + """Extract a numeric x-axis value from params (use product of all values).""" + vals = [v for v in params.values() if isinstance(v, int | float)] + result = 1 + for v in vals: + result *= v + return float(result) - old_label = old.get("label", Path(old_path).stem) - new_label = new.get("label", Path(new_path).stem) - phase = old.get("phase", "unknown") - model_name = old.get("model", "unknown") - old_runs = old.get("runs", []) - new_runs = new.get("runs", []) +def compare(*paths: str) -> None: + """ + Compare any number of result JSONs for the same model×phase. - if not old_runs or not new_runs: - print("No runs to compare.") + Produces a 2-panel plot: + Left: absolute metric vs model size, one line per branch + Right: ratio vs first file (baseline), one line per subsequent branch + + Args: + *paths: Two or more paths to benchmark JSON files. 
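+
+    Example (illustrative paths, following the {label}_{model}_{phase}.json
+    naming):
+        compare("benchmarks/results/main_basic_build.json",
+                "benchmarks/results/dev_basic_build.json")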
+ """ + if len(paths) < 2: + print("Need at least 2 files to compare.") return - # Find the primary metric based on phase - metric_keys = { - "build": "build_time_median_s", - "memory": "peak_memory_median_mb", - "lp_write": "write_time_median_s", - } - metric = metric_keys.get(phase) - if metric is None: - # Try to auto-detect - for key in old_runs[0]: - if "median" in key: - metric = key - break + import matplotlib.pyplot as plt + + datasets = [_load(p) for p in paths] + phase = datasets[0].get("phase", "unknown") + model_name = datasets[0].get("model", "unknown") + + # Validate all files are the same model×phase + for d in datasets[1:]: + if d.get("model") != model_name or d.get("phase") != phase: + print( + f"Warning: mixing model/phase — " + f"expected {model_name}/{phase}, " + f"got {d.get('model')}/{d.get('phase')}" + ) + + metric = _detect_metric(phase, datasets[0].get("runs", [])) if metric is None: print("Cannot determine metric to plot.") return - c_old, c_new = "#1b9e77", "#d95f02" + iqr = IQR_KEYS.get(phase) - fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + fig, axes = plt.subplots(1, 2, figsize=(14, 5.5)) - # Panel 1: Absolute values + # --- Panel 1: Absolute metric vs size --- ax = axes[0] - x_old = list(range(len(old_runs))) - x_new = list(range(len(new_runs))) - y_old = [r.get(metric, 0) for r in old_runs] - y_new = [r.get(metric, 0) for r in new_runs] - labels_old = [str(r.get("params", {})) for r in old_runs] - - ax.plot(x_old, y_old, "o-", color=c_old, label=old_label, linewidth=2, markersize=8) - ax.plot( - x_new, y_new, "s--", color=c_new, label=new_label, linewidth=2, markersize=8 - ) - ax.set_xticks(x_old) - ax.set_xticklabels(labels_old, rotation=45, ha="right", fontsize=7) + all_x_labels = [] + for i, data in enumerate(datasets): + runs = data.get("runs", []) + if not runs: + continue + color = COLORS[i % len(COLORS)] + marker = MARKERS[i % len(MARKERS)] + xs = list(range(len(runs))) + ys = [r.get(metric, 0) for r in runs] + + if i == 0: + all_x_labels = [_size_label(r.get("params", {})) for r in runs] + + ax.plot( + xs, + ys, + marker=marker, + color=color, + linewidth=2, + markersize=7, + alpha=0.85, + label=data["label"], + ) + + # IQR band if available + if iqr and runs[0].get(iqr[0]) is not None: + lo = [r.get(iqr[0], 0) for r in runs] + hi = [r.get(iqr[1], 0) for r in runs] + ax.fill_between(xs, lo, hi, color=color, alpha=0.15) + + ax.set_xticks(range(len(all_x_labels))) + ax.set_xticklabels(all_x_labels, rotation=45, ha="right", fontsize=7) ax.set_ylabel(metric) - ax.set_title(f"{model_name} / {phase}: {metric}") - ax.legend() + ax.set_title(f"{model_name} / {phase}") + ax.legend(fontsize=9) ax.grid(True, alpha=0.3) - # Panel 2: Ratio (new / old) + # --- Panel 2: Ratio vs baseline (first file) --- ax = axes[1] - n_compare = min(len(old_runs), len(new_runs)) - ratios = [] - for i in range(n_compare): - vo = old_runs[i].get(metric, 0) - vn = new_runs[i].get(metric, 0) - ratios.append(vn / vo if vo > 0 else float("nan")) - - ax.bar(range(n_compare), ratios, color=c_new, alpha=0.7) + baseline_runs = datasets[0].get("runs", []) + baseline_by_params = { + json.dumps(r["params"], sort_keys=True): r for r in baseline_runs + } + + for i, data in enumerate(datasets[1:], 1): + runs = data.get("runs", []) + if not runs: + continue + color = COLORS[i % len(COLORS)] + marker = MARKERS[i % len(MARKERS)] + + xs, ys, annots = [], [], [] + for j, r in enumerate(runs): + key = json.dumps(r["params"], sort_keys=True) + base = baseline_by_params.get(key) + if base is None: + 
continue + base_val = base.get(metric, 0) + cur_val = r.get(metric, 0) + ratio = cur_val / base_val if base_val > 0 else float("nan") + xs.append(j) + ys.append(ratio) + annots.append(f"{ratio:.2f}") + + ax.plot( + xs, + ys, + marker=marker, + color=color, + linewidth=2, + markersize=7, + alpha=0.85, + label=data["label"], + ) + for x, y, txt in zip(xs, ys, annots): + ax.annotate( + txt, + (x, y), + textcoords="offset points", + xytext=(0, 10), + ha="center", + fontsize=8, + color=color, + ) + ax.axhline(1.0, color="k", linestyle="--", linewidth=1.5, alpha=0.6) - ax.set_xticks(range(n_compare)) - ax.set_xticklabels(labels_old[:n_compare], rotation=45, ha="right", fontsize=7) - ax.set_ylabel(f"Ratio ({new_label} / {old_label})") - ax.set_title("Relative performance") + ax.set_xticks(range(len(all_x_labels))) + ax.set_xticklabels(all_x_labels, rotation=45, ha="right", fontsize=7) + ax.set_ylabel(f"Ratio (vs {datasets[0]['label']})") + ax.set_title("Relative to baseline") + ax.legend(fontsize=9) ax.grid(True, alpha=0.3) fig.suptitle( - f"Benchmark Comparison: {model_name} / {phase}", fontsize=13, fontweight="bold" + f"Benchmark: {model_name} / {phase}", + fontsize=13, + fontweight="bold", ) fig.tight_layout() - out_png = Path(old_path).parent / f"compare_{model_name}_{phase}.png" + out_png = Path(paths[0]).parent / f"compare_{model_name}_{phase}.png" plt.savefig(out_png, dpi=150) print(f"Saved: {out_png}") plt.close() diff --git a/justfile b/justfile index 1f06e3c6..6a2dd570 100644 --- a/justfile +++ b/justfile @@ -25,9 +25,9 @@ bench-model model phase="memory" label="dev" iterations=default_iterations: bench-quick label="dev": python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" -# Compare two result JSON files -bench-compare old new: - python -c "from benchmarks.compare import compare; compare('{{old}}', '{{new}}')" +# Compare result JSON files across branches (2 or more) +bench-compare +files: + python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} # List available models and phases bench-list: From cc40cd8e8664632852a22df0e453fe29d40f41b1 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 13:53:00 +0100 Subject: [PATCH 04/27] Remove pypsa from bench-quick --- justfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/justfile b/justfile index 6a2dd570..f2b007a7 100644 --- a/justfile +++ b/justfile @@ -21,9 +21,11 @@ bench-write label="dev" iterations=default_iterations: bench-model model phase="memory" label="dev" iterations=default_iterations: python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" -# Quick smoke test (small sizes, few iterations) +# Quick smoke test (basic model only, small sizes) bench-quick label="dev": - python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" + python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" + python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" + python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='{{label}}', iterations=5, quick=True, 
output_dir='{{results_dir}}')" # Compare result JSON files across branches (2 or more) bench-compare +files: From b825c801b819814a7a69a37f72c4d495084d5a57 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 13:57:46 +0100 Subject: [PATCH 05/27] Add command for benchmakr comparison --- justfile | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/justfile b/justfile index f2b007a7..1731dedf 100644 --- a/justfile +++ b/justfile @@ -27,6 +27,52 @@ bench-quick label="dev": python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" +# Benchmark a remote/local branch (checks it out, runs, returns) +# Usage: just bench-branch FBumann:perf/lp-write-speed-combined build +# just bench-branch origin/master memory +# just bench-branch my-local-branch lp_write +bench-branch ref phase="build" model="basic" iterations=default_iterations: + #!/usr/bin/env bash + set -euo pipefail + home_branch=$(git rev-parse --abbrev-ref HEAD) + tmp_bench=$(mktemp -d) + # Preserve benchmarks/ and results/ across checkout + cp -r benchmarks/ "$tmp_bench/benchmarks" + # Sanitize label: replace / and : with - + label=$(echo "{{ref}}" | tr '/:' '--') + # Handle remote refs like "FBumann:perf/lp-write-speed-combined" + ref="{{ref}}" + if [[ "$ref" == *:* ]]; then + remote="${ref%%:*}" + branch="${ref#*:}" + # Add remote if not present, fetch the branch + git remote get-url "$remote" 2>/dev/null || git remote add "$remote" "https://github.com/$remote/linopy.git" + git fetch "$remote" "$branch" + checkout_ref="$remote/$branch" + else + # Local branch or origin ref + git fetch --all 2>/dev/null || true + checkout_ref="$ref" + fi + echo ">>> Checking out $checkout_ref ..." + git checkout "$checkout_ref" --detach + # Restore benchmarks/ + cp -r "$tmp_bench/benchmarks" benchmarks/ + # Install the checked-out linopy + pip install -e . --quiet 2>/dev/null || true + echo ">>> Running: model={{model}} phase={{phase}} label=$label ..." + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='$label', iterations={{iterations}}, output_dir='{{results_dir}}')" + # Save results before switching back + cp -r benchmarks/results/ "$tmp_bench/results" + echo ">>> Returning to $home_branch ..." + git checkout "$home_branch" + # Restore results from the run + cp -r "$tmp_bench/results/"* benchmarks/results/ 2>/dev/null || true + rm -rf "$tmp_bench" + # Reinstall current branch + pip install -e . --quiet 2>/dev/null || true + echo ">>> Done. 
Results saved with label=$label" + # Compare result JSON files across branches (2 or more) bench-compare +files: python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} From 594345b450c00a2f3bf54ae393c286bf84e1ab35 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 14:54:40 +0100 Subject: [PATCH 06/27] Update justfile --- justfile | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/justfile b/justfile index 1731dedf..78c86acc 100644 --- a/justfile +++ b/justfile @@ -27,51 +27,60 @@ bench-quick label="dev": python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" -# Benchmark a remote/local branch (checks it out, runs, returns) -# Usage: just bench-branch FBumann:perf/lp-write-speed-combined build -# just bench-branch origin/master memory -# just bench-branch my-local-branch lp_write -bench-branch ref phase="build" model="basic" iterations=default_iterations: +# Benchmark a branch vs current: checkout ref, run bench-quick, return, run bench-quick here, compare +# Usage: just bench-branch FBumann:perf/lp-write-speed-combined +# just bench-branch origin/master +# just bench-branch my-local-branch +bench-branch ref: #!/usr/bin/env bash set -euo pipefail home_branch=$(git rev-parse --abbrev-ref HEAD) + home_label=$(echo "$home_branch" | tr '/:' '--') tmp_bench=$(mktemp -d) # Preserve benchmarks/ and results/ across checkout cp -r benchmarks/ "$tmp_bench/benchmarks" # Sanitize label: replace / and : with - - label=$(echo "{{ref}}" | tr '/:' '--') + ref_label=$(echo "{{ref}}" | tr '/:' '--') # Handle remote refs like "FBumann:perf/lp-write-speed-combined" ref="{{ref}}" if [[ "$ref" == *:* ]]; then remote="${ref%%:*}" branch="${ref#*:}" - # Add remote if not present, fetch the branch git remote get-url "$remote" 2>/dev/null || git remote add "$remote" "https://github.com/$remote/linopy.git" - git fetch "$remote" "$branch" - checkout_ref="$remote/$branch" + git fetch "$remote" "$branch" --no-tags --no-recurse-submodules 2>&1 || true + checkout_ref="FETCH_HEAD" else - # Local branch or origin ref - git fetch --all 2>/dev/null || true + git fetch origin --no-tags --no-recurse-submodules 2>&1 || true checkout_ref="$ref" fi echo ">>> Checking out $checkout_ref ..." git checkout "$checkout_ref" --detach - # Restore benchmarks/ cp -r "$tmp_bench/benchmarks" benchmarks/ - # Install the checked-out linopy pip install -e . --quiet 2>/dev/null || true - echo ">>> Running: model={{model}} phase={{phase}} label=$label ..." - python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='$label', iterations={{iterations}}, output_dir='{{results_dir}}')" + echo ">>> Running bench-quick on $ref_label ..." 
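+    # Mirrors bench-quick, but labels the results with the ref under test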
+ python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" + python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" + python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" # Save results before switching back cp -r benchmarks/results/ "$tmp_bench/results" echo ">>> Returning to $home_branch ..." git checkout "$home_branch" - # Restore results from the run cp -r "$tmp_bench/results/"* benchmarks/results/ 2>/dev/null || true rm -rf "$tmp_bench" - # Reinstall current branch pip install -e . --quiet 2>/dev/null || true - echo ">>> Done. Results saved with label=$label" + echo ">>> Running bench-quick on $home_label ..." + python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='$home_label', iterations=5, quick=True, output_dir='benchmarks/results')" + python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='$home_label', iterations=5, quick=True, output_dir='benchmarks/results')" + python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='$home_label', iterations=5, quick=True, output_dir='benchmarks/results')" + echo ">>> Comparing results ..." + for phase in build memory lp_write; do + old="benchmarks/results/${ref_label}_basic_${phase}.json" + new="benchmarks/results/${home_label}_basic_${phase}.json" + if [[ -f "$old" && -f "$new" ]]; then + python -c "from benchmarks.compare import compare; compare('$old', '$new')" + fi + done + echo ">>> Done." # Compare result JSON files across branches (2 or more) bench-compare +files: From d78c256822ce1092ebad16a72d1102e0e66b6054 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 14:56:06 +0100 Subject: [PATCH 07/27] Update justfile --- justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justfile b/justfile index 78c86acc..0a6cbfa8 100644 --- a/justfile +++ b/justfile @@ -54,7 +54,7 @@ bench-branch ref: checkout_ref="$ref" fi echo ">>> Checking out $checkout_ref ..." - git checkout "$checkout_ref" --detach + git checkout --detach "$checkout_ref" cp -r "$tmp_bench/benchmarks" benchmarks/ pip install -e . --quiet 2>/dev/null || true echo ">>> Running bench-quick on $ref_label ..." From 7b5625056deaeba151a061f39bb7a81759f32bd5 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 14:58:10 +0100 Subject: [PATCH 08/27] Update justfile --- justfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/justfile b/justfile index 0a6cbfa8..20b084ec 100644 --- a/justfile +++ b/justfile @@ -38,7 +38,7 @@ bench-branch ref: home_label=$(echo "$home_branch" | tr '/:' '--') tmp_bench=$(mktemp -d) # Preserve benchmarks/ and results/ across checkout - cp -r benchmarks/ "$tmp_bench/benchmarks" + cp -r benchmarks "$tmp_bench/benchmarks" # Sanitize label: replace / and : with - ref_label=$(echo "{{ref}}" | tr '/:' '--') # Handle remote refs like "FBumann:perf/lp-write-speed-combined" @@ -55,14 +55,15 @@ bench-branch ref: fi echo ">>> Checking out $checkout_ref ..." git checkout --detach "$checkout_ref" - cp -r "$tmp_bench/benchmarks" benchmarks/ + rm -rf benchmarks/ + cp -r "$tmp_bench/benchmarks" benchmarks pip install -e . 
--quiet 2>/dev/null || true echo ">>> Running bench-quick on $ref_label ..." python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" # Save results before switching back - cp -r benchmarks/results/ "$tmp_bench/results" + cp -r benchmarks/results "$tmp_bench/results" echo ">>> Returning to $home_branch ..." git checkout "$home_branch" cp -r "$tmp_bench/results/"* benchmarks/results/ 2>/dev/null || true From aa854758e17964d52aed4ca18ca4d1e1c589d155 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 14:59:26 +0100 Subject: [PATCH 09/27] Update justfile --- justfile | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/justfile b/justfile index 20b084ec..1b7b1357 100644 --- a/justfile +++ b/justfile @@ -36,12 +36,7 @@ bench-branch ref: set -euo pipefail home_branch=$(git rev-parse --abbrev-ref HEAD) home_label=$(echo "$home_branch" | tr '/:' '--') - tmp_bench=$(mktemp -d) - # Preserve benchmarks/ and results/ across checkout - cp -r benchmarks "$tmp_bench/benchmarks" - # Sanitize label: replace / and : with - ref_label=$(echo "{{ref}}" | tr '/:' '--') - # Handle remote refs like "FBumann:perf/lp-write-speed-combined" ref="{{ref}}" if [[ "$ref" == *:* ]]; then remote="${ref%%:*}" @@ -55,19 +50,13 @@ bench-branch ref: fi echo ">>> Checking out $checkout_ref ..." git checkout --detach "$checkout_ref" - rm -rf benchmarks/ - cp -r "$tmp_bench/benchmarks" benchmarks pip install -e . --quiet 2>/dev/null || true echo ">>> Running bench-quick on $ref_label ..." python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" - # Save results before switching back - cp -r benchmarks/results "$tmp_bench/results" echo ">>> Returning to $home_branch ..." git checkout "$home_branch" - cp -r "$tmp_bench/results/"* benchmarks/results/ 2>/dev/null || true - rm -rf "$tmp_bench" pip install -e . --quiet 2>/dev/null || true echo ">>> Running bench-quick on $home_label ..." 
python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='$home_label', iterations=5, quick=True, output_dir='benchmarks/results')" From 8e132a04766e5cabf90bb1790f72a25dd4b68d95 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:07:01 +0100 Subject: [PATCH 10/27] Update justfile --- justfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/justfile b/justfile index 1b7b1357..aeae1ac4 100644 --- a/justfile +++ b/justfile @@ -42,7 +42,7 @@ bench-branch ref: remote="${ref%%:*}" branch="${ref#*:}" git remote get-url "$remote" 2>/dev/null || git remote add "$remote" "https://github.com/$remote/linopy.git" - git fetch "$remote" "$branch" --no-tags --no-recurse-submodules 2>&1 || true + git fetch "$remote" "$branch" --no-tags --no-recurse-submodules checkout_ref="FETCH_HEAD" else git fetch origin --no-tags --no-recurse-submodules 2>&1 || true From 42bf1152c2aef26560525b6e520cec19a64f0bdd Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:12:01 +0100 Subject: [PATCH 11/27] Update the plotting --- benchmarks/compare.py | 277 +++++++++++++++++++++++------------------- 1 file changed, 151 insertions(+), 126 deletions(-) diff --git a/benchmarks/compare.py b/benchmarks/compare.py index ba5d5228..c22fe160 100644 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -5,21 +5,23 @@ import json from pathlib import Path -# Primary metric per phase -METRIC_KEYS = { - "build": "build_time_median_s", - "memory": "peak_memory_median_mb", - "lp_write": "write_time_median_s", +# Metric keys per phase: (median, q25, q75) +METRIC_KEYS: dict[str, tuple[str, str, str]] = { + "build": ("build_time_median_s", "build_time_q25_s", "build_time_q75_s"), + "memory": ("peak_memory_median_mb", "peak_memory_median_mb", "peak_memory_max_mb"), + "lp_write": ("write_time_median_s", "write_time_q25_s", "write_time_q75_s"), } -# IQR band keys per phase (lower, upper) -IQR_KEYS = { - "build": ("build_time_q25_s", "build_time_q75_s"), - "memory": None, - "lp_write": ("write_time_q25_s", "write_time_q75_s"), +METRIC_UNITS: dict[str, str] = { + "build": "Build time (ms)", + "memory": "Peak memory (MB)", + "lp_write": "Write time (ms)", } -COLORS = ["#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", "#e6ab02"] +# Phases where raw values are seconds → display in ms +MS_PHASES = {"build", "lp_write"} + +COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b"] MARKERS = ["o", "s", "D", "^", "v", "P"] @@ -30,43 +32,39 @@ def _load(path: str) -> dict: return data -def _detect_metric(phase: str, runs: list[dict]) -> str | None: - metric = METRIC_KEYS.get(phase) - if metric and runs and metric in runs[0]: - return metric - # Fallback: first key containing "median" - if runs: - for key in runs[0]: - if "median" in key: - return key - return None +def _extract( + runs: list[dict], phase: str +) -> tuple[list[int], list[float], list[float], list[float]]: + """Extract nvars, median, lo, hi from runs. 
Convert to ms where needed.""" + keys = METRIC_KEYS.get(phase) + if not keys or not runs: + return [], [], [], [] + med_key, lo_key, hi_key = keys + scale = 1000.0 if phase in MS_PHASES else 1.0 -def _size_label(params: dict) -> str: - """Short human-readable label from params dict.""" - parts = [f"{k}={v}" for k, v in params.items()] - return ", ".join(parts) + nvars = [r["nvars"] for r in runs] + med = [r[med_key] * scale for r in runs] + lo = [r.get(lo_key, r[med_key]) * scale for r in runs] + hi = [r.get(hi_key, r[med_key]) * scale for r in runs] + return nvars, med, lo, hi -def _x_value(params: dict) -> float: - """Extract a numeric x-axis value from params (use product of all values).""" - vals = [v for v in params.values() if isinstance(v, int | float)] - result = 1 - for v in vals: - result *= v - return float(result) +def _plot_errorbar(ax, nvars, med, lo, hi, **kwargs): + yerr_lo = [m - l for m, l in zip(med, lo)] + yerr_hi = [h - m for m, h in zip(med, hi)] + ax.errorbar(nvars, med, yerr=[yerr_lo, yerr_hi], capsize=3, **kwargs) def compare(*paths: str) -> None: """ - Compare any number of result JSONs for the same model×phase. - - Produces a 2-panel plot: - Left: absolute metric vs model size, one line per branch - Right: ratio vs first file (baseline), one line per subsequent branch + Compare any number of result JSONs for the same model x phase. - Args: - *paths: Two or more paths to benchmark JSON files. + Produces a 4-panel plot: + Top-left: Log-log overview with error bars + Top-right: Speedup ratio vs baseline with uncertainty bounds + Bottom-left: Small models (linear scale) + Bottom-right: Large models (log scale) """ if len(paths) < 2: print("Need at least 2 files to compare.") @@ -77,8 +75,8 @@ def compare(*paths: str) -> None: datasets = [_load(p) for p in paths] phase = datasets[0].get("phase", "unknown") model_name = datasets[0].get("model", "unknown") + ylabel = METRIC_UNITS.get(phase, phase) - # Validate all files are the same model×phase for d in datasets[1:]: if d.get("model") != model_name or d.get("phase") != phase: print( @@ -87,118 +85,145 @@ def compare(*paths: str) -> None: f"got {d.get('model')}/{d.get('phase')}" ) - metric = _detect_metric(phase, datasets[0].get("runs", [])) - if metric is None: - print("Cannot determine metric to plot.") - return + # Extract stats for each dataset + all_stats = [] + for d in datasets: + nvars, med, lo, hi = _extract(d.get("runs", []), phase) + all_stats.append((d["label"], nvars, med, lo, hi)) - iqr = IQR_KEYS.get(phase) + if not all_stats[0][1]: + print("No data to plot.") + return - fig, axes = plt.subplots(1, 2, figsize=(14, 5.5)) + labels = [s[0] for s in all_stats] + fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + fig.suptitle( + f"Benchmark: {model_name} / {phase}\n{' vs '.join(labels)}", + fontsize=14, + ) - # --- Panel 1: Absolute metric vs size --- - ax = axes[0] - all_x_labels = [] - for i, data in enumerate(datasets): - runs = data.get("runs", []) - if not runs: - continue + # --- Panel 1: All data, log-log --- + ax = axes[0, 0] + for i, (label, nvars, med, lo, hi) in enumerate(all_stats): color = COLORS[i % len(COLORS)] marker = MARKERS[i % len(MARKERS)] - xs = list(range(len(runs))) - ys = [r.get(metric, 0) for r in runs] - - if i == 0: - all_x_labels = [_size_label(r.get("params", {})) for r in runs] - - ax.plot( - xs, - ys, + ls = "--" if i == 0 else "-" + _plot_errorbar( + ax, + nvars, + med, + lo, + hi, marker=marker, color=color, - linewidth=2, - markersize=7, - alpha=0.85, - label=data["label"], + 
linestyle=ls, + label=label, + alpha=0.8, ) - - # IQR band if available - if iqr and runs[0].get(iqr[0]) is not None: - lo = [r.get(iqr[0], 0) for r in runs] - hi = [r.get(iqr[1], 0) for r in runs] - ax.fill_between(xs, lo, hi, color=color, alpha=0.15) - - ax.set_xticks(range(len(all_x_labels))) - ax.set_xticklabels(all_x_labels, rotation=45, ha="right", fontsize=7) - ax.set_ylabel(metric) - ax.set_title(f"{model_name} / {phase}") + ax.set_xscale("log") + ax.set_yscale("log") + ax.set_xlabel("Number of variables") + ax.set_ylabel(ylabel) + ax.set_title("Overview (log-log)") ax.legend(fontsize=9) ax.grid(True, alpha=0.3) - # --- Panel 2: Ratio vs baseline (first file) --- - ax = axes[1] - baseline_runs = datasets[0].get("runs", []) - baseline_by_params = { - json.dumps(r["params"], sort_keys=True): r for r in baseline_runs - } - - for i, data in enumerate(datasets[1:], 1): - runs = data.get("runs", []) - if not runs: + # --- Panel 2: Speedup ratio with uncertainty bounds --- + ax = axes[0, 1] + base_label, base_nv, base_med, base_lo, base_hi = all_stats[0] + for i, (label, nvars, med, lo, hi) in enumerate(all_stats[1:], 1): + if len(nvars) != len(base_nv): continue color = COLORS[i % len(COLORS)] - marker = MARKERS[i % len(MARKERS)] - - xs, ys, annots = [], [], [] - for j, r in enumerate(runs): - key = json.dumps(r["params"], sort_keys=True) - base = baseline_by_params.get(key) - if base is None: - continue - base_val = base.get(metric, 0) - cur_val = r.get(metric, 0) - ratio = cur_val / base_val if base_val > 0 else float("nan") - xs.append(j) - ys.append(ratio) - annots.append(f"{ratio:.2f}") - - ax.plot( - xs, - ys, - marker=marker, + # Ratio: baseline / current (>1 means current is faster) + ratio = [b / c if c > 0 else float("nan") for b, c in zip(base_med, med)] + # Uncertainty: best = base_hi/lo_cur, worst = base_lo/hi_cur + ratio_lo = [bl / ch if ch > 0 else float("nan") for bl, ch in zip(base_lo, hi)] + ratio_hi = [bh / cl if cl > 0 else float("nan") for bh, cl in zip(base_hi, lo)] + yerr_lo = [r - rl for r, rl in zip(ratio, ratio_lo)] + yerr_hi = [rh - r for r, rh in zip(ratio, ratio_hi)] + ax.errorbar( + nvars, + ratio, + yerr=[yerr_lo, yerr_hi], + marker=MARKERS[i % len(MARKERS)], color=color, - linewidth=2, - markersize=7, - alpha=0.85, - label=data["label"], + capsize=3, + label=label, ) - for x, y, txt in zip(xs, ys, annots): + ax.fill_between(nvars, ratio_lo, ratio_hi, alpha=0.15, color=color) + for x, r in zip(nvars, ratio): ax.annotate( - txt, - (x, y), + f"{r:.2f}", + (x, r), textcoords="offset points", xytext=(0, 10), ha="center", fontsize=8, color=color, ) - - ax.axhline(1.0, color="k", linestyle="--", linewidth=1.5, alpha=0.6) - ax.set_xticks(range(len(all_x_labels))) - ax.set_xticklabels(all_x_labels, rotation=45, ha="right", fontsize=7) - ax.set_ylabel(f"Ratio (vs {datasets[0]['label']})") - ax.set_title("Relative to baseline") + ax.axhline(1.0, color="gray", linestyle="--", alpha=0.5) + ax.set_xscale("log") + ax.set_xlabel("Number of variables") + ax.set_ylabel(f"Speedup ({base_label} / other)") + ax.set_title("Relative performance") ax.legend(fontsize=9) ax.grid(True, alpha=0.3) - fig.suptitle( - f"Benchmark: {model_name} / {phase}", - fontsize=13, - fontweight="bold", - ) - fig.tight_layout() + # --- Panels 3 & 4: Small vs large models --- + cutoff = 25000 + + for panel_idx, (title, filt, use_log) in enumerate( + [ + (f"Small models (≤ {cutoff:,} vars)", lambda n: n <= cutoff, False), + (f"Large models (> {cutoff:,} vars)", lambda n: n > cutoff, True), + ] + ): + 
ax = axes[1, panel_idx] + has_data = False + for i, (label, nvars, med, lo, hi) in enumerate(all_stats): + idx = [j for j, n in enumerate(nvars) if filt(n)] + if not idx: + continue + has_data = True + color = COLORS[i % len(COLORS)] + marker = MARKERS[i % len(MARKERS)] + ls = "--" if i == 0 else "-" + _plot_errorbar( + ax, + [nvars[j] for j in idx], + [med[j] for j in idx], + [lo[j] for j in idx], + [hi[j] for j in idx], + marker=marker, + color=color, + linestyle=ls, + label=label, + alpha=0.8, + ) + if use_log and has_data: + ax.set_xscale("log") + if not use_log: + ax.set_ylim(bottom=0) + ax.set_xlabel("Number of variables") + ax.set_ylabel(ylabel) + ax.set_title(title) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + if not has_data: + ax.text( + 0.5, + 0.5, + "No data", + ha="center", + va="center", + transform=ax.transAxes, + fontsize=12, + color="gray", + ) + plt.tight_layout() out_png = Path(paths[0]).parent / f"compare_{model_name}_{phase}.png" - plt.savefig(out_png, dpi=150) + plt.savefig(out_png, dpi=150, bbox_inches="tight") print(f"Saved: {out_png}") plt.close() From db990b99b0c136ba9e6f673a9f255af16a7549e4 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:20:01 +0100 Subject: [PATCH 12/27] Improve just file --- justfile | 75 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/justfile b/justfile index aeae1ac4..c797db47 100644 --- a/justfile +++ b/justfile @@ -1,42 +1,44 @@ -default_iterations := "30" +default_iterations := "10" results_dir := "benchmarks/results" # Run all phases for all models bench label="dev" iterations=default_iterations: python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" -# Benchmark build phase only -bench-build label="dev" iterations=default_iterations: - python -c "from benchmarks.run import run_phase; run_phase('build', label='{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" - -# Benchmark memory phase only -bench-memory label="dev": - python -c "from benchmarks.run import run_phase; run_phase('memory', label='{{label}}', output_dir='{{results_dir}}')" - -# Benchmark LP write phase only -bench-write label="dev" iterations=default_iterations: - python -c "from benchmarks.run import run_phase; run_phase('lp_write', label='{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" - # Run a single model + phase -bench-model model phase="memory" label="dev" iterations=default_iterations: - python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" +bench-model model phase="build" label="dev" iterations=default_iterations quick="True": + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" -# Quick smoke test (basic model only, small sizes) +# Quick smoke test (basic model, all phases, small sizes) bench-quick label="dev": - python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" - python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" - python -c "from benchmarks.run import run_single; 
run_single('basic', 'lp_write', label='{{label}}', iterations=5, quick=True, output_dir='{{results_dir}}')" + just bench-run basic build {{label}} 5 True + just bench-run basic memory {{label}} 5 True + just bench-run basic lp_write {{label}} 5 True + +# Internal: run a single model+phase (used by other recipes) +[private] +bench-run model phase label iterations quick: + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" -# Benchmark a branch vs current: checkout ref, run bench-quick, return, run bench-quick here, compare +# Benchmark a branch vs current, then compare # Usage: just bench-branch FBumann:perf/lp-write-speed-combined -# just bench-branch origin/master -# just bench-branch my-local-branch -bench-branch ref: +# just bench-branch origin/master model=pypsa_scigrid phase=lp_write +# just bench-branch my-branch iterations=20 quick=false +bench-branch ref model="basic" phase="all" iterations=default_iterations quick="True": #!/usr/bin/env bash set -euo pipefail home_branch=$(git rev-parse --abbrev-ref HEAD) home_label=$(echo "$home_branch" | tr '/:' '--') ref_label=$(echo "{{ref}}" | tr '/:' '--') + + # Determine phases to run + if [[ "{{phase}}" == "all" ]]; then + phases="build memory lp_write" + else + phases="{{phase}}" + fi + + # Fetch and checkout target ref ref="{{ref}}" if [[ "$ref" == *:* ]]; then remote="${ref%%:*}" @@ -48,24 +50,29 @@ bench-branch ref: git fetch origin --no-tags --no-recurse-submodules 2>&1 || true checkout_ref="$ref" fi + echo ">>> Checking out $checkout_ref ..." git checkout --detach "$checkout_ref" pip install -e . --quiet 2>/dev/null || true - echo ">>> Running bench-quick on $ref_label ..." - python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" - python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" - python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='$ref_label', iterations=5, quick=True, output_dir='benchmarks/results')" + + echo ">>> Benchmarking $ref_label (model={{model}}, phases=$phases, quick={{quick}}) ..." + for phase in $phases; do + just bench-run "{{model}}" "$phase" "$ref_label" "{{iterations}}" "{{quick}}" + done + echo ">>> Returning to $home_branch ..." git checkout "$home_branch" pip install -e . --quiet 2>/dev/null || true - echo ">>> Running bench-quick on $home_label ..." - python -c "from benchmarks.run import run_single; run_single('basic', 'build', label='$home_label', iterations=5, quick=True, output_dir='benchmarks/results')" - python -c "from benchmarks.run import run_single; run_single('basic', 'memory', label='$home_label', iterations=5, quick=True, output_dir='benchmarks/results')" - python -c "from benchmarks.run import run_single; run_single('basic', 'lp_write', label='$home_label', iterations=5, quick=True, output_dir='benchmarks/results')" + + echo ">>> Benchmarking $home_label (model={{model}}, phases=$phases, quick={{quick}}) ..." + for phase in $phases; do + just bench-run "{{model}}" "$phase" "$home_label" "{{iterations}}" "{{quick}}" + done + echo ">>> Comparing results ..." 
- for phase in build memory lp_write; do - old="benchmarks/results/${ref_label}_basic_${phase}.json" - new="benchmarks/results/${home_label}_basic_${phase}.json" + for phase in $phases; do + old="benchmarks/results/${ref_label}_{{model}}_${phase}.json" + new="benchmarks/results/${home_label}_{{model}}_${phase}.json" if [[ -f "$old" && -f "$new" ]]; then python -c "from benchmarks.compare import compare; compare('$old', '$new')" fi From 4178eb3aa06f6625fb349ebddae50d9916cd3fbb Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:22:20 +0100 Subject: [PATCH 13/27] Improve just file --- justfile | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/justfile b/justfile index c797db47..8d3e1146 100644 --- a/justfile +++ b/justfile @@ -1,29 +1,28 @@ default_iterations := "10" results_dir := "benchmarks/results" +[group: 'run'] # Run all phases for all models bench label="dev" iterations=default_iterations: python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" +[group: 'run'] # Run a single model + phase bench-model model phase="build" label="dev" iterations=default_iterations quick="True": python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" +[group: 'run'] # Quick smoke test (basic model, all phases, small sizes) bench-quick label="dev": just bench-run basic build {{label}} 5 True just bench-run basic memory {{label}} 5 True just bench-run basic lp_write {{label}} 5 True -# Internal: run a single model+phase (used by other recipes) -[private] -bench-run model phase label iterations quick: - python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" # Benchmark a branch vs current, then compare # Usage: just bench-branch FBumann:perf/lp-write-speed-combined -# just bench-branch origin/master model=pypsa_scigrid phase=lp_write -# just bench-branch my-branch iterations=20 quick=false +# just bench-branch origin/master pypsa_scigrid lp_write 20 False +[group: 'compare'] bench-branch ref model="basic" phase="all" iterations=default_iterations quick="True": #!/usr/bin/env bash set -euo pipefail @@ -79,10 +78,16 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" done echo ">>> Done." 
-# Compare result JSON files across branches (2 or more) +# Compare result JSON files manually (2 or more) +[group: 'compare'] bench-compare +files: python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} # List available models and phases +[group: 'info'] bench-list: python -c "from benchmarks.run import list_available; list_available()" + +[private] +bench-run model phase label iterations quick: + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" From 9d4d0311f3b6a9d60ed3d0e9c9370266e482127f Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:22:48 +0100 Subject: [PATCH 14/27] Improve just file --- justfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/justfile b/justfile index 8d3e1146..e5afbd03 100644 --- a/justfile +++ b/justfile @@ -1,17 +1,17 @@ default_iterations := "10" results_dir := "benchmarks/results" -[group: 'run'] +[group('run')] # Run all phases for all models bench label="dev" iterations=default_iterations: python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" -[group: 'run'] +[group('run')] # Run a single model + phase bench-model model phase="build" label="dev" iterations=default_iterations quick="True": python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" -[group: 'run'] +[group('run')] # Quick smoke test (basic model, all phases, small sizes) bench-quick label="dev": just bench-run basic build {{label}} 5 True @@ -22,7 +22,7 @@ bench-quick label="dev": # Benchmark a branch vs current, then compare # Usage: just bench-branch FBumann:perf/lp-write-speed-combined # just bench-branch origin/master pypsa_scigrid lp_write 20 False -[group: 'compare'] +[group('compare')] bench-branch ref model="basic" phase="all" iterations=default_iterations quick="True": #!/usr/bin/env bash set -euo pipefail @@ -79,12 +79,12 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" echo ">>> Done." 
# Compare result JSON files manually (2 or more) -[group: 'compare'] +[group('compare')] bench-compare +files: python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} # List available models and phases -[group: 'info'] +[group('info')] bench-list: python -c "from benchmarks.run import list_available; list_available()" From dd3e24fc09299a6c5d34dda2764078f70261e725 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:25:20 +0100 Subject: [PATCH 15/27] Improve just file --- justfile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/justfile b/justfile index e5afbd03..36b2a231 100644 --- a/justfile +++ b/justfile @@ -18,11 +18,10 @@ bench-quick label="dev": just bench-run basic memory {{label}} 5 True just bench-run basic lp_write {{label}} 5 True - +[group('compare')] # Benchmark a branch vs current, then compare # Usage: just bench-branch FBumann:perf/lp-write-speed-combined # just bench-branch origin/master pypsa_scigrid lp_write 20 False -[group('compare')] bench-branch ref model="basic" phase="all" iterations=default_iterations quick="True": #!/usr/bin/env bash set -euo pipefail @@ -78,13 +77,13 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" done echo ">>> Done." -# Compare result JSON files manually (2 or more) [group('compare')] +# Compare result JSON files manually (2 or more) bench-compare +files: python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} -# List available models and phases [group('info')] +# List available models and phases bench-list: python -c "from benchmarks.run import list_available; list_available()" From 4f0bec3c9c93ebf797883b984c18e7e5d5f08312 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:28:11 +0100 Subject: [PATCH 16/27] Improve just file --- justfile | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/justfile b/justfile index 36b2a231..5de37bbd 100644 --- a/justfile +++ b/justfile @@ -2,26 +2,20 @@ default_iterations := "10" results_dir := "benchmarks/results" [group('run')] -# Run all phases for all models bench label="dev" iterations=default_iterations: python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" [group('run')] -# Run a single model + phase bench-model model phase="build" label="dev" iterations=default_iterations quick="True": python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" [group('run')] -# Quick smoke test (basic model, all phases, small sizes) bench-quick label="dev": just bench-run basic build {{label}} 5 True just bench-run basic memory {{label}} 5 True just bench-run basic lp_write {{label}} 5 True [group('compare')] -# Benchmark a branch vs current, then compare -# Usage: just bench-branch FBumann:perf/lp-write-speed-combined -# just bench-branch origin/master pypsa_scigrid lp_write 20 False bench-branch ref model="basic" phase="all" iterations=default_iterations quick="True": #!/usr/bin/env bash set -euo pipefail @@ -29,14 +23,12 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" home_label=$(echo "$home_branch" | tr '/:' '--') ref_label=$(echo "{{ref}}" | tr '/:' '--') - # Determine phases to run if [[ "{{phase}}" == "all" ]]; then 
phases="build memory lp_write" else phases="{{phase}}" fi - # Fetch and checkout target ref ref="{{ref}}" if [[ "$ref" == *:* ]]; then remote="${ref%%:*}" @@ -78,12 +70,10 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" echo ">>> Done." [group('compare')] -# Compare result JSON files manually (2 or more) bench-compare +files: python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} [group('info')] -# List available models and phases bench-list: python -c "from benchmarks.run import list_available; list_available()" From 2522f7a59db65c09644a5936c7257ff1d7643df0 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:33:41 +0100 Subject: [PATCH 17/27] Improve just file --- justfile | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/justfile b/justfile index 5de37bbd..1145ed1d 100644 --- a/justfile +++ b/justfile @@ -1,22 +1,22 @@ default_iterations := "10" results_dir := "benchmarks/results" -[group('run')] -bench label="dev" iterations=default_iterations: +[group('benchmark')] +all label="dev" iterations=default_iterations: python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" -[group('run')] -bench-model model phase="build" label="dev" iterations=default_iterations quick="True": +[group('benchmark')] +model model phase="build" label="dev" iterations=default_iterations quick="True": python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" -[group('run')] -bench-quick label="dev": - just bench-run basic build {{label}} 5 True - just bench-run basic memory {{label}} 5 True - just bench-run basic lp_write {{label}} 5 True +[group('benchmark')] +quick label="dev": + just _run basic build {{label}} 5 True + just _run basic memory {{label}} 5 True + just _run basic lp_write {{label}} 5 True -[group('compare')] -bench-branch ref model="basic" phase="all" iterations=default_iterations quick="True": +[group('benchmark')] +compare ref model="basic" phase="all" iterations=default_iterations quick="True": #!/usr/bin/env bash set -euo pipefail home_branch=$(git rev-parse --abbrev-ref HEAD) @@ -47,7 +47,7 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" echo ">>> Benchmarking $ref_label (model={{model}}, phases=$phases, quick={{quick}}) ..." for phase in $phases; do - just bench-run "{{model}}" "$phase" "$ref_label" "{{iterations}}" "{{quick}}" + just _run "{{model}}" "$phase" "$ref_label" "{{iterations}}" "{{quick}}" done echo ">>> Returning to $home_branch ..." @@ -56,7 +56,7 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" echo ">>> Benchmarking $home_label (model={{model}}, phases=$phases, quick={{quick}}) ..." for phase in $phases; do - just bench-run "{{model}}" "$phase" "$home_label" "{{iterations}}" "{{quick}}" + just _run "{{model}}" "$phase" "$home_label" "{{iterations}}" "{{quick}}" done echo ">>> Comparing results ..." @@ -69,14 +69,14 @@ bench-branch ref model="basic" phase="all" iterations=default_iterations quick=" done echo ">>> Done." 
-[group('compare')] -bench-compare +files: +[group('benchmark')] +plot +files: python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} -[group('info')] -bench-list: +[group('benchmark')] +list: python -c "from benchmarks.run import list_available; list_available()" [private] -bench-run model phase label iterations quick: +_run model phase label iterations quick: python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" From 42ddf8b4a418036d82c713a184ac14a998222a67 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:34:20 +0100 Subject: [PATCH 18/27] Improve README.md --- benchmarks/README.md | 65 ++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index bdacd115..fe05019d 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -2,52 +2,45 @@ Modular benchmark framework for linopy. All commands use [`just`](https://github.com/casey/just). -## Quick Start +``` +$ just --list +Available recipes: + [benchmark] + all label="dev" iterations=default_iterations + compare ref model="basic" phase="all" iterations=default_iterations quick="True" + list + model model phase="build" label="dev" iterations=default_iterations quick="True" + plot +files + quick label="dev" +``` -```bash -# Install just (macOS) -brew install just +Start with `just list` to see available models and phases, then `just quick` for a smoke test. -# List available models and phases -just bench-list +## Examples -# Quick smoke test -just bench-quick +```bash +# Discover available models and phases +just list -# Full benchmark suite -just bench label="my-branch" +# Quick smoke test (basic model, all phases, 5 iterations) +just quick -# Single phase -just bench-build label="my-branch" -just bench-memory label="my-branch" -just bench-write label="my-branch" +# Full suite +just all label="my-branch" # Single model + phase -just bench-model basic build label="my-branch" +just model knapsack memory label="my-branch" iterations=20 -# Compare two runs -just bench-compare benchmarks/results/old_basic_build.json benchmarks/results/new_basic_build.json -``` - -## Models +# Compare current branch against master +just compare master -| Name | Description | -|------|-------------| -| `basic` | 2×N² vars/cons — simple dense model | -| `knapsack` | N binary variables — integer programming | -| `pypsa_scigrid` | Real power system from PyPSA SciGrid-DE | -| `sparse` | Sparse ring network — exercises alignment | -| `large_expr` | Many-term expressions — stress test | +# Compare against a remote fork +just compare FBumann:perf/lp-write-speed model="basic" phase="lp_write" -## Phases - -| Name | Description | -|------|-------------| -| `build` | Model construction speed (time) | -| `memory` | Peak memory via tracemalloc | -| `lp_write` | LP file writing speed | +# Plot existing result files +just plot benchmarks/results/master_basic_build.json benchmarks/results/feat_basic_build.json +``` ## Output -Results are saved as JSON files in `benchmarks/results/` (gitignored). -Pattern: `{label}_{model}_{phase}.json` +Results are saved as JSON in `benchmarks/results/` (gitignored), named `{label}_{model}_{phase}.json`. Comparison plots are saved as PNG alongside. 
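
Editor's note: the README examples above all go through `just`, but the comparison is ultimately driven by the result JSON files that `run_single` writes. As a minimal sketch of inspecting two such files directly, using only field names that appear in the runners in this series (`runs`, `params`, `build_time_median_s`; the top-level key is still `label` here and is renamed to `name` in the next patch) and with hypothetical file paths:

```python
import json
from pathlib import Path

# Hypothetical result files produced by e.g. `just quick` on two checkouts
baseline = json.loads(Path("benchmarks/results/master_basic_build.json").read_text())
candidate = json.loads(Path("benchmarks/results/dev_basic_build.json").read_text())

# One entry per model size; "params" identifies the size that was built
for old, new in zip(baseline["runs"], candidate["runs"]):
    o, n = old["build_time_median_s"], new["build_time_median_s"]
    ratio = o / n if n > 0 else float("nan")  # >1 means the candidate is faster
    print(old["params"], f"speedup={ratio:.2f}x")
```

This mirrors the ratio convention used in the updated `compare.py` above (baseline divided by current, so values above 1 indicate an improvement).
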
From 5f480447fa87a33d3781c501f3c20247d5be3d1a Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:39:05 +0100 Subject: [PATCH 19/27] Use name instead of label --- benchmarks/compare.py | 4 ++-- benchmarks/run.py | 18 ++++++++-------- benchmarks/runners/build.py | 4 ++-- benchmarks/runners/lp_write.py | 4 ++-- benchmarks/runners/memory.py | 4 ++-- justfile | 38 +++++++++++++++++----------------- 6 files changed, 36 insertions(+), 36 deletions(-) diff --git a/benchmarks/compare.py b/benchmarks/compare.py index c22fe160..6c6acba7 100644 --- a/benchmarks/compare.py +++ b/benchmarks/compare.py @@ -28,7 +28,7 @@ def _load(path: str) -> dict: with open(path) as f: data = json.load(f) - data.setdefault("label", Path(path).stem) + data.setdefault("name", Path(path).stem) return data @@ -89,7 +89,7 @@ def compare(*paths: str) -> None: all_stats = [] for d in datasets: nvars, med, lo, hi = _extract(d.get("runs", []), phase) - all_stats.append((d["label"], nvars, med, lo, hi)) + all_stats.append((d.get("name", d.get("label", "unknown")), nvars, med, lo, hi)) if not all_stats[0][1]: print("No data to plot.") diff --git a/benchmarks/run.py b/benchmarks/run.py index fb9046e9..6a831e2b 100644 --- a/benchmarks/run.py +++ b/benchmarks/run.py @@ -12,7 +12,7 @@ def run_single( model_name: str, phase: str, - label: str = "dev", + name: str = "dev", iterations: int = 30, quick: bool = False, output_dir: str = "benchmarks/results", @@ -27,7 +27,7 @@ def run_single( ) results = { - "label": label, + "name": name, "model": model_name, "phase": phase, "runs": [], @@ -37,7 +37,7 @@ def run_single( desc = model_mod.LABEL.format(**kwargs) print(f" {desc} ... ", end="", flush=True) res = runner.run( - label=label, + name=name, builder=model_mod.build, builder_args=kwargs, iterations=iterations, @@ -49,7 +49,7 @@ def run_single( # Print a compact summary summary_parts = [] for key, val in res.items(): - if key in ("phase", "label", "params", "iterations"): + if key in ("phase", "name", "params", "iterations"): continue if isinstance(val, float): summary_parts.append(f"{key}={val:.3f}") @@ -60,7 +60,7 @@ def run_single( # Save out_path = Path(output_dir) out_path.mkdir(parents=True, exist_ok=True) - filename = out_path / f"{label}_{model_name}_{phase}.json" + filename = out_path / f"{name}_{model_name}_{phase}.json" with open(filename, "w") as f: json.dump(results, f, indent=2) print(f" Saved: {filename}") @@ -69,7 +69,7 @@ def run_single( def run_phase( phase: str, - label: str = "dev", + name: str = "dev", iterations: int = 30, quick: bool = False, output_dir: str = "benchmarks/results", @@ -81,7 +81,7 @@ def run_phase( res = run_single( model_name, phase, - label=label, + name=name, iterations=iterations, quick=quick, output_dir=output_dir, @@ -91,7 +91,7 @@ def run_phase( def run_all( - label: str = "dev", + name: str = "dev", iterations: int = 30, quick: bool = False, output_dir: str = "benchmarks/results", @@ -104,7 +104,7 @@ def run_all( print(f"{'=' * 60}") results = run_phase( phase, - label=label, + name=name, iterations=iterations, quick=quick, output_dir=output_dir, diff --git a/benchmarks/runners/build.py b/benchmarks/runners/build.py index e9436565..ddbac59e 100644 --- a/benchmarks/runners/build.py +++ b/benchmarks/runners/build.py @@ -11,7 +11,7 @@ def run( - label: str, + name: str, builder, builder_args: dict, iterations: int = 30, @@ -46,7 +46,7 @@ def run( times_arr = np.array(times) return { "phase": PHASE, - "label": label, + "name": name, 
"params": builder_args, "iterations": iterations, "build_time_median_s": float(np.median(times_arr)), diff --git a/benchmarks/runners/lp_write.py b/benchmarks/runners/lp_write.py index d0ee745f..9d80a896 100644 --- a/benchmarks/runners/lp_write.py +++ b/benchmarks/runners/lp_write.py @@ -13,7 +13,7 @@ def run( - label: str, + name: str, builder, builder_args: dict, iterations: int = 10, @@ -49,7 +49,7 @@ def run( times_arr = np.array(times) return { "phase": PHASE, - "label": label, + "name": name, "params": builder_args, "iterations": iterations, "write_time_median_s": float(np.median(times_arr)), diff --git a/benchmarks/runners/memory.py b/benchmarks/runners/memory.py index 15526684..951230b8 100644 --- a/benchmarks/runners/memory.py +++ b/benchmarks/runners/memory.py @@ -11,7 +11,7 @@ def run( - label: str, + name: str, builder, builder_args: dict, iterations: int = 5, @@ -60,7 +60,7 @@ def run( peaks_arr = np.array(peaks) return { "phase": PHASE, - "label": label, + "name": name, "params": builder_args, "iterations": iterations, "peak_memory_median_mb": float(np.median(peaks_arr)), diff --git a/justfile b/justfile index 1145ed1d..6782734e 100644 --- a/justfile +++ b/justfile @@ -2,26 +2,26 @@ default_iterations := "10" results_dir := "benchmarks/results" [group('benchmark')] -all label="dev" iterations=default_iterations: - python -c "from benchmarks.run import run_all; run_all('{{label}}', iterations={{iterations}}, output_dir='{{results_dir}}')" +all name iterations=default_iterations: + python -c "from benchmarks.run import run_all; run_all(name='{{name}}', iterations={{iterations}}, output_dir='{{results_dir}}')" [group('benchmark')] -model model phase="build" label="dev" iterations=default_iterations quick="True": - python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" +model name model phase="all" iterations=default_iterations quick="False": + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', name='{{name}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" [group('benchmark')] -quick label="dev": - just _run basic build {{label}} 5 True - just _run basic memory {{label}} 5 True - just _run basic lp_write {{label}} 5 True +quick name="quick": + just _run basic build {{name}} 5 True + just _run basic memory {{name}} 5 True + just _run basic lp_write {{name}} 5 True [group('benchmark')] -compare ref model="basic" phase="all" iterations=default_iterations quick="True": +compare ref="master" model="basic" phase="all" iterations=default_iterations quick="True": #!/usr/bin/env bash set -euo pipefail home_branch=$(git rev-parse --abbrev-ref HEAD) - home_label=$(echo "$home_branch" | tr '/:' '--') - ref_label=$(echo "{{ref}}" | tr '/:' '--') + home_name=$(echo "$home_branch" | tr '/:' '--') + ref_name=$(echo "{{ref}}" | tr '/:' '--') if [[ "{{phase}}" == "all" ]]; then phases="build memory lp_write" @@ -45,24 +45,24 @@ compare ref model="basic" phase="all" iterations=default_iterations quick="True" git checkout --detach "$checkout_ref" pip install -e . --quiet 2>/dev/null || true - echo ">>> Benchmarking $ref_label (model={{model}}, phases=$phases, quick={{quick}}) ..." + echo ">>> Benchmarking $ref_name (model={{model}}, phases=$phases, quick={{quick}}) ..." 
for phase in $phases; do - just _run "{{model}}" "$phase" "$ref_label" "{{iterations}}" "{{quick}}" + just _run "{{model}}" "$phase" "$ref_name" "{{iterations}}" "{{quick}}" done echo ">>> Returning to $home_branch ..." git checkout "$home_branch" pip install -e . --quiet 2>/dev/null || true - echo ">>> Benchmarking $home_label (model={{model}}, phases=$phases, quick={{quick}}) ..." + echo ">>> Benchmarking $home_name (model={{model}}, phases=$phases, quick={{quick}}) ..." for phase in $phases; do - just _run "{{model}}" "$phase" "$home_label" "{{iterations}}" "{{quick}}" + just _run "{{model}}" "$phase" "$home_name" "{{iterations}}" "{{quick}}" done echo ">>> Comparing results ..." for phase in $phases; do - old="benchmarks/results/${ref_label}_{{model}}_${phase}.json" - new="benchmarks/results/${home_label}_{{model}}_${phase}.json" + old="benchmarks/results/${ref_name}_{{model}}_${phase}.json" + new="benchmarks/results/${home_name}_{{model}}_${phase}.json" if [[ -f "$old" && -f "$new" ]]; then python -c "from benchmarks.compare import compare; compare('$old', '$new')" fi @@ -78,5 +78,5 @@ list: python -c "from benchmarks.run import list_available; list_available()" [private] -_run model phase label iterations quick: - python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', label='{{label}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" +_run model phase name iterations quick: + python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', name='{{name}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" From 3a482129f6a7f2f9692841434af4412895deb33f Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:41:56 +0100 Subject: [PATCH 20/27] Update README.md --- benchmarks/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index fe05019d..a0f37f8c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -6,12 +6,12 @@ Modular benchmark framework for linopy. All commands use [`just`](https://github $ just --list Available recipes: [benchmark] - all label="dev" iterations=default_iterations - compare ref model="basic" phase="all" iterations=default_iterations quick="True" + all name iterations=default_iterations + compare ref="master" model="basic" phase="all" iterations=default_iterations quick="True" list - model model phase="build" label="dev" iterations=default_iterations quick="True" + model name model phase="all" iterations=default_iterations quick="False" plot +files - quick label="dev" + quick name="quick" ``` Start with `just list` to see available models and phases, then `just quick` for a smoke test. 
@@ -26,16 +26,16 @@ just list just quick # Full suite -just all label="my-branch" +just all my-branch # Single model + phase -just model knapsack memory label="my-branch" iterations=20 +just model my-branch knapsack memory iterations=20 # Compare current branch against master -just compare master +just compare -# Compare against a remote fork -just compare FBumann:perf/lp-write-speed model="basic" phase="lp_write" +# Compare against a remote fork, specific phase +just compare ref=FBumann:perf/lp-write-speed phase=lp_write # Plot existing result files just plot benchmarks/results/master_basic_build.json benchmarks/results/feat_basic_build.json @@ -43,4 +43,4 @@ just plot benchmarks/results/master_basic_build.json benchmarks/results/feat_bas ## Output -Results are saved as JSON in `benchmarks/results/` (gitignored), named `{label}_{model}_{phase}.json`. Comparison plots are saved as PNG alongside. +Results are saved as JSON in `benchmarks/results/` (gitignored), named `{name}_{model}_{phase}.json`. Comparison plots are saved as PNG alongside. From 967dbf3a0192453d04226c0f7f5886f25954bdd2 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:45:49 +0100 Subject: [PATCH 21/27] Update justfile and README.md --- benchmarks/README.md | 31 +++++++++++++++++++++++++------ justfile | 7 +++++-- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index a0f37f8c..e663d44d 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -7,9 +7,9 @@ $ just --list Available recipes: [benchmark] all name iterations=default_iterations - compare ref="master" model="basic" phase="all" iterations=default_iterations quick="True" + compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick=default_quick list - model name model phase="all" iterations=default_iterations quick="False" + model name model phase=default_phase iterations=default_iterations quick="False" plot +files quick name="quick" ``` @@ -29,18 +29,37 @@ just quick just all my-branch # Single model + phase -just model my-branch knapsack memory iterations=20 +just model my-branch knapsack memory -# Compare current branch against master +# Compare current branch against master (all phases, basic model) just compare -# Compare against a remote fork, specific phase -just compare ref=FBumann:perf/lp-write-speed phase=lp_write +# Compare against another branch +just compare perf/lp-write-speed-combined-bench + +# Compare against a remote fork +just compare FBumann:perf/lp-write-speed # Plot existing result files just plot benchmarks/results/master_basic_build.json benchmarks/results/feat_basic_build.json ``` +## Overriding defaults + +Recipe parameters that show `=default_*` reference top-level variables in the justfile. +Override them with `--set` on the command line: + +```bash +# Run compare with quick mode +just --set default_quick True compare perf/lp-write-speed + +# Compare only the lp_write phase +just --set default_phase lp_write compare perf/lp-write-speed + +# Combine multiple overrides +just --set default_quick True --set default_phase build compare perf/lp-write-speed +``` + ## Output Results are saved as JSON in `benchmarks/results/` (gitignored), named `{name}_{model}_{phase}.json`. Comparison plots are saved as PNG alongside. 
diff --git a/justfile b/justfile index 6782734e..65d2e524 100644 --- a/justfile +++ b/justfile @@ -1,4 +1,7 @@ default_iterations := "10" +default_model := "basic" +default_phase := "all" +default_quick := "False" results_dir := "benchmarks/results" [group('benchmark')] @@ -6,7 +9,7 @@ all name iterations=default_iterations: python -c "from benchmarks.run import run_all; run_all(name='{{name}}', iterations={{iterations}}, output_dir='{{results_dir}}')" [group('benchmark')] -model name model phase="all" iterations=default_iterations quick="False": +model name model phase=default_phase iterations=default_iterations quick="False": python -c "from benchmarks.run import run_single; run_single('{{model}}', '{{phase}}', name='{{name}}', iterations={{iterations}}, quick={{quick}}, output_dir='{{results_dir}}')" [group('benchmark')] @@ -16,7 +19,7 @@ quick name="quick": just _run basic lp_write {{name}} 5 True [group('benchmark')] -compare ref="master" model="basic" phase="all" iterations=default_iterations quick="True": +compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick=default_quick: #!/usr/bin/env bash set -euo pipefail home_branch=$(git rev-parse --abbrev-ref HEAD) From 270b28f6c2c3aedaecb6b79a792fec0b5323b1a4 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:48:29 +0100 Subject: [PATCH 22/27] Update justfile and README.md --- benchmarks/README.md | 18 ++++++------------ benchmarks/runners/lp_write.py | 4 ++-- justfile | 7 +++++-- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index e663d44d..a5506631 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -7,7 +7,8 @@ $ just --list Available recipes: [benchmark] all name iterations=default_iterations - compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick=default_quick + compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick="False" + compare-quick ref="master" list model name model phase=default_phase iterations=default_iterations quick="False" plot +files @@ -34,8 +35,8 @@ just model my-branch knapsack memory # Compare current branch against master (all phases, basic model) just compare -# Compare against another branch -just compare perf/lp-write-speed-combined-bench +# Quick compare (small sizes, 5 iterations) +just compare-quick perf/lp-write-speed # Compare against a remote fork just compare FBumann:perf/lp-write-speed @@ -46,18 +47,11 @@ just plot benchmarks/results/master_basic_build.json benchmarks/results/feat_bas ## Overriding defaults -Recipe parameters that show `=default_*` reference top-level variables in the justfile. -Override them with `--set` on the command line: +Parameters showing `=default_*` reference top-level justfile variables. 
Override them with `--set`: ```bash -# Run compare with quick mode -just --set default_quick True compare perf/lp-write-speed - -# Compare only the lp_write phase just --set default_phase lp_write compare perf/lp-write-speed - -# Combine multiple overrides -just --set default_quick True --set default_phase build compare perf/lp-write-speed +just --set default_model knapsack --set default_iterations 20 compare master ``` ## Output diff --git a/benchmarks/runners/lp_write.py b/benchmarks/runners/lp_write.py index 9d80a896..f681699e 100644 --- a/benchmarks/runners/lp_write.py +++ b/benchmarks/runners/lp_write.py @@ -37,12 +37,12 @@ def run( lp_path = Path(tmpdir) / "model.lp" # Warmup - model.to_file(lp_path) + model.to_file(lp_path, progress=False) for _ in range(iterations): gc.collect() t0 = time.perf_counter() - model.to_file(lp_path) + model.to_file(lp_path, progress=False) elapsed = time.perf_counter() - t0 times.append(elapsed) diff --git a/justfile b/justfile index 65d2e524..8c8881cc 100644 --- a/justfile +++ b/justfile @@ -1,7 +1,6 @@ default_iterations := "10" default_model := "basic" default_phase := "all" -default_quick := "False" results_dir := "benchmarks/results" [group('benchmark')] @@ -19,7 +18,7 @@ quick name="quick": just _run basic lp_write {{name}} 5 True [group('benchmark')] -compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick=default_quick: +compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick="False": #!/usr/bin/env bash set -euo pipefail home_branch=$(git rev-parse --abbrev-ref HEAD) @@ -72,6 +71,10 @@ compare ref="master" model=default_model phase=default_phase iterations=default_ done echo ">>> Done." +[group('benchmark')] +compare-quick ref="master": + just compare {{ref}} basic all 10 True + [group('benchmark')] plot +files: python -c "import sys; from benchmarks.compare import compare; compare(*sys.argv[1:])" {{files}} From 1e3dee77739aa76ebd591f2e105191129b332661 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:57:29 +0100 Subject: [PATCH 23/27] Add all --- benchmarks/models/basic.py | 4 ++-- benchmarks/models/knapsack.py | 4 ++-- benchmarks/models/large_expr.py | 11 +++++---- benchmarks/models/sparse.py | 8 ++++--- justfile | 42 +++++++++++++++++++++++---------- 5 files changed, 45 insertions(+), 24 deletions(-) diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py index f96abcec..21ae643c 100644 --- a/benchmarks/models/basic.py +++ b/benchmarks/models/basic.py @@ -5,8 +5,8 @@ import linopy LABEL = "basic N={n}" -SIZES = [{"n": n} for n in [5, 10, 25, 50, 100, 200, 500]] -QUICK_SIZES = [{"n": n} for n in [5, 10, 25]] +SIZES = [{"n": n} for n in [10, 50, 100, 250, 500, 1000, 1600]] +QUICK_SIZES = [{"n": n} for n in [10, 50, 100]] DESCRIPTION = "2*N^2 vars/cons — simple dense model" diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py index 38c3c189..d835d003 100644 --- a/benchmarks/models/knapsack.py +++ b/benchmarks/models/knapsack.py @@ -7,8 +7,8 @@ import linopy LABEL = "knapsack N={n}" -SIZES = [{"n": n} for n in [10, 50, 100, 500, 1000]] -QUICK_SIZES = [{"n": n} for n in [10, 50]] +SIZES = [{"n": n} for n in [100, 1_000, 10_000, 100_000, 1_000_000, 5_000_000]] +QUICK_SIZES = [{"n": n} for n in [100, 1_000, 10_000]] DESCRIPTION = "N binary variables — integer programming stress test" diff --git a/benchmarks/models/large_expr.py b/benchmarks/models/large_expr.py index 
b537e541..6854c5c5 100644 --- a/benchmarks/models/large_expr.py +++ b/benchmarks/models/large_expr.py @@ -6,12 +6,15 @@ LABEL = "large_expr N={n_constraints} K={terms_per_constraint}" SIZES = [ - {"n_constraints": 100, "terms_per_constraint": 10}, - {"n_constraints": 500, "terms_per_constraint": 50}, - {"n_constraints": 1000, "terms_per_constraint": 100}, + {"n_constraints": 100, "terms_per_constraint": 100}, + {"n_constraints": 500, "terms_per_constraint": 200}, + {"n_constraints": 1000, "terms_per_constraint": 500}, + {"n_constraints": 2000, "terms_per_constraint": 1000}, + {"n_constraints": 5000, "terms_per_constraint": 1000}, ] QUICK_SIZES = [ - {"n_constraints": 100, "terms_per_constraint": 10}, + {"n_constraints": 100, "terms_per_constraint": 100}, + {"n_constraints": 500, "terms_per_constraint": 200}, ] DESCRIPTION = "N constraints each summing K variables — expression building stress test" diff --git a/benchmarks/models/sparse.py b/benchmarks/models/sparse.py index e73e6a54..dfecf142 100644 --- a/benchmarks/models/sparse.py +++ b/benchmarks/models/sparse.py @@ -8,14 +8,16 @@ LABEL = "sparse N={n_buses} T={n_time}" SIZES = [ - {"n_buses": 20, "n_time": 24}, {"n_buses": 50, "n_time": 50}, {"n_buses": 100, "n_time": 100}, - {"n_buses": 200, "n_time": 200}, + {"n_buses": 250, "n_time": 250}, + {"n_buses": 500, "n_time": 500}, + {"n_buses": 1000, "n_time": 1000}, + {"n_buses": 1600, "n_time": 1600}, ] QUICK_SIZES = [ - {"n_buses": 20, "n_time": 24}, {"n_buses": 50, "n_time": 50}, + {"n_buses": 100, "n_time": 100}, ] DESCRIPTION = "Sparse ring network — exercises outer-join alignment" diff --git a/justfile b/justfile index 8c8881cc..501fd7be 100644 --- a/justfile +++ b/justfile @@ -31,6 +31,12 @@ compare ref="master" model=default_model phase=default_phase iterations=default_ phases="{{phase}}" fi + if [[ "{{model}}" == "all" ]]; then + models=$(python -c "from benchmarks.models import list_models; print(' '.join(list_models()))") + else + models="{{model}}" + fi + ref="{{ref}}" if [[ "$ref" == *:* ]]; then remote="${ref%%:*}" @@ -47,33 +53,43 @@ compare ref="master" model=default_model phase=default_phase iterations=default_ git checkout --detach "$checkout_ref" pip install -e . --quiet 2>/dev/null || true - echo ">>> Benchmarking $ref_name (model={{model}}, phases=$phases, quick={{quick}}) ..." - for phase in $phases; do - just _run "{{model}}" "$phase" "$ref_name" "{{iterations}}" "{{quick}}" + echo ">>> Benchmarking $ref_name (models=$models, phases=$phases, quick={{quick}}) ..." + for model in $models; do + for phase in $phases; do + just _run "$model" "$phase" "$ref_name" "{{iterations}}" "{{quick}}" + done done echo ">>> Returning to $home_branch ..." git checkout "$home_branch" pip install -e . --quiet 2>/dev/null || true - echo ">>> Benchmarking $home_name (model={{model}}, phases=$phases, quick={{quick}}) ..." - for phase in $phases; do - just _run "{{model}}" "$phase" "$home_name" "{{iterations}}" "{{quick}}" + echo ">>> Benchmarking $home_name (models=$models, phases=$phases, quick={{quick}}) ..." + for model in $models; do + for phase in $phases; do + just _run "$model" "$phase" "$home_name" "{{iterations}}" "{{quick}}" + done done echo ">>> Comparing results ..." 
- for phase in $phases; do - old="benchmarks/results/${ref_name}_{{model}}_${phase}.json" - new="benchmarks/results/${home_name}_{{model}}_${phase}.json" - if [[ -f "$old" && -f "$new" ]]; then - python -c "from benchmarks.compare import compare; compare('$old', '$new')" - fi + for model in $models; do + for phase in $phases; do + old="benchmarks/results/${ref_name}_${model}_${phase}.json" + new="benchmarks/results/${home_name}_${model}_${phase}.json" + if [[ -f "$old" && -f "$new" ]]; then + python -c "from benchmarks.compare import compare; compare('$old', '$new')" + fi + done done echo ">>> Done." +[group('benchmark')] +compare-all ref="master" iterations=default_iterations: + just compare {{ref}} all all {{iterations}} False + [group('benchmark')] compare-quick ref="master": - just compare {{ref}} basic all 10 True + just compare {{ref}} basic all 5 True [group('benchmark')] plot +files: From 709349612e0f2b4b307598e3be50e311ac2842e5 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:57:34 +0100 Subject: [PATCH 24/27] Add all --- benchmarks/README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index a5506631..6a5f8e7f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -8,6 +8,7 @@ Available recipes: [benchmark] all name iterations=default_iterations compare ref="master" model=default_model phase=default_phase iterations=default_iterations quick="False" + compare-all ref="master" iterations=default_iterations compare-quick ref="master" list model name model phase=default_phase iterations=default_iterations quick="False" @@ -26,16 +27,19 @@ just list # Quick smoke test (basic model, all phases, 5 iterations) just quick -# Full suite +# Full suite (all models, all phases) just all my-branch # Single model + phase just model my-branch knapsack memory -# Compare current branch against master (all phases, basic model) +# Compare current branch against master (basic model, all phases) just compare -# Quick compare (small sizes, 5 iterations) +# Compare all models against master +just compare-all + +# Quick compare (basic model, small sizes, 5 iterations) just compare-quick perf/lp-write-speed # Compare against a remote fork From f3810217b76e49e162c8a509814a20c564f5f693 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:09:29 +0100 Subject: [PATCH 25/27] Vectorize sparse.py --- benchmarks/models/sparse.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/benchmarks/models/sparse.py b/benchmarks/models/sparse.py index dfecf142..feaccd29 100644 --- a/benchmarks/models/sparse.py +++ b/benchmarks/models/sparse.py @@ -3,6 +3,7 @@ from __future__ import annotations import numpy as np +import xarray as xr import linopy @@ -46,18 +47,21 @@ def build(n_buses: int, n_time: int) -> linopy.Model: m.add_constraints(flow <= 100, name="flow_upper") m.add_constraints(flow >= -100, name="flow_lower") - # Bus balance: gen[b] + inflow - outflow = demand[b] - rng = np.random.default_rng(42) - demand = rng.uniform(10, 50, size=(n_buses, n_time)) + # Bus balance: gen[b] + inflow[b] - outflow[b] = demand[b] + # In a ring: line b-1 flows into bus b, line b flows out of bus b + # Rename line→bus so dimensions align for vectorized constraint + inflow = flow.roll(line=1).assign_coords(line=list(buses)).rename(line="bus") + outflow = 
flow.assign_coords(line=list(buses)).rename(line="bus") + balance = gen + inflow - outflow - for b in buses: - # Lines into bus b: line (b-1) % n_buses flows into b - # Lines out of bus b: line b flows out of b - line_in = (b - 1) % n_buses - line_out = b - balance = gen.sel(bus=b) + flow.sel(line=line_in) - flow.sel(line=line_out) - m.add_constraints(balance == demand[b], name=f"balance_{b}") + rng = np.random.default_rng(42) + demand = xr.DataArray( + rng.uniform(10, 50, size=(n_buses, n_time)), + coords=[list(buses), list(time)], + dims=["bus", "time"], + ) + m.add_constraints(balance == demand, name="balance") - # Generation cost (sum over time first, then weight by bus cost) + # Generation cost m.add_objective(gen.sum("time")) return m From b47be5a17c6ade056b5b67ee5cd422d4e11a2fe8 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:11:17 +0100 Subject: [PATCH 26/27] Remove sparse.py --- benchmarks/README.md | 4 +++ benchmarks/models/sparse.py | 67 ------------------------------------- 2 files changed, 4 insertions(+), 67 deletions(-) delete mode 100644 benchmarks/models/sparse.py diff --git a/benchmarks/README.md b/benchmarks/README.md index 6a5f8e7f..6157cdc7 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -61,3 +61,7 @@ just --set default_model knapsack --set default_iterations 20 compare master ## Output Results are saved as JSON in `benchmarks/results/` (gitignored), named `{name}_{model}_{phase}.json`. Comparison plots are saved as PNG alongside. + +## Ideas for future models + +- **sparse**: A model with variables on mismatched coordinate subsets to exercise sparse/outer-join alignment (e.g. lines indexed by `(bus_from, bus_to)` vs bus-level variables). diff --git a/benchmarks/models/sparse.py b/benchmarks/models/sparse.py deleted file mode 100644 index feaccd29..00000000 --- a/benchmarks/models/sparse.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Sparse topology benchmark: ring network with bus balance constraints.""" - -from __future__ import annotations - -import numpy as np -import xarray as xr - -import linopy - -LABEL = "sparse N={n_buses} T={n_time}" -SIZES = [ - {"n_buses": 50, "n_time": 50}, - {"n_buses": 100, "n_time": 100}, - {"n_buses": 250, "n_time": 250}, - {"n_buses": 500, "n_time": 500}, - {"n_buses": 1000, "n_time": 1000}, - {"n_buses": 1600, "n_time": 1600}, -] -QUICK_SIZES = [ - {"n_buses": 50, "n_time": 50}, - {"n_buses": 100, "n_time": 100}, -] -DESCRIPTION = "Sparse ring network — exercises outer-join alignment" - - -def build(n_buses: int, n_time: int) -> linopy.Model: - """ - Build a ring-topology network model. - - N buses connected in a ring, each with generation and demand. - Flow variables on each line connect adjacent buses. 
- """ - m = linopy.Model() - - buses = range(n_buses) - time = range(n_time) - # Ring topology: line i connects bus i to bus (i+1) % n_buses - n_lines = n_buses - lines = range(n_lines) - - gen = m.add_variables( - lower=0, coords=[buses, time], dims=["bus", "time"], name="gen" - ) - flow = m.add_variables(coords=[lines, time], dims=["line", "time"], name="flow") - - # Flow capacity - m.add_constraints(flow <= 100, name="flow_upper") - m.add_constraints(flow >= -100, name="flow_lower") - - # Bus balance: gen[b] + inflow[b] - outflow[b] = demand[b] - # In a ring: line b-1 flows into bus b, line b flows out of bus b - # Rename line→bus so dimensions align for vectorized constraint - inflow = flow.roll(line=1).assign_coords(line=list(buses)).rename(line="bus") - outflow = flow.assign_coords(line=list(buses)).rename(line="bus") - balance = gen + inflow - outflow - - rng = np.random.default_rng(42) - demand = xr.DataArray( - rng.uniform(10, 50, size=(n_buses, n_time)), - coords=[list(buses), list(time)], - dims=["bus", "time"], - ) - m.add_constraints(balance == demand, name="balance") - - # Generation cost - m.add_objective(gen.sum("time")) - return m From 349b477891c0363b8b62c8d8d91cd03b8c532da2 Mon Sep 17 00:00:00 2001 From: FBumann <117816358+FBumann@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:11:54 +0100 Subject: [PATCH 27/27] Remove large_expr.py --- benchmarks/README.md | 1 + benchmarks/models/large_expr.py | 40 --------------------------------- 2 files changed, 1 insertion(+), 40 deletions(-) delete mode 100644 benchmarks/models/large_expr.py diff --git a/benchmarks/README.md b/benchmarks/README.md index 6157cdc7..dff9effa 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -65,3 +65,4 @@ Results are saved as JSON in `benchmarks/results/` (gitignored), named `{name}_{ ## Ideas for future models - **sparse**: A model with variables on mismatched coordinate subsets to exercise sparse/outer-join alignment (e.g. lines indexed by `(bus_from, bus_to)` vs bus-level variables). +- **large_expr**: A model that stress-tests expression building — chaining many arithmetic operations, combining expressions from different variable groups, or building expressions incrementally. 
diff --git a/benchmarks/models/large_expr.py b/benchmarks/models/large_expr.py deleted file mode 100644 index 6854c5c5..00000000 --- a/benchmarks/models/large_expr.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Large expression benchmark: many-term expression stress test.""" - -from __future__ import annotations - -import linopy - -LABEL = "large_expr N={n_constraints} K={terms_per_constraint}" -SIZES = [ - {"n_constraints": 100, "terms_per_constraint": 100}, - {"n_constraints": 500, "terms_per_constraint": 200}, - {"n_constraints": 1000, "terms_per_constraint": 500}, - {"n_constraints": 2000, "terms_per_constraint": 1000}, - {"n_constraints": 5000, "terms_per_constraint": 1000}, -] -QUICK_SIZES = [ - {"n_constraints": 100, "terms_per_constraint": 100}, - {"n_constraints": 500, "terms_per_constraint": 200}, -] -DESCRIPTION = "N constraints each summing K variables — expression building stress test" - - -def build(n_constraints: int, terms_per_constraint: int) -> linopy.Model: - """Build a model with many-term expressions.""" - m = linopy.Model() - - # Create variables: one per (constraint, term) - x = m.add_variables( - lower=0, - coords=[range(n_constraints), range(terms_per_constraint)], - dims=["constraint", "term"], - name="x", - ) - - # Each constraint sums all terms for that constraint index - expr = x.sum("term") - m.add_constraints(expr <= 1, name="sum_limit") - - # Objective: sum everything - m.add_objective(x.sum()) - return m
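
Editor's note: every `just` recipe in this series shells out to `benchmarks.run` and `benchmarks.compare` via `python -c`, so the same entry points can be called directly from Python (e.g. in a notebook or a CI script). A minimal sketch using the signatures as they stand at the end of the series (`run_single` with the `name=` keyword, `compare` accepting any number of result paths); the result file paths passed to `compare` are hypothetical:

```python
from benchmarks.run import run_single, run_all
from benchmarks.compare import compare

# Benchmark one model/phase on the current checkout (quick sizes, few iterations)
run_single("basic", "build", name="dev", iterations=5, quick=True,
           output_dir="benchmarks/results")

# Full sweep over all registered models and phases (commented out: slow)
# run_all(name="dev", iterations=10, output_dir="benchmarks/results")

# Plot two (or more) previously saved result files; paths are hypothetical
compare("benchmarks/results/master_basic_build.json",
        "benchmarks/results/dev_basic_build.json")
```

This is exactly what the private `_run` recipe in the justfile executes, just without the shell indirection.
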