From 995181359ec86952a409595366397f53c19cd626 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Wed, 1 Jul 2026 15:09:40 +0200
Subject: [PATCH 1/2] bench: add arithmetic operation micro-benchmarks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An op-level tier alongside the whole-model builds: one benchmark per
(operation, size profile), operands built outside the measured region so a run
isolates a single op rather than a whole build. This attributes perf changes to
a specific arithmetic path — a build benchmark says "kvl got heavier", an op
benchmark says "expr+expr broadcast got heavier".

- benchmarks/ops.py: op registry (OpSpec) + two size profiles (small: 1-D,
  2000 elements; large: 3-D, 3×4×1000 — they differ in element count *and* dim
  count, and the asymmetric shape also catches dim-order bugs) + 30 ops across
  scaling, var/expr arithmetic, quadratic, reductions and constraint
  construction. Binary labelled ops carry match/broadcast variants — the
  alignment-path axis where the interesting regressions live.
- benchmarks/drivers/test_ops.py: parametrized driver, one benchmark per
  (op, profile).
- benchmarks/conftest.py: add test_ops to CODSPEED_MODULES (tracked; memory
  advisory).

60 benchmarks (30 ops × 2 profiles), ~80 s/run with memory. The signal
validates: large ≈ 6× small (12 000 vs 2 000 elements) and broadcast ≈ 5× match
(the §9 cross-product over the extra dim of length 5).
---
 benchmarks/conftest.py         |   1 +
 benchmarks/drivers/test_ops.py |  27 ++++
 benchmarks/ops.py              | 260 +++++++++++++++++++++++++++++++++
 3 files changed, 288 insertions(+)
 create mode 100644 benchmarks/drivers/test_ops.py
 create mode 100644 benchmarks/ops.py

diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index b9ef6014..58cbdd3d 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -20,6 +20,7 @@
     "test_build",
     "test_to_lp",
     "test_to_solver",
+    "test_ops",
 )
 
 
diff --git a/benchmarks/drivers/test_ops.py b/benchmarks/drivers/test_ops.py
new file mode 100644
index 00000000..774bd5a8
--- /dev/null
+++ b/benchmarks/drivers/test_ops.py
@@ -0,0 +1,27 @@
+"""
+Arithmetic operation micro-benchmarks.
+
+One benchmark per ``(operation, size profile)`` — the operands are built in
+setup (not measured) and the fixture measures a single ``op(*operands)``. See
+:mod:`benchmarks.ops` for the op registry and the size / alignment axes.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+
+import pytest
+
+from benchmarks.ops import OpSpec, Profile, iter_op_params
+
+_CASES = iter_op_params()
+
+
+@pytest.mark.parametrize(
+    "op, profile",
+    _CASES,
+    ids=[f"{op.name}[{profile.key}]" for op, profile in _CASES],
+)
+def test_op(benchmark: Callable[..., object], op: OpSpec, profile: Profile) -> None:
+    operands = op.setup(profile)
+    benchmark(op.op, *operands)
diff --git a/benchmarks/ops.py b/benchmarks/ops.py
new file mode 100644
index 00000000..483e4315
--- /dev/null
+++ b/benchmarks/ops.py
@@ -0,0 +1,260 @@
+"""
+Registry of arithmetic *operation* micro-benchmarks.
+
+Where :mod:`benchmarks.registry` benchmarks whole model builds, this benchmarks
+single operations — ``var * array``, ``expr + expr``, ``expr <= c`` — with the
+operands built *outside* the measured region, so a run isolates one op rather
+than a whole build. That granularity attributes regressions to a specific path
+(a whole-build benchmark says "kvl got heavier"; an op benchmark says "expr+expr
+broadcast got heavier").
+
+Two axes beyond the op itself:
+
+- **size profile** — ``small`` (1-D, 2000) is the cheap time/shape signal;
+  ``large`` (3-D, 3×4×1000) carries the memory signal and exercises multi-dim
+  broadcast/alignment. They differ in element count *and* dimensionality on
+  purpose; the asymmetric 3×4×1000 also catches dim-order/transpose bugs.
+- **alignment** — for binary labelled ops, ``match`` (identical coords, the fast
+  path) vs ``broadcast`` (an extra dim → §9 cross-product). Both are where the
+  alignment-path regressions live, so they're first-class, not incidental.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+import linopy
+
+# --- size profiles ----------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class Profile:
+    """A benchmark size: named dimensions and their lengths."""
+
+    key: str
+    dims: tuple[str, ...]
+    shape: tuple[int, ...]
+
+    @property
+    def size(self) -> int:
+        return int(np.prod(self.shape))
+
+
+SMALL = Profile("small", ("d0",), (2000,))
+LARGE = Profile("large", ("d0", "d1", "d2"), (3, 4, 1000))
+PROFILES: tuple[Profile, ...] = (SMALL, LARGE)
+
+# a broadcast operand always adds this one extra dim (kept small so the
+# cross-product stays cheap while still exercising the broadcast path)
+EXTRA_DIM = "b"
+EXTRA_LEN = 5
+
+
+# --- operand builders (run in setup, never measured) ------------------------
+
+
+def _coords(dims: tuple[str, ...], shape: tuple[int, ...]) -> dict[str, pd.Index]:
+    return {d: pd.RangeIndex(n, name=d) for d, n in zip(dims, shape)}
+
+
+def var(profile: Profile, name: str = "x") -> linopy.Variable:
+    """A variable spanning the profile's dimensions."""
+    m = linopy.Model()
+    return m.add_variables(
+        coords=list(_coords(profile.dims, profile.shape).values()),
+        dims=list(profile.dims),
+        name=name,
+    )
+
+
+def array(profile: Profile) -> xr.DataArray:
+    """A coefficient array matching the profile's dims (the ``match`` case)."""
+    return xr.DataArray(
+        np.linspace(-1.0, 1.0, profile.size).reshape(profile.shape),
+        dims=list(profile.dims),
+        coords=_coords(profile.dims, profile.shape),
+    )
+
+
+def extra_array(_: Profile) -> xr.DataArray:
+    """An array on a *new* dim — broadcasting it introduces that dim (§9)."""
+    return xr.DataArray(
+        np.linspace(1.0, 2.0, EXTRA_LEN),
+        dims=[EXTRA_DIM],
+        coords={EXTRA_DIM: pd.RangeIndex(EXTRA_LEN, name=EXTRA_DIM)},
+    )
+
+
+def extra_var(profile: Profile, name: str = "z") -> linopy.Variable:
+    """A variable on a *new* dim — for var+var broadcast (``profile`` is unused)."""
+    m = linopy.Model()
+    return m.add_variables(
+        coords=[pd.RangeIndex(EXTRA_LEN, name=EXTRA_DIM)], dims=[EXTRA_DIM], name=name
+    )
+
+
+def expr(profile: Profile) -> linopy.LinearExpression:
+    """A linear expression spanning the profile's dims (coeffs vary)."""
+    return array(profile) * var(profile)
+
+
+# --- op registry ------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class OpSpec:
+    """One operation benchmark: build operands, then measure ``op(*operands)``."""
+
+    name: str
+    group: str
+    setup: Callable[[Profile], tuple]
+    op: Callable[..., object]
+    profiles: tuple[str, ...] = ("small", "large")
+
+
+OP_REGISTRY: dict[str, OpSpec] = {}
+
+
+def register_op(
+    name: str,
+    group: str,
+    setup: Callable[[Profile], tuple],
+    op: Callable[..., object],
+    profiles: tuple[str, ...] = ("small", "large"),
+) -> None:
+    if name in OP_REGISTRY:
+        raise ValueError(f"op {name!r} already registered")
+    OP_REGISTRY[name] = OpSpec(name, group, setup, op, profiles)
+
+
+def iter_op_params() -> list[tuple[OpSpec, Profile]]:
+    """``(op, profile)`` pairs — the pytest parametrize source."""
+    by_key = {p.key: p for p in PROFILES}
+    return [(op, by_key[key]) for op in OP_REGISTRY.values() for key in op.profiles]
+
+
+# --- the operations ---------------------------------------------------------
+# Binary labelled ops register a `match` and a `broadcast` variant; the
+# alignment case is baked into the operands the setup builds.
+
+# scaling / construction
+register_op("var_mul_scalar", "scale", lambda p: (var(p),), lambda x: 2.0 * x)
+register_op("var_div_scalar", "scale", lambda p: (var(p),), lambda x: x / 2.0)
+register_op("var_neg", "scale", lambda p: (var(p),), lambda x: -x)
+register_op("var_to_linexpr", "scale", lambda p: (var(p),), lambda x: 1 * x)
+register_op(
+    "var_mul_array_match", "scale", lambda p: (var(p), array(p)), lambda x, a: a * x
+)
+register_op(
+    "var_mul_array_bcast",
+    "scale",
+    lambda p: (var(p), extra_array(p)),
+    lambda x, a: a * x,
+)
+
+# variable arithmetic
+register_op("var_add_scalar", "var_arith", lambda p: (var(p),), lambda x: x + 2.0)
+register_op(
+    "var_add_array_match", "var_arith", lambda p: (var(p), array(p)), lambda x, a: x + a
+)
+register_op(
+    "var_add_array_bcast",
+    "var_arith",
+    lambda p: (var(p), extra_array(p)),
+    lambda x, a: x + a,
+)
+register_op(
+    "var_add_var_match",
+    "var_arith",
+    lambda p: (var(p, "x"), var(p, "y")),
+    lambda x, y: x + y,
+)
+register_op(
+    "var_add_var_bcast",
+    "var_arith",
+    lambda p: (var(p, "x"), extra_var(p)),
+    lambda x, z: x + z,
+)
+register_op(
+    "var_sub_var_match",
+    "var_arith",
+    lambda p: (var(p, "x"), var(p, "y")),
+    lambda x, y: x - y,
+)
+
+# quadratic
+register_op(
+    "var_mul_var", "quad", lambda p: (var(p, "x"), var(p, "y")), lambda x, y: x * y
+)
+register_op(
+    "expr_mul_var", "quad", lambda p: (expr(p), var(p, "y")), lambda e, y: e * y
+)
+
+# expression arithmetic
+register_op("expr_add_scalar", "expr_arith", lambda p: (expr(p),), lambda e: e + 2.0)
+register_op(
+    "expr_add_array_match",
+    "expr_arith",
+    lambda p: (expr(p), array(p)),
+    lambda e, a: e + a,
+)
+register_op(
+    "expr_add_array_bcast",
+    "expr_arith",
+    lambda p: (expr(p), extra_array(p)),
+    lambda e, a: e + a,
+)
+register_op(
+    "expr_add_var", "expr_arith", lambda p: (expr(p), var(p, "y")), lambda e, y: e + y
+)
+register_op(
+    "expr_add_expr_match",
+    "expr_arith",
+    lambda p: (expr(p), expr(p)),
+    lambda a, b: a + b,
+)
+register_op(
+    "expr_add_expr_bcast",
+    "expr_arith",
+    lambda p: (expr(p), extra_array(p) * var(p)),
+    lambda a, b: a + b,
+)
+register_op(
+    "expr_sub_expr_match",
+    "expr_arith",
+    lambda p: (expr(p), expr(p)),
+    lambda a, b: a - b,
+)
+register_op("expr_mul_scalar", "expr_arith", lambda p: (expr(p),), lambda e: 2.0 * e)
+register_op(
+    "expr_mul_array_match",
+    "expr_arith",
+    lambda p: (expr(p), array(p)),
+    lambda e, a: a * e,
+)
+register_op(
+    "expr_mul_array_bcast",
+    "expr_arith",
+    lambda p: (expr(p), extra_array(p)),
+    lambda e, a: a * e,
+)
+
+# reductions (sum over d0, present in both profiles)
+register_op("var_sum_dim", "reduce", lambda p: (var(p),), lambda x: x.sum("d0"))
+register_op("expr_sum_dim", "reduce", lambda p: (expr(p),), lambda e: e.sum("d0"))
+register_op("expr_sum_all", "reduce", lambda p: (expr(p),), lambda e: e.sum())
+
+# constraint construction
+register_op("con_le_scalar", "constraint", lambda p: (expr(p),), lambda e: e <= 2.0)
+register_op(
+    "con_le_array", "constraint", lambda p: (expr(p), array(p)), lambda e, a: e <= a
+)
+register_op(
+    "con_eq_expr", "constraint", lambda p: (expr(p), expr(p)), lambda a, b: a == b
+)

From a99bcb484a8d2ebf2724476ffeee4a9a91d1ec6f Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Wed, 1 Jul 2026 15:19:30 +0200
Subject: [PATCH 2/2] bench(ops): single 3-D profile; add masking/groupby/merge
 ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Collapse to one 3-D profile (3×4×1000, ~12 K elements) — CodSpeed records time
*and* memory per benchmark, so a second size wasn't buying a separate signal;
one multi-dim profile keeps broadcast/alignment coverage with MB-scale ops above
the noise floor, and halves the matrix. Benchmark ids drop the size suffix.

Add three categories: absence/masking (expr.where / fillna / absence
propagation — §4–§7, the semantics-heavy surface), groupby.sum, and an N-way
merge (constraint-assembly cost). 35 ops (30 + 5 new), ~45 s/run with memory.
---
 benchmarks/drivers/test_ops.py | 14 ++----
 benchmarks/ops.py              | 83 ++++++++++++++++++++++++++--------
 2 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/benchmarks/drivers/test_ops.py b/benchmarks/drivers/test_ops.py
index 774bd5a8..230dd8b7 100644
--- a/benchmarks/drivers/test_ops.py
+++ b/benchmarks/drivers/test_ops.py
@@ -12,16 +12,12 @@
 
 import pytest
 
-from benchmarks.ops import OpSpec, Profile, iter_op_params
+from benchmarks.ops import GRID, OpSpec, iter_ops
 
-_CASES = iter_op_params()
+_OPS = iter_ops()
 
 
-@pytest.mark.parametrize(
-    "op, profile",
-    _CASES,
-    ids=[f"{op.name}[{profile.key}]" for op, profile in _CASES],
-)
-def test_op(benchmark: Callable[..., object], op: OpSpec, profile: Profile) -> None:
-    operands = op.setup(profile)
+@pytest.mark.parametrize("op", _OPS, ids=[op.name for op in _OPS])
+def test_op(benchmark: Callable[..., object], op: OpSpec) -> None:
+    operands = op.setup(GRID)
     benchmark(op.op, *operands)
diff --git a/benchmarks/ops.py b/benchmarks/ops.py
index 483e4315..152ff3ab 100644
--- a/benchmarks/ops.py
+++ b/benchmarks/ops.py
@@ -8,15 +8,16 @@
 (a whole-build benchmark says "kvl got heavier"; an op benchmark says "expr+expr
 broadcast got heavier").
 
-Two axes beyond the op itself:
-
-- **size profile** — ``small`` (1-D, 2000) is the cheap time/shape signal;
-  ``large`` (3-D, 3×4×1000) carries the memory signal and exercises multi-dim
-  broadcast/alignment. They differ in element count *and* dimensionality on
-  purpose; the asymmetric 3×4×1000 also catches dim-order/transpose bugs.
-- **alignment** — for binary labelled ops, ``match`` (identical coords, the fast
-  path) vs ``broadcast`` (an extra dim → §9 cross-product). Both are where the
-  alignment-path regressions live, so they're first-class, not incidental.
+One 3-D size profile (``3×4×1000``, ~12 K elements): multi-dim so it exercises
+broadcast/alignment across dims; ~MB-scale ops sit above the memory-measurement
+noise floor; the asymmetric shape catches dim-order/transpose bugs. CodSpeed
+records time *and* memory on every benchmark, so a second size isn't needed to
+separate the two signals.
+
+The one axis beyond the op itself is **alignment** — for binary labelled ops,
+``match`` (identical coords, the fast path) vs ``broadcast`` (an extra dim → §9
+cross-product). That's where the alignment-path regressions live, so it's
+first-class, not incidental.
 """
 
 from __future__ import annotations
@@ -46,9 +47,7 @@ def size(self) -> int:
         return int(np.prod(self.shape))
 
 
-SMALL = Profile("small", ("d0",), (2000,))
-LARGE = Profile("large", ("d0", "d1", "d2"), (3, 4, 1000))
-PROFILES: tuple[Profile, ...] = (SMALL, LARGE)
+GRID = Profile("grid", ("d0", "d1", "d2"), (3, 4, 1000))
 
 # a broadcast operand always adds this one extra dim (kept small so the
 # cross-product stays cheap while still exercising the broadcast path)
@@ -104,6 +103,27 @@ def expr(profile: Profile) -> linopy.LinearExpression:
     return array(profile) * var(profile)
 
 
+def cond(profile: Profile) -> xr.DataArray:
+    """A boolean mask over the profile's dims (~half the slots)."""
+    return array(profile) > 0.0
+
+
+def masked_expr(profile: Profile) -> linopy.LinearExpression:
+    """An expression carrying absence (§4) — ``expr`` masked by ``cond``."""
+    return expr(profile).where(cond(profile))
+
+
+def grouped_expr(profile: Profile) -> linopy.LinearExpression:
+    """An expression with a coarse ``g`` group coord on the last dim (8 groups)."""
+    last, n = profile.dims[-1], profile.shape[-1]
+    g = xr.DataArray(
+        np.arange(n) * 8 // n,
+        dims=[last],
+        coords={last: pd.RangeIndex(n, name=last)},
+    )
+    return expr(profile).assign_coords(g=g)
+
+
 # --- op registry ------------------------------------------------------------
 
 
@@ -115,7 +135,6 @@ class OpSpec:
     group: str
     setup: Callable[[Profile], tuple]
     op: Callable[..., object]
-    profiles: tuple[str, ...] = ("small", "large")
 
 
 OP_REGISTRY: dict[str, OpSpec] = {}
@@ -126,17 +145,15 @@ def register_op(
     group: str,
     setup: Callable[[Profile], tuple],
     op: Callable[..., object],
-    profiles: tuple[str, ...] = ("small", "large"),
 ) -> None:
     if name in OP_REGISTRY:
         raise ValueError(f"op {name!r} already registered")
-    OP_REGISTRY[name] = OpSpec(name, group, setup, op, profiles)
+    OP_REGISTRY[name] = OpSpec(name, group, setup, op)
 
 
-def iter_op_params() -> list[tuple[OpSpec, Profile]]:
-    """``(op, profile)`` pairs — the pytest parametrize source."""
-    by_key = {p.key: p for p in PROFILES}
-    return [(op, by_key[key]) for op in OP_REGISTRY.values() for key in op.profiles]
+def iter_ops() -> list[OpSpec]:
+    """Every registered op — the pytest parametrize source."""
+    return list(OP_REGISTRY.values())
 
 
 # --- the operations ---------------------------------------------------------
@@ -245,7 +262,7 @@ def iter_op_params() -> list[tuple[OpSpec, Profile]]:
     lambda e, a: a * e,
 )
 
-# reductions (sum over d0, present in both profiles)
+# reductions
 register_op("var_sum_dim", "reduce", lambda p: (var(p),), lambda x: x.sum("d0"))
 register_op("expr_sum_dim", "reduce", lambda p: (expr(p),), lambda e: e.sum("d0"))
 register_op("expr_sum_all", "reduce", lambda p: (expr(p),), lambda e: e.sum())
@@ -258,3 +275,29 @@ def iter_op_params() -> list[tuple[OpSpec, Profile]]:
 register_op(
     "con_eq_expr", "constraint", lambda p: (expr(p), expr(p)), lambda a, b: a == b
 )
+
+# absence / masking (§4–§7)
+register_op("expr_where", "mask", lambda p: (expr(p), cond(p)), lambda e, c: e.where(c))
+register_op("expr_fillna", "mask", lambda p: (masked_expr(p),), lambda e: e.fillna(0.0))
+register_op(
+    "expr_add_masked",
+    "mask",
+    lambda p: (expr(p), masked_expr(p)),
+    lambda a, b: a + b,
+)
+
+# groupby
+register_op(
+    "expr_groupby_sum",
+    "groupby",
+    lambda p: (grouped_expr(p),),
+    lambda e: e.groupby("g").sum(),
+)
+
+# N-way assembly (constraint building sums many terms)
+register_op(
+    "merge_sum",
+    "merge",
+    lambda p: tuple(expr(p) for _ in range(8)),
+    lambda *es: sum(es[1:], es[0]),
+)