diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 80f9076e..2981c9dd 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -19,6 +19,7 @@ Upcoming Version *Other* +* Default internal integer labels to ``int32`` (configurable via ``linopy.options["label_dtype"]``, set to ``np.int64`` for the old behavior), cutting memory ~25% and speeding up model build 10-35%. Raises ``ValueError`` if labels exceed the int32 maximum. * ``add_variables(binary=True, ...)`` now accepts ``lower``/``upper`` bounds, as long as they are 0 or 1. Previously binary bounds could only be set via the ``.lower``/``.upper`` setters after creation. (https://github.com/PyPSA/linopy/issues/776) * ``add_piecewise_formulation`` gained an ``active_fill`` parameter that gates a partial ``active`` (defined over a subset of the indexed dimension, or masked) as always-active (``1``) or always-off (``0``); without it, a partial ``active`` — which was previously zeroed silently — now raises. Useful when one formulation mixes gated and ungated entities (e.g. committable and non-committable units sharing a ``status``). ``active_fill`` is transitional and will be removed once v1 semantics make ``active.reindex(coords).fillna(value)`` sufficient. (https://github.com/PyPSA/linopy/issues/796) diff --git a/linopy/common.py b/linopy/common.py index 9ee9777d..2c45c999 100644 --- a/linopy/common.py +++ b/linopy/common.py @@ -8,7 +8,6 @@ from __future__ import annotations import operator -import os from collections.abc import Callable, Generator, Hashable, Iterable, Sequence from functools import cached_property, reduce, wraps from pathlib import Path @@ -159,12 +158,10 @@ def infer_schema_polars(ds: Dataset) -> dict[str, DataTypeClass]: dict: A dictionary mapping column names to their corresponding Polars data types. """ schema: dict[str, DataTypeClass] = {} - np_major_version = int(np.__version__.split(".")[0]) - use_int32 = os.name == "nt" and np_major_version < 2 for name, array in ds.items(): name = str(name) if np.issubdtype(array.dtype, np.integer): - schema[name] = pl.Int32 if use_int32 else pl.Int64 + schema[name] = pl.Int32 if array.dtype.itemsize <= 4 else pl.Int64 elif np.issubdtype(array.dtype, np.floating): schema[name] = pl.Float64 elif np.issubdtype(array.dtype, np.bool_): @@ -308,7 +305,7 @@ def save_join(*dataarrays: DataArray, integer_dtype: bool = False) -> Dataset: ) arrs = xr_align(*dataarrays, join="outer") if integer_dtype: - arrs = tuple([ds.fillna(-1).astype(int) for ds in arrs]) + arrs = tuple([ds.fillna(-1).astype(options["label_dtype"]) for ds in arrs]) return Dataset({ds.name: ds for ds in arrs}) diff --git a/linopy/config.py b/linopy/config.py index 5d269c4e..6a28d43f 100644 --- a/linopy/config.py +++ b/linopy/config.py @@ -9,6 +9,10 @@ from typing import Any +import numpy as np + +_VALID_LABEL_DTYPES = {np.int32, np.int64} + class OptionSettings: """Runtime configuration knobs (e.g. display widths). Use as a context manager or set values directly via ``options(key=value)``.""" @@ -30,6 +34,10 @@ def set_value(self, **kwargs: Any) -> None: for k, v in kwargs.items(): if k not in self._defaults: raise KeyError(f"{k} is not a valid setting.") + if k == "label_dtype" and v not in _VALID_LABEL_DTYPES: + raise ValueError( + f"label_dtype must be one of {_VALID_LABEL_DTYPES}, got {v}" + ) self._current_values[k] = v def get_value(self, name: str) -> Any: @@ -62,4 +70,5 @@ def __repr__(self) -> str: options = OptionSettings( display_max_rows=14, display_max_terms=6, + label_dtype=np.int32, ) diff --git a/linopy/constraints.py b/linopy/constraints.py index 0b9dbb0a..45b2fa34 100644 --- a/linopy/constraints.py +++ b/linopy/constraints.py @@ -748,7 +748,7 @@ def _to_dataset(self, nterm: int) -> Dataset: # Map active row i -> flat position in full shape via con_labels active_positions = self.active_positions coeffs_2d = np.zeros((full_size, nterm), dtype=csr.dtype) - vars_2d = np.full((full_size, nterm), -1, dtype=np.int64) + vars_2d = np.full((full_size, nterm), -1, dtype=options["label_dtype"]) if csr.nnz > 0: row_indices = np.repeat(active_positions, counts) term_cols = np.arange(csr.nnz) - np.repeat(csr.indptr[:-1], counts) @@ -772,7 +772,7 @@ def _to_dataset(self, nterm: int) -> Dataset: ) ds = Dataset({"coeffs": coeffs_da, "vars": vars_da}) if self._cindex is not None: - labels_flat = np.full(full_size, -1, dtype=np.int64) + labels_flat = np.full(full_size, -1, dtype=options["label_dtype"]) labels_flat[active_positions] = self._con_labels ds = assign_multiindex_safe( ds, @@ -2181,7 +2181,10 @@ def flat(self) -> pd.DataFrame: return pd.DataFrame(columns=["coeffs", "vars", "labels", "key"]) df = pd.concat(dfs, ignore_index=True) unique_labels = df.labels.unique() - map_labels = pd.Series(np.arange(len(unique_labels)), index=unique_labels) + map_labels = pd.Series( + np.arange(len(unique_labels), dtype=options["label_dtype"]), + index=unique_labels, + ) df["key"] = df.labels.map(map_labels) return df diff --git a/linopy/expressions.py b/linopy/expressions.py index ea8588d2..c59f05d7 100644 --- a/linopy/expressions.py +++ b/linopy/expressions.py @@ -451,7 +451,9 @@ def __init__(self, data: Dataset | Any | None, model: Model) -> None: ) if np.issubdtype(data.vars, np.floating): - data = assign_multiindex_safe(data, vars=data.vars.fillna(-1).astype(int)) + data = assign_multiindex_safe( + data, vars=data.vars.fillna(-1).astype(options["label_dtype"]) + ) if not np.issubdtype(data.coeffs, np.floating): data["coeffs"].values = data.coeffs.values.astype(float) @@ -1535,7 +1537,7 @@ def sanitize(self) -> Self: linopy.LinearExpression """ if not np.issubdtype(self.vars.dtype, np.integer): - return self.assign(vars=self.vars.fillna(-1).astype(int)) + return self.assign(vars=self.vars.fillna(-1).astype(options["label_dtype"])) return self @@ -1939,12 +1941,12 @@ def _simplify_row(vars_row: np.ndarray, coeffs_row: np.ndarray) -> np.ndarray: # Combined has dimensions (.., CV_DIM, TERM_DIM) # Drop terms where all vars are -1 (i.e., empty terms across all coordinates) - vars = combined.isel({CV_DIM: 0}).astype(int) + vars = combined.isel({CV_DIM: 0}).astype(options["label_dtype"]) non_empty_terms = (vars != -1).any(dim=[d for d in vars.dims if d != TERM_DIM]) combined = combined.isel({TERM_DIM: non_empty_terms}) # Extract vars and coeffs from the combined result - vars = combined.isel({CV_DIM: 0}).astype(int) + vars = combined.isel({CV_DIM: 0}).astype(options["label_dtype"]) coeffs = combined.isel({CV_DIM: 1}) # Create new dataset with simplified data diff --git a/linopy/model.py b/linopy/model.py index de5c089f..b1477b27 100644 --- a/linopy/model.py +++ b/linopy/model.py @@ -35,6 +35,7 @@ replace_by_map, to_path, ) +from linopy.config import options from linopy.constants import ( GREATER_EQUAL, HELPER_DIMS, @@ -824,7 +825,15 @@ def add_variables( start = self._xCounter end = start + data.labels.size - data.labels.values = np.arange(start, end).reshape(data.labels.shape) + label_dtype = options["label_dtype"] + if end > np.iinfo(label_dtype).max: + raise ValueError( + f"Number of labels ({end}) exceeds the maximum value for " + f"{label_dtype.__name__} ({np.iinfo(label_dtype).max})." + ) + data.labels.values = np.arange( + start, end, dtype=options["label_dtype"] + ).reshape(data.labels.shape) self._xCounter += data.labels.size if mask is not None: @@ -969,7 +978,15 @@ def _allocate_constraint_labels( """Assign label ranges from the constraint counter and apply an optional mask.""" start = self._cCounter end = start + data.labels.size - data.labels.values = np.arange(start, end).reshape(data.labels.shape) + label_dtype = options["label_dtype"] + if end > np.iinfo(label_dtype).max: + raise ValueError( + f"Number of labels ({end}) exceeds the maximum value for " + f"{label_dtype.__name__} ({np.iinfo(label_dtype).max})." + ) + data.labels.values = np.arange(start, end, dtype=label_dtype).reshape( + data.labels.shape + ) self._cCounter += data.labels.size if mask is not None: data.labels.values = np.where(mask.values, data.labels.values, -1) diff --git a/linopy/variables.py b/linopy/variables.py index 0eed704d..0e87aec1 100644 --- a/linopy/variables.py +++ b/linopy/variables.py @@ -1272,7 +1272,9 @@ def ffill(self, dim: str, limit: None = None) -> Variable: .map(DataArray.ffill, dim=dim, limit=limit) .fillna(self._fill_value) ) - return self.assign_multiindex_safe(labels=data.labels.astype(int)) + return self.assign_multiindex_safe( + labels=data.labels.astype(options["label_dtype"]) + ) def bfill(self, dim: str, limit: None = None) -> Variable: """ @@ -1299,7 +1301,7 @@ def bfill(self, dim: str, limit: None = None) -> Variable: .map(DataArray.bfill, dim=dim, limit=limit) .fillna(self._fill_value) ) - return self.assign(labels=data.labels.astype(int)) + return self.assign(labels=data.labels.astype(options["label_dtype"])) def sanitize(self) -> Variable: """ @@ -1310,7 +1312,9 @@ def sanitize(self) -> Variable: linopy.Variable """ if issubdtype(self.labels.dtype, floating): - return self.assign(labels=self.labels.fillna(-1).astype(int)) + return self.assign( + labels=self.labels.fillna(-1).astype(options["label_dtype"]) + ) return self def equals(self, other: Variable) -> bool: @@ -2032,7 +2036,10 @@ def flat(self) -> pd.DataFrame: """ df = pd.concat([self[k].flat for k in self], ignore_index=True) unique_labels = df.labels.unique() - map_labels = pd.Series(np.arange(len(unique_labels)), index=unique_labels) + map_labels = pd.Series( + np.arange(len(unique_labels), dtype=options["label_dtype"]), + index=unique_labels, + ) df["key"] = df.labels.map(map_labels) return df diff --git a/test/test_dtypes.py b/test/test_dtypes.py new file mode 100644 index 00000000..b30c7eac --- /dev/null +++ b/test/test_dtypes.py @@ -0,0 +1,75 @@ +"""Tests for int32 default label dtype.""" + +import numpy as np +import pytest + +from linopy import Model +from linopy.config import options + + +def test_default_label_dtype_is_int32() -> None: + assert options["label_dtype"] == np.int32 + + +def test_variable_labels_are_int32() -> None: + m = Model() + x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x") + assert x.labels.dtype == np.int32 + + +def test_constraint_labels_are_int32() -> None: + m = Model() + x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x") + m.add_constraints(x >= 1, name="c") + assert m.constraints["c"].labels.dtype == np.int32 + + +def test_expression_vars_are_int32() -> None: + m = Model() + x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x") + expr = 2 * x + 1 + assert expr.vars.dtype == np.int32 + + +@pytest.mark.skipif( + not pytest.importorskip("highspy", reason="highspy not installed"), + reason="highspy not installed", +) +def test_solve_with_int32_labels() -> None: + m = Model() + x = m.add_variables(lower=0, upper=10, name="x") + y = m.add_variables(lower=0, upper=10, name="y") + m.add_constraints(x + y <= 15, name="c1") + m.add_objective(x + 2 * y, sense="max") + m.solve("highs") + assert m.objective.value == pytest.approx(25.0) + + +def test_overflow_guard_variables() -> None: + m = Model() + m._xCounter = np.iinfo(np.int32).max - 1 + with pytest.raises(ValueError, match="exceeds the maximum"): + m.add_variables(lower=0, upper=1, coords=[range(5)], name="x") + + +def test_overflow_guard_constraints() -> None: + m = Model() + x = m.add_variables(lower=0, upper=1, coords=[range(5)], name="x") + m._cCounter = np.iinfo(np.int32).max - 1 + with pytest.raises(ValueError, match="exceeds the maximum"): + m.add_constraints(x >= 0, name="c") + + +def test_label_dtype_option_int64() -> None: + with options: + options["label_dtype"] = np.int64 + m = Model() + x = m.add_variables(lower=0, upper=10, coords=[range(5)], name="x") + assert x.labels.dtype == np.int64 + expr = 2 * x + 1 + assert expr.vars.dtype == np.int64 + + +def test_label_dtype_rejects_invalid() -> None: + with pytest.raises(ValueError, match="label_dtype must be one of"): + options["label_dtype"] = np.float64