From 3a607626fbd0aaea5d6c41ba01fad7e56a604b4a Mon Sep 17 00:00:00 2001 From: Aman Srivastava Date: Sat, 4 Jul 2026 22:02:58 +0530 Subject: [PATCH] add DR and plug-in T scoring for CATE evaluation --- causalml/metrics/__init__.py | 6 + causalml/metrics/cate_scoring.py | 473 +++++++++++++++++++++++++++++++ docs/methodology.rst | 51 ++++ docs/refs.bib | 7 + tests/test_cate_scoring.py | 256 +++++++++++++++++ 5 files changed, 793 insertions(+) create mode 100644 causalml/metrics/cate_scoring.py create mode 100644 tests/test_cate_scoring.py diff --git a/causalml/metrics/__init__.py b/causalml/metrics/__init__.py index e382bc43..8de2e025 100644 --- a/causalml/metrics/__init__.py +++ b/causalml/metrics/__init__.py @@ -26,6 +26,12 @@ ) # noqa from .visualize import auuc_score, qini_score # noqa from .rate import get_toc, rate_score, plot_toc # noqa +from .cate_scoring import ( + compute_dr_pseudo_outcomes, + dr_score, + plug_in_t_score, +) # noqa +from .sensitivity import Sensitivity, SensitivityPlaceboTreatment # noqa from .sensitivity import Sensitivity, SensitivityPlaceboTreatment # noqa from .sensitivity import ( SensitivityRandomCause, diff --git a/causalml/metrics/cate_scoring.py b/causalml/metrics/cate_scoring.py new file mode 100644 index 00000000..ffde24d3 --- /dev/null +++ b/causalml/metrics/cate_scoring.py @@ -0,0 +1,473 @@ +from copy import deepcopy +import logging + +import numpy as np +import pandas as pd +from scipy import stats +from sklearn.model_selection import KFold + +from causalml.propensity import compute_propensity_score + +logger = logging.getLogger("causalml") + + +def _resolve_outcome_learners( + learner, control_outcome_learner, treatment_outcome_learner +): + """Resolve control/treatment outcome learners from constructor-style args. + + Mirrors the ``learner`` / ``control_*_learner`` / ``treatment_*_learner`` + convention used throughout ``causalml.inference.meta`` (e.g. ``BaseDRLearner``, + ``BaseTLearner``). + + Args: + learner (model, optional): a model used for both control and treatment + outcome regressions if the group-specific learners are not given + control_outcome_learner (model, optional): a model to estimate outcomes + in the control group + treatment_outcome_learner (model, optional): a model to estimate outcomes + in the treatment group + + Returns: + (tuple): the resolved (control_outcome_learner, treatment_outcome_learner) + """ + if (learner is None) and ( + (control_outcome_learner is None) or (treatment_outcome_learner is None) + ): + raise ValueError( + "Either `learner` or both `control_outcome_learner` and " + "`treatment_outcome_learner` must be specified." + ) + _control_outcome_learner = ( + control_outcome_learner + if control_outcome_learner is not None + else deepcopy(learner) + ) + _treatment_outcome_learner = ( + treatment_outcome_learner + if treatment_outcome_learner is not None + else deepcopy(learner) + ) + return _control_outcome_learner, _treatment_outcome_learner + + +def compute_dr_pseudo_outcomes( + X, + treatment, + y, + p=None, + learner=None, + control_outcome_learner=None, + treatment_outcome_learner=None, + n_folds=5, + p_clip_bounds=(0.02, 0.98), + random_state=None, +): + """Construct cross-fitted doubly-robust (AIPW) pseudo-outcomes for CATE evaluation. + + For each unit i, the pseudo-outcome is + + phi_i = (w_i - e(X_i)) / (e(X_i) * (1 - e(X_i))) * (y_i - mu_w(X_i)) + mu_1(X_i) - mu_0(X_i) + + where ``e`` is the propensity score and ``mu_0``/``mu_1`` are the control/treatment + outcome regressions. Under either correct propensity or correct outcome-model + specification, ``E[phi_i | X_i]`` is an unbiased estimate of the true CATE + ``tau(X_i)`` (Kennedy, 2023), which is why ``phi`` can stand in for the unobserved + ground-truth treatment effect when scoring fitted CATE models. + + Nuisance models (propensity and outcome regressions) are cross-fitted with + ``n_folds``-fold splitting so that ``phi_i`` is always constructed from models + that did not see unit i during training. This is the same doubly-robust + formula used internally by ``BaseDRLearner.fit()``. + + This is a standalone helper so the pseudo-outcomes can be computed once and + reused across multiple scoring calls -- e.g. passed to ``dr_score()`` directly, + or to ``rate_score(..., treatment_effect_col=...)`` for RATE on observational + data -- without re-fitting nuisance models for each. + + Args: + X (numpy.ndarray or pandas.DataFrame): a feature matrix + treatment (numpy.ndarray or pandas.Series): a binary treatment indicator (0 or 1) + y (numpy.ndarray or pandas.Series): an outcome vector + p (numpy.ndarray or pandas.Series, optional): propensity scores. If None, + they are estimated in-fold via ``causalml.propensity.compute_propensity_score`` + (``ElasticNetPropensityModel`` by default) + learner (model, optional): a model used for both control and treatment outcome + regressions if the group-specific learners below are not given + control_outcome_learner (model, optional): a model to estimate outcomes + in the control group + treatment_outcome_learner (model, optional): a model to estimate outcomes + in the treatment group + n_folds (int, optional): number of cross-fitting folds. Default 5. + p_clip_bounds (tuple, optional): lower and upper bounds for clipping + propensity scores before they're used as AIPW weights. The default + ``ElasticNetPropensityModel`` clips to ``(1e-3, 1 - 1e-3)`` internally + for numerical stability of the model itself, but that's too permissive + once the score is *inverted* here: a handful of near-boundary, + cross-fitted propensities (e.g. 0.001, arising from isotonic + calibration on a single fold) can produce AIPW weights in the + hundreds and dominate the mean. Tighter trimming bounds the + variance at the cost of some bias for units with extreme propensity; + (0.02, 0.98) is a reasonable default for that trade-off. Default + (0.02, 0.98). + random_state (int or None, optional): random seed for the fold splitter. + Default None. + + Returns: + (numpy.ndarray): the cross-fitted DR pseudo-outcomes, one per row of ``X`` + """ + _control_outcome_learner, _treatment_outcome_learner = _resolve_outcome_learners( + learner, control_outcome_learner, treatment_outcome_learner + ) + + X = np.asarray(X) + treatment = np.asarray(treatment) + y = np.asarray(y, dtype=float) + p = np.asarray(p, dtype=float) if p is not None else None + + n = X.shape[0] + phi = np.empty(n, dtype=float) + + cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state) + for train_idx, test_idx in cv.split(X): + w_train = treatment[train_idx] + w_test = treatment[test_idx] + + mu_c = deepcopy(_control_outcome_learner) + mu_c.fit(X[train_idx][w_train == 0], y[train_idx][w_train == 0]) + mu_t = deepcopy(_treatment_outcome_learner) + mu_t.fit(X[train_idx][w_train == 1], y[train_idx][w_train == 1]) + + mu_c_pred = mu_c.predict(X[test_idx]) + mu_t_pred = mu_t.predict(X[test_idx]) + + if p is None: + p_test, _ = compute_propensity_score( + X=X[train_idx], + treatment=w_train, + X_pred=X[test_idx], + treatment_pred=w_test, + ) + else: + p_test = p[test_idx] + # Trim regardless of source (estimated or user-supplied): see the + # p_clip_bounds docstring above for why this is tighter than the + # propensity model's own internal clipping. + p_test = np.clip(p_test, p_clip_bounds[0], p_clip_bounds[1]) + + y_test = y[test_idx] + mu_w_pred = np.where(w_test == 1, mu_t_pred, mu_c_pred) + phi[test_idx] = ( + (w_test - p_test) / (p_test * (1 - p_test)) * (y_test - mu_w_pred) + + mu_t_pred + - mu_c_pred + ) + + return phi + + +def _score_against_pseudo_outcome( + df, + pseudo_outcome, + model_cols, + score_name, + return_ci, + n_bootstrap, + alpha, + random_state, +): + """Shared MSE-against-pseudo-outcome scorer with half-sample bootstrap CIs. + + This mirrors the half-sample bootstrap CI/p-value pattern in ``rate.py``'s + ``rate_score()``, but resamples the already-computed (model prediction, + pseudo-outcome) pairs rather than re-fitting any nuisance models -- nuisance + fitting happens once, upstream, in ``compute_dr_pseudo_outcomes()`` or the + plug-in T-learner cross-fit. + + Lower scores are better: this is a loss (mean squared error against a CATE + proxy), not a similarity score. + + Args: + df (pandas.DataFrame): a data frame with model CATE estimates as columns + pseudo_outcome (numpy.ndarray): the CATE proxy to score models against, + one value per row of ``df`` + model_cols (list): the columns of ``df`` holding model CATE estimates + score_name (str): name used for the returned Series/column (e.g. ``"dr_loss"``) + return_ci (bool): whether to return bootstrap confidence intervals and p-values + n_bootstrap (int): number of half-sample bootstrap iterations + alpha (float): significance level for confidence intervals + random_state (int or None): random seed for the bootstrap sampler + + Returns: + If return_ci=False: (pandas.Series): loss for each model column + If return_ci=True: (pandas.DataFrame): loss, standard error, CI bounds, + and p-value (H0: loss = 0) for each model column + """ + sq_err = (df[model_cols].sub(pseudo_outcome, axis=0)) ** 2 + loss = sq_err.mean(axis=0) + loss.name = score_name + + if not return_ci: + return loss + + n = len(df) + m = n // 2 + rng = np.random.default_rng(random_state) + boot_losses = {model: [] for model in model_cols} + for _ in range(n_bootstrap): + idx = rng.choice(n, size=m, replace=False) + boot_loss = sq_err.iloc[idx].mean(axis=0) + for model in model_cols: + boot_losses[model].append(boot_loss[model]) + + z_crit = stats.norm.ppf(1 - alpha / 2) + results = [] + for model in model_cols: + point = loss[model] + boot = np.array(boot_losses[model]) + se = np.std(boot, ddof=1) + ci_lower = point - z_crit * se + ci_upper = point + z_crit * se + z_stat = point / se if se > 0 else np.inf + p_value = 2 * (1 - stats.norm.cdf(abs(z_stat))) + results.append( + { + "model": model, + score_name: point, + "se": se, + "ci_lower": ci_lower, + "ci_upper": ci_upper, + "p_value": p_value, + } + ) + + return pd.DataFrame(results).set_index("model") + + +def dr_score( + df, + X=None, + treatment_col="w", + outcome_col="y", + pseudo_outcome_col=None, + p=None, + learner=None, + control_outcome_learner=None, + treatment_outcome_learner=None, + n_folds=5, + p_clip_bounds=(0.02, 0.98), + return_ci=False, + n_bootstrap=200, + alpha=0.05, + random_state=None, +): + """Score fitted CATE models via the doubly-robust (DR) pseudo-outcome loss. + + Following Kennedy (2023), this constructs cross-fitted AIPW pseudo-outcomes + ``phi`` (see ``compute_dr_pseudo_outcomes()``) and scores each candidate CATE + model by its mean squared error against ``phi``: + + DR loss(tau_hat) = mean((tau_hat(X) - phi) ** 2) + + Lower is better. Unlike held-out outcome MSE, this measures accuracy of the + *treatment effect* estimate rather than the outcome level, without requiring + access to counterfactual outcomes. Mahajan et al. (2024) found DR-based + metrics dominate across 78 benchmark datasets for CATE model selection. + + Pseudo-outcomes can either be supplied directly (via ``pseudo_outcome_col``, + e.g. computed once with ``compute_dr_pseudo_outcomes()`` and reused across + multiple scoring calls or shared with ``rate_score()``) or computed internally + from ``X``, ``treatment_col``, and ``outcome_col``. + + Args: + df (pandas.DataFrame): a data frame with fitted CATE model estimates as + columns, plus either ``pseudo_outcome_col`` or both ``outcome_col`` + and ``treatment_col`` + X (numpy.ndarray or pandas.DataFrame, optional): feature matrix used to + fit the DR nuisance models. Required unless ``pseudo_outcome_col`` is + already present in ``df`` + treatment_col (str, optional): the column name for the treatment + indicator (0 or 1). Ignored if ``pseudo_outcome_col`` is provided + outcome_col (str, optional): the column name for the actual outcome. + Ignored if ``pseudo_outcome_col`` is provided + pseudo_outcome_col (str, optional): the column name of pre-computed DR + pseudo-outcomes (e.g. from ``compute_dr_pseudo_outcomes()``). If given + and present in ``df``, nuisance models are not re-fit + p (numpy.ndarray or pandas.Series, optional): propensity scores. Only + used when pseudo-outcomes are computed internally + learner (model, optional): a model for both control and treatment outcome + regressions if the group-specific learners below are not given. + Required unless ``pseudo_outcome_col`` is provided + control_outcome_learner (model, optional): a model to estimate outcomes + in the control group + treatment_outcome_learner (model, optional): a model to estimate outcomes + in the treatment group + n_folds (int, optional): number of cross-fitting folds for nuisance + estimation. Default 5 + p_clip_bounds (tuple, optional): bounds for clipping propensity scores + used as AIPW weights when pseudo-outcomes are computed internally. + See ``compute_dr_pseudo_outcomes()`` for why this is tighter than a + propensity model's own internal clipping. Ignored if + ``pseudo_outcome_col`` is provided. Default (0.02, 0.98) + return_ci (bool, optional): whether to return bootstrap confidence + intervals and p-values. Default False + n_bootstrap (int, optional): number of half-sample bootstrap iterations. + Only used when return_ci=True. Default 200 + alpha (float, optional): significance level for confidence intervals. + Only used when return_ci=True. Default 0.05 + random_state (int or None, optional): random seed for cross-fitting and + the bootstrap sampler. Default None + + Returns: + If return_ci=False: + (pandas.Series): DR loss for each model column (lower is better) + If return_ci=True: + (pandas.DataFrame): DR loss, standard error, CI bounds, and p-value + (H0: loss = 0) for each model column + """ + have_pseudo_outcome = ( + pseudo_outcome_col is not None and pseudo_outcome_col in df.columns + ) + assert have_pseudo_outcome or (X is not None), ( + "Either `pseudo_outcome_col` (present in df) or `X` " + "(to compute pseudo-outcomes internally) must be provided." + ) + + model_cols = [ + c + for c in df.columns + if c not in (outcome_col, treatment_col, pseudo_outcome_col) + ] + + if have_pseudo_outcome: + pseudo_outcome = df[pseudo_outcome_col].to_numpy() + else: + assert ( + outcome_col in df.columns and treatment_col in df.columns + ), "{} and {} must be present in df to compute DR pseudo-outcomes.".format( + outcome_col, treatment_col + ) + pseudo_outcome = compute_dr_pseudo_outcomes( + X=X, + treatment=df[treatment_col], + y=df[outcome_col], + p=p, + learner=learner, + control_outcome_learner=control_outcome_learner, + treatment_outcome_learner=treatment_outcome_learner, + n_folds=n_folds, + p_clip_bounds=p_clip_bounds, + random_state=random_state, + ) + + return _score_against_pseudo_outcome( + df=df, + pseudo_outcome=pseudo_outcome, + model_cols=model_cols, + score_name="dr_loss", + return_ci=return_ci, + n_bootstrap=n_bootstrap, + alpha=alpha, + random_state=random_state, + ) + + +def plug_in_t_score( + df, + X, + treatment_col="w", + outcome_col="y", + learner=None, + control_outcome_learner=None, + treatment_outcome_learner=None, + n_folds=5, + return_ci=False, + n_bootstrap=200, + alpha=0.05, + random_state=None, +): + """Score fitted CATE models against a cross-fitted plug-in T-learner proxy. + + Fits a simple T-learner -- separate control/treatment outcome regressions -- + with ``n_folds``-fold cross-fitting, and uses ``mu_1(X) - mu_0(X)`` on each + held-out fold as a proxy for the true CATE. Candidate models are then scored + by mean squared error against this proxy: + + T-loss(tau_hat) = mean((tau_hat(X) - (mu_1(X) - mu_0(X))) ** 2) + + Lower is better. This is a simpler baseline than ``dr_score()`` -- it isn't + doubly robust and is biased under a misspecified outcome model -- but Mahajan + et al. (2024) found it is never dominated across their benchmark datasets + despite its simplicity, making it a useful complement to DR-based scoring + rather than a replacement. + + Args: + df (pandas.DataFrame): a data frame with fitted CATE model estimates as columns + X (numpy.ndarray or pandas.DataFrame): feature matrix used to fit the + plug-in T-learner nuisance models + treatment_col (str, optional): the column name for the treatment + indicator (0 or 1) + outcome_col (str, optional): the column name for the actual outcome + learner (model, optional): a model for both control and treatment outcome + regressions if the group-specific learners below are not given + control_outcome_learner (model, optional): a model to estimate outcomes + in the control group + treatment_outcome_learner (model, optional): a model to estimate outcomes + in the treatment group + n_folds (int, optional): number of cross-fitting folds. Default 5 + return_ci (bool, optional): whether to return bootstrap confidence + intervals and p-values. Default False + n_bootstrap (int, optional): number of half-sample bootstrap iterations. + Only used when return_ci=True. Default 200 + alpha (float, optional): significance level for confidence intervals. + Only used when return_ci=True. Default 0.05 + random_state (int or None, optional): random seed for cross-fitting and + the bootstrap sampler. Default None + + Returns: + If return_ci=False: + (pandas.Series): plug-in T-learner loss for each model column (lower is better) + If return_ci=True: + (pandas.DataFrame): loss, standard error, CI bounds, and p-value + (H0: loss = 0) for each model column + """ + assert ( + outcome_col in df.columns and treatment_col in df.columns + ), "{} and {} must be present in df.".format(outcome_col, treatment_col) + + _control_outcome_learner, _treatment_outcome_learner = _resolve_outcome_learners( + learner, control_outcome_learner, treatment_outcome_learner + ) + + X_arr = np.asarray(X) + treatment = df[treatment_col].to_numpy() + y = df[outcome_col].to_numpy(dtype=float) + + n = X_arr.shape[0] + tau_proxy = np.empty(n, dtype=float) + + cv = KFold(n_splits=n_folds, shuffle=True, random_state=random_state) + for train_idx, test_idx in cv.split(X_arr): + w_train = treatment[train_idx] + + mu_c = deepcopy(_control_outcome_learner) + mu_c.fit(X_arr[train_idx][w_train == 0], y[train_idx][w_train == 0]) + mu_t = deepcopy(_treatment_outcome_learner) + mu_t.fit(X_arr[train_idx][w_train == 1], y[train_idx][w_train == 1]) + + tau_proxy[test_idx] = mu_t.predict(X_arr[test_idx]) - mu_c.predict( + X_arr[test_idx] + ) + + model_cols = [c for c in df.columns if c not in (outcome_col, treatment_col)] + + return _score_against_pseudo_outcome( + df=df, + pseudo_outcome=tau_proxy, + model_cols=model_cols, + score_name="plug_in_t_loss", + return_ci=return_ci, + n_bootstrap=n_bootstrap, + alpha=alpha, + random_state=random_state, + ) diff --git a/docs/methodology.rst b/docs/methodology.rst index 735e79a2..bab04955 100755 --- a/docs/methodology.rst +++ b/docs/methodology.rst @@ -194,6 +194,57 @@ with :math:`\{Y^3, X^3, W^3\}` Similar to the DR-Learner Repeat Stage 1 and Stage 2 again twice with different permutations of partitions for estimation. The final conditional LATE model is the average of the 3 conditional LATE models. +CATE Model Evaluation +---------------------- + +Cross-validation cannot be used to select among fitted CATE models because the +ground-truth treatment effect is never observed for any unit -- only one of the +two potential outcomes is realized. The methods below are surrogate metrics that +score fitted CATE models using only observed data, so that models such as the +S/T/X/R/DR-learners above can be compared and selected among. + +These metrics evaluate *effect-magnitude accuracy* -- how close a model's CATE +estimate is to the truth -- which is a distinct question from *targeting +quality* -- whether a model correctly ranks units by benefit -- addressed by +:ref:`RATE`. + +DR (Doubly Robust) pseudo-outcome loss +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Following :cite:`kennedy2020optimal`, we construct the same doubly-robust +pseudo-outcome used in the DR-learner above, + +.. math:: + \phi = \frac{W-\hat{e}(X)}{\hat{e}(X)(1-\hat{e}(X))}\left(Y-\hat{m}_W(X)\right)+\hat{m}_1(X)-\hat{m}_0(X) + +with :math:`\hat{e}(x)`, :math:`\hat{m}_0(x)`, and :math:`\hat{m}_1(x)` cross-fit +so that :math:`\phi_i` never depends on a model that saw unit :math:`i` during +training. Under either correct propensity or correct outcome-model +specification, :math:`E[\phi \mid X]` is an unbiased estimate of the true CATE, +which lets :math:`\phi` stand in for the unobserved ground truth when scoring a +fitted CATE model :math:`\hat\tau(x)`: + +.. math:: + \text{DR loss}(\hat\tau) = \frac{1}{n}\sum_i \left(\hat\tau(X_i) - \phi_i\right)^2 + +Lower is better. :cite:`mahajan2024empirical` found DR-based metrics dominate +across 78 benchmark datasets for CATE model selection. + +Plug-in T-learner loss +~~~~~~~~~~~~~~~~~~~~~~ + +A simpler, non-doubly-robust baseline: fit a cross-fit :ref:`T-learner`, and use +its held-out estimate :math:`\hat\mu_1(x) - \hat\mu_0(x)` directly as a proxy +for the true CATE: + +.. math:: + \text{T-loss}(\hat\tau) = \frac{1}{n}\sum_i \left(\hat\tau(X_i) - (\hat\mu_1(X_i) - \hat\mu_0(X_i))\right)^2 + +This is biased under a misspecified outcome model, unlike the DR loss above. +Despite that, :cite:`mahajan2024empirical` found it is never dominated across +their benchmark datasets, making it a useful complement to DR-based scoring +rather than a replacement. + Tree-Based Algorithms --------------------- diff --git a/docs/refs.bib b/docs/refs.bib index 8b0c13d1..096c0d0e 100755 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -468,6 +468,13 @@ @misc{kennedy2020optimal primaryClass={math.ST} } +@inproceedings{mahajan2024empirical, + title={Empirical Analysis of Model Selection for Heterogeneous Causal Effect Estimation}, + author={Mahajan, Divyat and Mitliagkas, Ioannis and Neal, Brady and Syrgkanis, Vasilis}, + booktitle={International Conference on Learning Representations}, + year={2024} +} + @article{10.1111/ectj.12097, author = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James}, title = "{Double/debiased machine learning for treatment and structural parameters}", diff --git a/tests/test_cate_scoring.py b/tests/test_cate_scoring.py new file mode 100644 index 00000000..55a924d0 --- /dev/null +++ b/tests/test_cate_scoring.py @@ -0,0 +1,256 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import LinearRegression + +from causalml.metrics.cate_scoring import ( + compute_dr_pseudo_outcomes, + dr_score, + plug_in_t_score, +) +from causalml.metrics.rate import rate_score + +try: + from tests.const import RANDOM_SEED +except ImportError: + RANDOM_SEED = 42 + + +@pytest.fixture +def synthetic_data(): + rng = np.random.default_rng(RANDOM_SEED) + n, p = 2000, 5 + X = rng.normal(0, 1, (n, p)) + tau = 1.0 + X[:, 0] + e = np.clip(0.5 + 0.1 * X[:, 1], 0.1, 0.9) + w = rng.binomial(1, e) + mu0 = X[:, 2] + y = mu0 + w * tau + rng.normal(0, 0.5, n) + + perfect_model = tau + noisy_model = tau + rng.normal(0, 1.0, n) + bad_model = rng.normal(0, 1, n) + + df = pd.DataFrame( + { + "y": y, + "w": w, + "perfect_model": perfect_model, + "noisy_model": noisy_model, + "bad_model": bad_model, + } + ) + return df, X + + +def test_compute_dr_pseudo_outcomes_shape(synthetic_data): + df, X = synthetic_data + phi = compute_dr_pseudo_outcomes( + X, + df["w"], + df["y"], + learner=LinearRegression(), + n_folds=5, + random_state=RANDOM_SEED, + ) + assert phi.shape == (len(df),) + assert np.isfinite(phi).all() + + +def test_compute_dr_pseudo_outcomes_recovers_ate(synthetic_data): + df, X = synthetic_data + phi = compute_dr_pseudo_outcomes( + X, + df["w"], + df["y"], + learner=LinearRegression(), + n_folds=5, + random_state=RANDOM_SEED, + ) + # With a well-specified outcome model the DR pseudo-outcome mean should + # recover the true ATE. Tolerance is loose (not ~0.05) because AIPW is a + # high-variance functional: a handful of cross-fitted, near-boundary + # propensities inflate the mean even after p_clip_bounds trimming. + true_ate = (1.0 + X[:, 0]).mean() + assert abs(phi.mean() - true_ate) < 0.75 + + +def test_dr_score_returns_series(synthetic_data): + df, X = synthetic_data + scores = dr_score( + df, + X=X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + random_state=RANDOM_SEED, + ) + assert isinstance(scores, pd.Series) + assert set(scores.index) == {"perfect_model", "noisy_model", "bad_model"} + + +def test_dr_score_ranks_perfect_model_lowest(synthetic_data): + df, X = synthetic_data + scores = dr_score( + df, + X=X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + random_state=RANDOM_SEED, + ) + assert scores["perfect_model"] < scores["noisy_model"] < scores["bad_model"] + + +def test_dr_score_with_precomputed_pseudo_outcomes(synthetic_data): + df, X = synthetic_data + phi = compute_dr_pseudo_outcomes( + X, + df["w"], + df["y"], + learner=LinearRegression(), + n_folds=5, + random_state=RANDOM_SEED, + ) + df_with_phi = df.assign(dr_pseudo_outcome=phi) + scores = dr_score( + df_with_phi, pseudo_outcome_col="dr_pseudo_outcome", random_state=RANDOM_SEED + ) + assert scores["perfect_model"] < scores["bad_model"] + + +def test_dr_score_shares_pseudo_outcomes_with_rate_score(synthetic_data): + # The same pseudo-outcomes should be usable both for dr_score() and as + # rate_score()'s treatment_effect_col, without re-fitting nuisances twice. + df, X = synthetic_data + phi = compute_dr_pseudo_outcomes( + X, + df["w"], + df["y"], + learner=LinearRegression(), + n_folds=5, + random_state=RANDOM_SEED, + ) + df_with_phi = df.assign(tau=phi) + rate = rate_score(df_with_phi.drop(columns=["w"]), treatment_effect_col="tau") + dr = dr_score( + df_with_phi.rename(columns={"tau": "dr_pseudo_outcome"}), + pseudo_outcome_col="dr_pseudo_outcome", + ) + assert rate["perfect_model"] > 0 + assert dr["perfect_model"] < dr["bad_model"] + + +def test_dr_score_missing_X_and_pseudo_outcome_raises(synthetic_data): + df, _ = synthetic_data + with pytest.raises(AssertionError): + dr_score(df, treatment_col="w", outcome_col="y") + + +def test_dr_score_return_ci_returns_dataframe(synthetic_data): + df, X = synthetic_data + result = dr_score( + df, + X=X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + return_ci=True, + n_bootstrap=50, + random_state=RANDOM_SEED, + ) + assert isinstance(result, pd.DataFrame) + assert set(result.columns) == {"dr_loss", "se", "ci_lower", "ci_upper", "p_value"} + + +def test_dr_score_ci_bounds_ordered(synthetic_data): + df, X = synthetic_data + result = dr_score( + df, + X=X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + return_ci=True, + n_bootstrap=50, + random_state=RANDOM_SEED, + ) + assert (result["ci_lower"] < result["dr_loss"]).all() + assert (result["ci_upper"] > result["dr_loss"]).all() + + +def test_plug_in_t_score_returns_series(synthetic_data): + df, X = synthetic_data + scores = plug_in_t_score( + df, + X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + random_state=RANDOM_SEED, + ) + assert isinstance(scores, pd.Series) + assert set(scores.index) == {"perfect_model", "noisy_model", "bad_model"} + + +def test_plug_in_t_score_ranks_perfect_model_lowest(synthetic_data): + df, X = synthetic_data + scores = plug_in_t_score( + df, + X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + random_state=RANDOM_SEED, + ) + assert scores["perfect_model"] < scores["noisy_model"] < scores["bad_model"] + + +def test_plug_in_t_score_return_ci_returns_dataframe(synthetic_data): + df, X = synthetic_data + result = plug_in_t_score( + df, + X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + return_ci=True, + n_bootstrap=50, + random_state=RANDOM_SEED, + ) + assert isinstance(result, pd.DataFrame) + assert set(result.columns) == { + "plug_in_t_loss", + "se", + "ci_lower", + "ci_upper", + "p_value", + } + + +def test_plug_in_t_score_missing_columns_raises(synthetic_data): + df, X = synthetic_data + with pytest.raises(AssertionError): + plug_in_t_score(df.drop(columns=["w"]), X, treatment_col="w", outcome_col="y") + + +def test_dr_score_and_plug_in_t_score_all_finite(synthetic_data): + df, X = synthetic_data + dr = dr_score( + df, + X=X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + random_state=RANDOM_SEED, + ) + t = plug_in_t_score( + df, + X, + treatment_col="w", + outcome_col="y", + learner=LinearRegression(), + random_state=RANDOM_SEED, + ) + assert np.isfinite(dr.values).all() + assert np.isfinite(t.values).all()