Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions pertpy/tools/_perturbation_space/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,16 @@ def compute(
ps_adata.X = ps_adata.layers[mode]

missing_cols = [col for col in original_obs.columns if col not in ps_adata.obs.columns]
new_cols_data = {}

for col in missing_cols:
grouped_values = original_obs.groupby(grouping_cols, observed=False)[col].first()
new_cols_data[col] = grouped_values.reindex(ps_adata.obs.index).values

if new_cols_data:
ps_adata.obs = pd.concat([ps_adata.obs, pd.DataFrame(new_cols_data, index=ps_adata.obs.index)], axis=1)
if missing_cols:
grouped = original_obs.groupby(grouping_cols, observed=False)[missing_cols].first()
if len(grouping_cols) == 1:
index = pd.Index(ps_adata.obs[grouping_cols[0]])
else:
index = pd.MultiIndex.from_frame(ps_adata.obs[grouping_cols])
grouped = grouped.reindex(index)
grouped.index = ps_adata.obs.index
ps_adata.obs = pd.concat([ps_adata.obs, grouped], axis=1)

ps_adata.obs[target_col] = ps_adata.obs[target_col].astype("category")

Expand Down
38 changes: 38 additions & 0 deletions tests/tools/_perturbation_space/test_simple_perturbation_space.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,44 @@ def test_pseudobulk_response(adata_simple):
)


def test_pseudobulk_preserves_extra_obs_with_and_without_groups_col(rng):
    """Extra obs columns must survive pseudobulking, with and without `groups_col`.

    Regression test for https://github.com/scverse/pertpy/issues/1003.

    When `groups_col` is given, the obs index of the pseudobulk output is a joined
    label (e.g. "P0_C0"). Extra obs columns therefore have to be looked up by the
    grouping columns, not by that joined index; otherwise every extra column ends
    up all-NaN and downstream tools (e.g. PyDESeq2) fail.
    """
    n_cells = 200
    patients = [f"P{i}" for i in range(4)]
    clusters = [f"C{i}" for i in range(2)]
    efficacy_per_patient = {"P0": "SD", "P1": "PR", "P2": "PD", "P3": "SD"}

    # Random draws happen in a fixed order: patients, clusters, then counts.
    patient_choice = rng.choice(patients, size=n_cells)
    cluster_choice = rng.choice(clusters, size=n_cells)
    # "Efficacy" is a pure function of the patient, so it is constant within
    # every pseudobulk group and can be checked row by row below.
    efficacy_per_cell = [efficacy_per_patient[p] for p in patient_choice]

    obs = pd.DataFrame(
        {
            "Patient": pd.Categorical(patient_choice, categories=patients),
            "Cluster": pd.Categorical(cluster_choice, categories=clusters),
            "Efficacy": pd.Categorical(efficacy_per_cell, categories=["SD", "PR", "PD"]),
        }
    )
    counts = rng.poisson(5, size=(n_cells, 10)).astype(float)
    adata = AnnData(X=counts, obs=obs)

    ps = pt.tl.PseudobulkSpace()
    pdata = ps.compute(adata, target_col="Patient", groups_col="Cluster", mode="sum")
    pdata2 = ps.compute(adata, target_col="Patient", mode="sum")

    # Same expectations for the grouped (pdata) and ungrouped (pdata2) result.
    for pseudobulk in (pdata, pdata2):
        assert pseudobulk.obs["Efficacy"].isna().sum() == 0
        for row in pseudobulk.obs.itertuples():
            assert row.Efficacy == efficacy_per_patient[row.Patient]


def test_centroid_umap_response():
X = np.zeros((10, 5))

Expand Down
Loading