From 3c584bbac0a6847e0e0a49205f270fcc00afe61f Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Wed, 3 Jun 2026 22:21:33 -0400
Subject: [PATCH 01/19] Add separate function for dataset status internal

---
 R/datasets.R | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/R/datasets.R b/R/datasets.R
index f4e24a8..9b9fcf6 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -218,6 +218,27 @@ get_dataset_detail <- function(dataset, auth_token) {
     resp_body_json()
 }
 
+#' Map dataset detail status flags to a status string
+#'
+#' @param detail the dataset detail list returned by [get_dataset_detail()]
+#'
+#' @keywords internal
+#'
+#' @returns a single character string: one of "pending", "processing",
+#'   "succeeded", "failed", or "expired"
+dataset_status_from_detail <- function(detail) {
+  if (isTRUE(detail$is_failed)) {
+    "failed"
+  } else if (isTRUE(detail$is_expired)) {
+    "expired"
+  } else if (isTRUE(detail$is_succeeded)) {
+    "succeeded"
+  } else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) {
+    "processing"
+  } else {
+    "pending"
+  }
+}
 
 #' Get the processing status of a custom dataset
 #'

From ab1e3835f372014b1b476c00b3876d744b5ce90c Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Wed, 3 Jun 2026 22:22:20 -0400
Subject: [PATCH 02/19] use new dataset status

---
 R/datasets.R | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 9b9fcf6..835621e 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -267,22 +267,12 @@ dataset_status_from_detail <- function(detail) {
 #'
 #' @examples
 #' \dontrun{
-#' get_dataset_status(ds)
+#' get_dataset_status(ds_id)
 #' }
 get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) {
   auth_token <- resolve_auth_token(auth_token)
   detail <- get_dataset_detail(dataset, auth_token)
-  if (isTRUE(detail$is_failed)) {
-    "failed"
-  } else if (isTRUE(detail$is_expired)) {
-    "expired"
-  } else if (isTRUE(detail$is_succeeded)) {
-    "succeeded"
-  } else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) {
-    "processing"
-  } else {
-    "pending"
-  }
+  dataset_status_from_detail(detail)
 }
 
 

From b0a136422b2fc525d83074112c60d23d97ddec81 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Wed, 3 Jun 2026 22:28:42 -0400
Subject: [PATCH 03/19] Use only the dataset id for external return values.

---
 R/datasets.R  | 78 +++++++++++++++++++++++++++------------------------
 R/downloads.R |  5 ++--
 2 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 835621e..fe35d3f 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -47,15 +47,16 @@ build_dataset_data <- function(samples = NULL, projects = NULL, include_bulk = F
 
 #' Resolve a dataset identifier to its ID string
 #'
-#' Accepts either a dataset UUID string or a list with an `$id` element (such as
-#' the return value of [create_dataset()] or [get_dataset_detail()]) and returns
-#' the ID string, after checking that it is a valid UUID.
+#' Accepts either a dataset UUID string (such as the value returned by
+#' [create_dataset()]) or a list with an `$id` element (such as the value returned
+#' by [get_dataset_detail()]) and returns the ID string, after checking that it is
+#' a valid UUID.
 #'
 #' @param dataset a dataset UUID string, or a list with an `$id` element
 #'
 #' @keywords internal
 #'
-#' @returns the dataset ID as a length-1 character string
+#' @returns the dataset ID as a character string
 resolve_dataset_id <- function(dataset) {
   if (is.list(dataset)) {
     stopifnot("dataset must be an id string or contain an $id element" = !is.null(dataset$id))
@@ -112,7 +113,9 @@ update_dataset <- function(dataset_id, body, auth_token) {
 #' Create a custom dataset on the ScPCA Portal
 #'
 #' Creates a new user dataset without starting processing.
-#' The returned list includes the dataset `$id` along with its current contents and status.
+#' Returns the new dataset's ID (invisibly), which you can pass to the other
+#' dataset functions such as [get_dataset_info()], [add_dataset_samples()], and
+#' [start_dataset_processing()].
 #'
 #' @param samples optional character vector of ScPCA sample IDs (e.g. "SCPCS000001")
 #' @param projects optional character vector of ScPCA project IDs (e.g. "SCPCP000001");
@@ -125,7 +128,7 @@ update_dataset <- function(dataset_id, body, auth_token) {
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the API response as a list (invisibly), including the dataset `$id`
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @import httr2
 #' @export
@@ -133,11 +136,11 @@ update_dataset <- function(dataset_id, body, auth_token) {
 #' @examples
 #' \dontrun{
 #' token <- get_auth("user@example.com", agree = TRUE)
-#' ds <- create_dataset(
+#' ds_id <- create_dataset(
 #'   auth_token = token,
 #'   samples = c("SCPCS000001", "SCPCS000002")
 #' )
-#' ds$id
+#' ds_id
 #' }
 create_dataset <- function(
   samples = NULL,
@@ -179,7 +182,7 @@ create_dataset <- function(
     resp_body_json()
 
   message(glue::glue("ScPCA dataset {response$id} created."))
-  invisible(response)
+  invisible(response$id)
 }
 
 
@@ -193,8 +196,9 @@ create_dataset <- function(
 #' it is also used by the dataset modification functions to fetch current
 #' contents before updating.
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param auth_token an authorization token obtained from [get_auth()];
 #'  must match the token used to create the dataset.
 #'
@@ -254,8 +258,9 @@ dataset_status_from_detail <- function(detail) {
 #'   expired and must be regenerated
 #' * `"failed"`: processing failed
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element,
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
@@ -294,13 +299,13 @@ get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKE
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly)
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002"))
+#' replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002"))
 #' }
 replace_dataset_data <- function(
   dataset,
@@ -319,8 +324,8 @@ replace_dataset_data <- function(
 
   data <- build_dataset_data(samples = samples, projects = projects, include_bulk = include_bulk)
 
-  response <- update_dataset(dataset_id, list(data = data), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(data = data), auth_token = auth_token)
+  invisible(dataset_id)
 }
 
 
@@ -337,13 +342,13 @@ replace_dataset_data <- function(
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly)
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' set_dataset_email(ds, email = "user@example.com")
+#' set_dataset_email(ds_id, email = "user@example.com")
 #' }
 set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) {
   auth_token <- resolve_auth_token(auth_token)
@@ -354,8 +359,8 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT
   )
   dataset_id <- resolve_dataset_id(dataset)
 
-  response <- update_dataset(dataset_id, list(email = email), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(email = email), auth_token = auth_token)
+  invisible(dataset_id)
 }
 
 
@@ -373,24 +378,23 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT
 #' * A `"processing"` or `"succeeded"` dataset is already underway or done;
 #'   a message is emitted and no request is sent.
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element,
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param email optional email address for the download notification. When
 #'   supplied, it is set as part of the same request that starts processing.
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly) when a request is
-#'   sent, or `NULL` (invisibly) when the dataset is already processing or
-#'   completed.
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @import httr2
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
-#' start_dataset_processing(ds, email = "user@example.com")
+#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+#' start_dataset_processing(ds_id, email = "user@example.com")
 #' }
 start_dataset_processing <- function(
   dataset,
@@ -410,11 +414,11 @@ start_dataset_processing <- function(
   status <- get_dataset_status(dataset_id, auth_token = auth_token)
   if (status == "processing") {
     message(glue::glue("ScPCA dataset {dataset_id} is already processing."))
-    return(invisible(NULL))
+    return(invisible(dataset_id))
   }
   if (status == "succeeded") {
     message(glue::glue("ScPCA dataset {dataset_id} has already completed processing."))
-    return(invisible(NULL))
+    return(invisible(dataset_id))
   }
   if (status == "failed") {
     warning(
@@ -428,9 +432,9 @@ start_dataset_processing <- function(
     body$email <- email
   }
 
-  response <- update_dataset(dataset_id, body, auth_token = auth_token)
+  update_dataset(dataset_id, body, auth_token = auth_token)
   message(glue::glue("ScPCA dataset {dataset_id} processing started."))
-  invisible(response)
+  invisible(dataset_id)
 }
 
 
@@ -549,7 +553,7 @@ remove_from_dataset_data <- function(existing, samples = NULL, projects = NULL)
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly)
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @rdname modify_dataset_samples
 #' @export
@@ -585,8 +589,8 @@ add_dataset_samples <- function(
   )
   new_data <- merge_dataset_data(current$data, additions, include_bulk = include_bulk)
 
-  response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
+  invisible(dataset_id)
 }
 
 
@@ -608,8 +612,8 @@ remove_dataset_samples <- function(
   current <- get_dataset_detail(dataset_id, auth_token = auth_token)
   new_data <- remove_from_dataset_data(current$data, samples = samples, projects = projects)
 
-  response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
+  invisible(dataset_id)
 }
 
 
diff --git a/R/downloads.R b/R/downloads.R
index 1a865b6..e7171a7 100644
--- a/R/downloads.R
+++ b/R/downloads.R
@@ -433,8 +433,9 @@ parse_download_file <- function(scpca_url) {
 #' from the dataset's download filename (which includes the dataset ID, format,
 #' and date).
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element,
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param destination The path to the directory where the unzipped file directory
 #'   should be saved. Default is "scpca_data".
 #' @param overwrite Whether to overwrite files in existing directories if they

From 1e8735f66c90c5f847dd2899f43a31db1d0ca884 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Wed, 3 Jun 2026 22:33:46 -0400
Subject: [PATCH 04/19] Add get_dataset_info function (and data frame helper)

---
 R/datasets.R | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/R/datasets.R b/R/datasets.R
index fe35d3f..7ce673f 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -617,6 +617,125 @@ remove_dataset_samples <- function(
 }
 
 
+#' Make a per-sample data frame from the `$data` list
+#'
+#' Transforms the project-keyed `$data` list from [get_dataset_detail()] into a
+#' one-row-per-sample data frame. Projects with merged single-cell data
+#' (`SINGLE_CELL = "MERGED"`) are excluded.
+#'
+#' @param data the project-keyed `$data` list from [get_dataset_detail()]
+#'
+#' @keywords internal
+#' @importFrom dplyr .data
+#'
+#' @returns a data frame with columns `scpca_sample_id`, `scpca_project_id`,
+#'   `modality`, and `includes_bulk`
+make_dataset_data_df <- function(data) {
+  empty <- data.frame(
+    scpca_sample_id = character(),
+    scpca_project_id = character(),
+    modality = character(),
+    includes_bulk = logical()
+  )
+  if (length(data) == 0) {
+    return(empty)
+  }
+
+  result <- data |>
+    purrr::imap(\(project, project_id) {
+      includes_bulk <- isTRUE(project$includes_bulk)
+      single_cell_ids <- project$SINGLE_CELL
+      # Datasets created outside this package may contain merged projects.
+      # Those projects are excluded here and surfaced via `merged_projects`
+      # in get_dataset_info() instead.
+      if (identical(single_cell_ids, "MERGED")) {
+        return(NULL)
+      }
+      sc_ids <- as.character(single_cell_ids)
+      sp_ids <- as.character(project$SPATIAL)
+      if (length(sc_ids) == 0 && length(sp_ids) == 0) {
+        return(NULL)
+      }
+
+      data.frame(
+        scpca_sample_id = c(sc_ids, sp_ids),
+        scpca_project_id = project_id,
+        modality = rep(
+          c("single-cell", "spatial"),
+          times = c(length(sc_ids), length(sp_ids))
+        ),
+        includes_bulk = includes_bulk
+      )
+    }) |>
+    purrr::list_rbind()
+
+  # If every project was skipped, the bound result has no columns to sort by.
+  if (nrow(result) == 0) empty else dplyr::arrange(result, .data$scpca_sample_id)
+}
+
+
+#' Get a summary of a custom ScPCA dataset
+#'
+#' Fetches a custom dataset and returns a structured summary of its contents,
+#' including its processing status and a per-sample table describing the modality for
+#' each sample.
+#'
+#' Projects with merged single-cell data (where individual sample IDs are not
+#' enumerated in the dataset record) are excluded from `samples` and listed in
+#' `merged_projects` instead.
+#'
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by this function).
+#' @param auth_token an authorization token from [get_auth()]. Defaults to the
+#'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
+#'
+#' @returns a named list with the following elements:
+#'   * `id`: the dataset UUID string
+#'   * `format`: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA")
+#'   * `status`: the processing status — one of "pending", "processing",
+#'     "succeeded", "failed", or "expired" (see [get_dataset_status()])
+#'   * `n_samples`: the number of rows in `samples` (one per sample-modality
+#'     combination; merged-single-cell projects are not counted)
+#'   * `n_projects`: the number of projects in the dataset, merged ones included
+#'   * `samples`: a data frame with one row per sample-modality combination and
+#'     columns `scpca_sample_id`, `scpca_project_id`, `modality` (character:
+#'     "single-cell" or "spatial"), and `includes_bulk` (logical)
+#'   * `merged_projects`: a character vector of project IDs whose single-cell
+#'     data is merged; `character(0)` when none
+#'
+#' @import httr2
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+#' info <- get_dataset_info(ds_id)
+#' info$status
+#' info$samples
+#' }
+get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) {
+  auth_token <- resolve_auth_token(auth_token)
+  detail <- get_dataset_detail(dataset, auth_token)
+
+  samples <- make_dataset_data_df(detail$data)
+  merged_projects <- detail$data |>
+    purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |>
+    names() |>
+    as.character()
+
+  list(
+    id = detail$id,
+    format = detail$format,
+    status = dataset_status_from_detail(detail),
+    n_samples = nrow(samples),
+    n_projects = length(detail$data),
+    samples = samples,
+    merged_projects = merged_projects
+  )
+}
+
+
 #' Get CCDL dataset objects from the ScPCA API
 #'
 #' @param project_id Optional ScPCA project ID to filter by (e.g. "SCPCP000001")

From b13835a3104129c0f1812dc01bdd6dbd41970370 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Wed, 3 Jun 2026 22:47:30 -0400
Subject: [PATCH 05/19] Simplify and consolidate tests

---
 tests/testthat/test-datasets.R | 499 +++++++++++++--------------------
 1 file changed, 199 insertions(+), 300 deletions(-)

diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index c46e63b..073d21e 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -32,115 +32,6 @@ test_that("get_ccdl_datasets combines results across pages", {
   })
 })
 
-test_that("get_ccdl_datasets passes project_id as ccdl_project_id query parameter", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets(project_id = "SCPCP000001")
-  expect_match(captured_req$url, "ccdl_project_id=SCPCP000001")
-})
-
-test_that("get_ccdl_datasets passes modality as ccdl_modality query parameter", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets(modality = "SINGLE_CELL")
-  expect_match(captured_req$url, "ccdl_modality=SINGLE_CELL")
-})
-
-test_that("get_ccdl_datasets passes format as format query parameter", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets(format = "ANN_DATA")
-  expect_match(captured_req$url, "format=ANN_DATA")
-})
-
-test_that("get_ccdl_datasets passes merged as ccdl_is_merged query parameter", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets(merged = TRUE)
-  expect_match(captured_req$url, "ccdl_is_merged=TRUE")
-})
-
-test_that("get_ccdl_datasets passes include_multiplexed as includes_files_multiplexed query parameter", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets(include_multiplexed = TRUE)
-  expect_match(captured_req$url, "includes_files_multiplexed=TRUE")
-
-  get_ccdl_datasets(include_multiplexed = FALSE)
-  expect_match(captured_req$url, "includes_files_multiplexed=FALSE")
-})
-
-test_that("get_ccdl_datasets passes metadata_only as ccdl_name=ALL_METADATA query parameter", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets(metadata_only = TRUE)
-  expect_match(captured_req$url, "ccdl_name=ALL_METADATA")
-})
-
-test_that("get_ccdl_datasets includes api-key header when auth_token is provided", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets(auth_token = "test-token-abc")
-  expect_equal(
-    httr2::req_get_headers(captured_req, "reveal")$`api-key`,
-    "test-token-abc"
-  )
-})
-
-test_that("get_ccdl_datasets does not include api-key header when auth_token is empty", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    req_perform_iterative = function(req, ...) {
-      captured_req <<- req
-      list()
-    }
-  )
-
-  get_ccdl_datasets()
-  expect_null(httr2::req_get_headers(captured_req, "reveal")$`api-key`)
-})
 
 # build_dataset_data tests
 
@@ -267,6 +158,7 @@ test_that("create_dataset errors when spatial format is requested", {
 })
 
 test_that("create_dataset POSTs with start = FALSE", {
+  captured_req <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -278,8 +170,8 @@ test_that("create_dataset POSTs with start = FALSE", {
       )
     },
     req_perform = \(req, ...) {
-      body <- req$body$data
-      json_response(c(body, list(id = "new-dataset-uuid")))
+      captured_req <<- req
+      json_response(c(req$body$data, list(id = "new-dataset-uuid")))
     }
   )
 
@@ -290,10 +182,11 @@ test_that("create_dataset POSTs with start = FALSE", {
     },
     "new-dataset-uuid"
   )
-  expect_false(result$start)
+  expect_false(captured_req$body$data$start)
+  expect_equal(result, "new-dataset-uuid")
 })
 
-test_that("create_dataset returns response invisibly and messages with dataset id", {
+test_that("create_dataset returns id invisibly and messages with dataset id", {
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -319,85 +212,27 @@ test_that("create_dataset returns response invisibly and messages with dataset i
     },
     "new-dataset-uuid"
   )
-  expect_equal(result$id, "new-dataset-uuid")
+  expect_equal(result, "new-dataset-uuid")
 })
 
 test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", {
   withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token")
+  captured_key <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) list(),
     req_perform = \(req, ...) {
-      json_response(list(
-        id = "new-dataset-uuid",
-        api_key = httr2::req_get_headers(req, "reveal")$`api-key`
-      ))
+      captured_key <<- httr2::req_get_headers(req, "reveal")$`api-key`
+      json_response(list(id = "new-dataset-uuid"))
     }
   )
 
   # called without auth_token; the token should come from the environment
-  result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce"))
-  expect_equal(result$api_key, "env-token")
+  suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce"))
+  expect_equal(captured_key, "env-token")
 })
 
 # get_dataset_detail tests
 
-test_that("get_dataset_detail returns dataset with data and status fields", {
-  local_mocked_bindings(
-    req_perform = \(req, ...) {
-      json_response(list(
-        id = DATASET_ID,
-        format = "SINGLE_CELL_EXPERIMENT",
-        data = list(
-          SCPCP000001 = list(
-            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
-            SPATIAL = list(),
-            includes_bulk = FALSE
-          )
-        ),
-        is_started = FALSE,
-        is_succeeded = FALSE,
-        total_sample_count = 2,
-        computed_file = NULL
-      ))
-    }
-  )
-
-  result <- get_dataset_detail(DATASET_ID, auth_token = "test-token")
-
-  expect_type(result, "list")
-  expect_equal(result$id, DATASET_ID)
-  expect_equal(result$format, "SINGLE_CELL_EXPERIMENT")
-  expect_false(result$is_started)
-  expect_false(result$is_succeeded)
-})
-
-test_that("get_dataset_detail returns data field with project and sample structure", {
-  local_mocked_bindings(
-    req_perform = \(req, ...) {
-      json_response(list(
-        id = DATASET_ID,
-        format = "SINGLE_CELL_EXPERIMENT",
-        data = list(
-          SCPCP000001 = list(
-            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
-            SPATIAL = list(),
-            includes_bulk = FALSE
-          )
-        )
-      ))
-    }
-  )
-
-  result <- get_dataset_detail(DATASET_ID, auth_token = "test-token")
-
-  expect_type(result$data, "list")
-  expect_true("SCPCP000001" %in% names(result$data))
-  expect_contains(
-    result$data$SCPCP000001$SINGLE_CELL,
-    c("SCPCS000001", "SCPCS000002")
-  )
-})
-
 test_that("get_dataset_detail includes api-key header when auth_token is provided", {
   local_mocked_bindings(
     req_perform = \(req, ...) {
@@ -450,32 +285,6 @@ test_that("get_ccdl_datasets handles 403 errors with an authorization message",
   )
 })
 
-test_that("get_dataset_detail accepts a list with $id in place of a string", {
-  local_mocked_bindings(
-    req_perform = \(req, ...) {
-      json_response(list(id = DATASET_ID, data = list()))
-    }
-  )
-
-  dataset_list <- list(id = DATASET_ID, data = list())
-  result <- get_dataset_detail(dataset_list, auth_token = "test-token")
-  expect_equal(result$id, DATASET_ID)
-})
-
-test_that("get_dataset_detail errors when list has no $id element", {
-  expect_error(
-    get_dataset_detail(list(data = list()), auth_token = "test-token"),
-    "dataset must be an id string or contain an \\$id element"
-  )
-})
-
-test_that("get_dataset_detail errors when dataset is not a string or list", {
-  expect_error(
-    get_dataset_detail(123, auth_token = "test-token"),
-    "dataset must be an id string or contain an \\$id element"
-  )
-})
-
 # get_dataset_status tests
 
 test_that("get_dataset_status maps detail status fields to a status string", {
@@ -549,6 +358,172 @@ test_that("get_dataset_status errors when auth_token is empty", {
 })
 
 
+# get_dataset_info tests
+
+test_that("get_dataset_info returns structured summary with samples data frame", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        is_succeeded = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          ),
+          SCPCP000002 = list(
+            SINGLE_CELL = list("SCPCS000003"),
+            SPATIAL = list("SCPCS000003"),
+            includes_bulk = TRUE
+          )
+        )
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+
+  expect_equal(info$id, DATASET_ID)
+  expect_equal(info$format, "SINGLE_CELL_EXPERIMENT")
+  expect_equal(info$status, "pending")
+  expect_equal(info$n_projects, 2)
+  # 3 single-cell rows (2 in SCPCP000001, 1 in SCPCP000002) + 1 spatial row = 4
+  expect_equal(info$n_samples, 4)
+  expect_equal(info$merged_projects, character(0))
+  expect_s3_class(info$samples, "data.frame")
+  expect_setequal(
+    colnames(info$samples),
+    c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk")
+  )
+  # SCPCS000003 has two rows — one per modality
+  rows_003 <- info$samples[info$samples$scpca_sample_id == "SCPCS000003", ]
+  expect_setequal(rows_003$modality, c("single-cell", "spatial"))
+  # SCPCP000001 samples should not have includes_bulk
+  rows_p1 <- info$samples[info$samples$scpca_project_id == "SCPCP000001", ]
+  expect_false(any(rows_p1$includes_bulk))
+  # SCPCP000002 samples should have includes_bulk
+  rows_p2 <- info$samples[info$samples$scpca_project_id == "SCPCP000002", ]
+  expect_true(all(rows_p2$includes_bulk))
+})
+
+test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "ANN_DATA",
+        is_started = FALSE,
+        data = list()
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+
+  expect_equal(info$n_samples, 0)
+  expect_equal(info$n_projects, 0)
+  expect_equal(nrow(info$samples), 0)
+  expect_setequal(
+    colnames(info$samples),
+    c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk")
+  )
+})
+
+test_that("get_dataset_info surfaces merged projects separately and excludes them from samples", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          ),
+          SCPCP000005 = list(
+            SINGLE_CELL = "MERGED",
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        )
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+
+  # merged project excluded from samples and n_samples
+  expect_equal(info$n_samples, 1)
+  expect_equal(info$samples$scpca_sample_id, "SCPCS000001")
+  # but counted in n_projects and surfaced in merged_projects
+  expect_equal(info$n_projects, 2)
+  expect_equal(info$merged_projects, "SCPCP000005")
+})
+
+test_that("get_dataset_info derives status from detail without a second API call", {
+  call_count <- 0
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      call_count <<- call_count + 1
+      list(
+        id = DATASET_ID,
+        format = "ANN_DATA",
+        is_started = TRUE,
+        is_succeeded = TRUE,
+        data = list()
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+
+  expect_equal(call_count, 1)
+  expect_equal(info$status, "succeeded")
+})
+
+test_that("get_dataset_info prunes projects where both modality lists are empty", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          ),
+          SCPCP000002 = list(
+            SINGLE_CELL = list(),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        )
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+
+  expect_equal(info$n_samples, 1)
+  expect_equal(info$samples$scpca_project_id, "SCPCP000001")
+  expect_false("SCPCP000002" %in% info$samples$scpca_project_id)
+})
+
+test_that("get_dataset_info errors when auth_token is empty", {
+  expect_error(
+    get_dataset_info(DATASET_ID, auth_token = ""),
+    "Authorization token must be provided"
+  )
+})
+
+
 test_that("get_ccdl_dataset_detail returns dataset fields including download_url", {
   with_mock_dir("ccdl_dataset_detail", {
     result <- get_ccdl_dataset_detail("abc123", auth_token = "test-token")
@@ -609,7 +584,7 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", {
     },
     req_perform = \(req, ...) {
       captured_req <<- req
-      json_response(req$body$data)
+      json_response(list(id = DATASET_ID, data = req$body$data))
     }
   )
 
@@ -621,8 +596,8 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", {
 
   expect_equal(captured_req$method, "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_null(result$format)
-  expect_true("SCPCP000001" %in% names(result$data))
+  expect_null(captured_req$body$data$format)
+  expect_equal(result, DATASET_ID)
 })
 
 # set_dataset_email tests
@@ -632,7 +607,7 @@ test_that("set_dataset_email PUTs a new email", {
   local_mocked_bindings(
     req_perform = \(req, ...) {
       captured_req <<- req
-      json_response(req$body$data)
+      json_response(list(id = DATASET_ID, email = req$body$data$email))
     }
   )
 
@@ -643,7 +618,8 @@ test_that("set_dataset_email PUTs a new email", {
   )
   expect_equal(captured_req$method, "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_equal(result$email, "user@example.com")
+  expect_equal(captured_req$body$data$email, "user@example.com")
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("set_dataset_email errors when email is not a single string", {
@@ -680,7 +656,7 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", {
     get_dataset_status = \(dataset, auth_token) "pending",
     req_perform = \(req, ...) {
       captured_req <<- req
-      json_response(req$body$data)
+      json_response(list(id = DATASET_ID))
     }
   )
 
@@ -696,30 +672,7 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", {
   )
   expect_equal(captured_req$method, "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_true(result$start)
-  expect_null(result$email)
-})
-
-test_that("start_dataset_processing includes email in the same request when provided", {
-  captured_req <- NULL
-  local_mocked_bindings(
-    get_dataset_status = \(dataset, auth_token) "pending",
-    req_perform = \(req, ...) {
-      captured_req <<- req
-      json_response(req$body$data)
-    }
-  )
-
-  result <- suppressMessages(
-    start_dataset_processing(
-      DATASET_ID,
-      email = "user@example.com",
-      auth_token = "token"
-    )
-  )
-  expect_equal(captured_req$method, "PUT")
-  expect_true(result$start)
-  expect_equal(result$email, "user@example.com")
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("start_dataset_processing errors when email is not a single string", {
@@ -754,7 +707,7 @@ test_that("start_dataset_processing emits a message and sends no request when al
     result <- start_dataset_processing(DATASET_ID, auth_token = "token"),
     "is already processing"
   )
-  expect_null(result)
+  expect_equal(result, DATASET_ID)
   expect_false(put_called)
 })
 
@@ -772,7 +725,7 @@ test_that("start_dataset_processing emits a message and sends no request when al
     result <- start_dataset_processing(DATASET_ID, auth_token = "token"),
     "has already completed processing"
   )
-  expect_null(result)
+  expect_equal(result, DATASET_ID)
   expect_false(put_called)
 })
 
@@ -793,7 +746,6 @@ test_that("start_dataset_processing warns and retries when previously failed", {
     "previously failed to process"
   )
   expect_equal(captured_req$method, "PUT")
-  expect_true(captured_req$body$data$start)
 })
 
 test_that("start_dataset_processing restarts an expired dataset", {
@@ -810,7 +762,6 @@ test_that("start_dataset_processing restarts an expired dataset", {
     start_dataset_processing(DATASET_ID, auth_token = "token")
   )
   expect_equal(captured_req$method, "PUT")
-  expect_true(captured_req$body$data$start)
 })
 
 test_that("start_dataset_processing surfaces a locked-dataset error on a 409 race", {
@@ -907,79 +858,27 @@ test_that("remove_from_dataset_data drops whole projects", {
 
 # add_dataset_samples / remove_dataset_samples tests
 
-test_that("add_dataset_samples merges new samples into existing data and PUTs", {
-  captured_req <- NULL
+test_that("add_dataset_samples PUTs", {
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
-      list(
-        id = DATASET_ID,
-        data = list(
-          SCPCP000001 = list(
-            SINGLE_CELL = list("SCPCS000001"),
-            SPATIAL = list(),
-            includes_bulk = FALSE
-          )
-        )
-      )
+      list(id = DATASET_ID, data = list())
     },
-    build_dataset_data = \(samples = NULL, projects = NULL, include_bulk = FALSE) {
-      list(
-        SCPCP000001 = list(
-          SINGLE_CELL = list("SCPCS000002"),
-          SPATIAL = list(),
-          includes_bulk = include_bulk
-        )
-      )
-    },
-    req_perform = \(req, ...) {
-      captured_req <<- req
-      json_response(req$body$data)
-    }
+    build_dataset_data = \(...) list(),
+    req_perform = \(req, ...) json_response(list(id = DATASET_ID))
   )
 
-  result <- add_dataset_samples(
-    DATASET_ID,
-    auth_token = "token",
-    samples = "SCPCS000002"
-  )
-  expect_equal(captured_req$method, "PUT")
-  expect_setequal(
-    as.character(result$data$SCPCP000001$SINGLE_CELL),
-    c("SCPCS000001", "SCPCS000002")
-  )
+  result <- add_dataset_samples(DATASET_ID, auth_token = "token", samples = "SCPCS000002")
+  expect_equal(result, DATASET_ID)
 })
 
-test_that("remove_dataset_samples removes a project and PUTs", {
-  captured_req <- NULL
+test_that("remove_dataset_samples PUTs", {
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
-      list(
-        id = DATASET_ID,
-        data = list(
-          SCPCP000001 = list(
-            SINGLE_CELL = list("SCPCS000001"),
-            SPATIAL = list(),
-            includes_bulk = FALSE
-          ),
-          SCPCP000002 = list(
-            SINGLE_CELL = list("SCPCS000003"),
-            SPATIAL = list(),
-            includes_bulk = FALSE
-          )
-        )
-      )
+      list(id = DATASET_ID, data = list())
     },
-    req_perform = \(req, ...) {
-      captured_req <<- req
-      json_response(req$body$data)
-    }
+    req_perform = \(req, ...) json_response(list(id = DATASET_ID))
   )
 
-  result <- remove_dataset_samples(
-    DATASET_ID,
-    auth_token = "token",
-    projects = "SCPCP000002"
-  )
-  expect_equal(captured_req$method, "PUT")
-  expect_equal(names(result$data), "SCPCP000001")
+  result <- remove_dataset_samples(DATASET_ID, auth_token = "token", projects = "SCPCP000002")
+  expect_equal(result, DATASET_ID)
 })

From afdb1df7d005f3ec899e87b3331db6a2d72fc34d Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Wed, 3 Jun 2026 23:07:54 -0400
Subject: [PATCH 06/19] fix testing indentation error

---
 tests/testthat/test-projects.R | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/testthat/test-projects.R b/tests/testthat/test-projects.R
index a4725d6..a1bb578 100644
--- a/tests/testthat/test-projects.R
+++ b/tests/testthat/test-projects.R
@@ -21,29 +21,29 @@ test_that("scpca_projects returns simplified data frame by default", {
     expect_s3_class(projects_df$created_at, "POSIXct")
     expect_s3_class(projects_df$updated_at, "POSIXct")
   })
+})
 
-  test_that("scpca_projects returns full data frame when simplify = FALSE", {
-    with_mock_dir("scpca_projects", {
-      projects_df <- scpca_projects(simplify = FALSE)
+test_that("scpca_projects returns full data frame when simplify = FALSE", {
+  with_mock_dir("scpca_projects", {
+    projects_df <- scpca_projects(simplify = FALSE)
 
-      # Check that it returns a data frame
-      expect_s3_class(projects_df, "data.frame")
+    # Check that it returns a data frame
+    expect_s3_class(projects_df, "data.frame")
 
-      # Check that we have rows and columns
-      expect_gt(nrow(projects_df), 0)
-      expect_gt(ncol(projects_df), 0)
+    # Check that we have rows and columns
+    expect_gt(nrow(projects_df), 0)
+    expect_gt(ncol(projects_df), 0)
 
-      # Check that list columns are present (not simplified)
-      list_columns <- sapply(projects_df, is.list)
-      expect_true(any(list_columns))
+    # Check that list columns are present (not simplified)
+    list_columns <- sapply(projects_df, is.list)
+    expect_true(any(list_columns))
 
-      # Check for expected key columns
-      expect_contains(colnames(projects_df), "scpca_project_id")
+    # Check for expected key columns
+    expect_contains(colnames(projects_df), "scpca_project_id")
 
-      # Check that date columns are properly converted
-      expect_s3_class(projects_df$created_at, "POSIXct")
-      expect_s3_class(projects_df$updated_at, "POSIXct")
-    })
+    # Check that date columns are properly converted
+    expect_s3_class(projects_df$created_at, "POSIXct")
+    expect_s3_class(projects_df$updated_at, "POSIXct")
   })
 })
 

From abb682dd107cb43ccd1bd5129c806a7711f98cd6 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Wed, 3 Jun 2026 23:09:30 -0400
Subject: [PATCH 07/19] document

---
 NAMESPACE                         |  1 +
 _pkgdown.yml                      |  1 +
 man/await_dataset_processing.Rd   |  5 +--
 man/create_dataset.Rd             | 10 +++---
 man/dataset_status_from_detail.Rd | 19 ++++++++++++
 man/download_dataset.Rd           |  5 +--
 man/get_dataset_detail.Rd         |  5 +--
 man/get_dataset_info.Rd           | 51 +++++++++++++++++++++++++++++++
 man/get_dataset_status.Rd         | 23 +++++++-------
 man/make_dataset_data_df.Rd       | 21 +++++++++++++
 man/modify_dataset_samples.Rd     |  2 +-
 man/replace_dataset_data.Rd       |  4 +--
 man/resolve_dataset_id.Rd         |  9 +++---
 man/set_dataset_email.Rd          |  4 +--
 man/start_dataset_processing.Rd   | 21 ++++++++-----
 15 files changed, 144 insertions(+), 37 deletions(-)
 create mode 100644 man/dataset_status_from_detail.Rd
 create mode 100644 man/get_dataset_info.Rd
 create mode 100644 man/make_dataset_data_df.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 28593d4..66eb4ea 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(download_dataset)
 export(download_project)
 export(download_sample)
 export(get_auth)
+export(get_dataset_info)
 export(get_dataset_status)
 export(get_project_info)
 export(get_project_libraries)
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 73870af..986e2a4 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -32,6 +32,7 @@ reference:
     contents:
       - create_dataset
       - get_dataset_status
+      - get_dataset_info
       - download_dataset
       - add_dataset_samples
       - replace_dataset_data
diff --git a/man/await_dataset_processing.Rd b/man/await_dataset_processing.Rd
index 1109953..599a465 100644
--- a/man/await_dataset_processing.Rd
+++ b/man/await_dataset_processing.Rd
@@ -13,8 +13,9 @@ await_dataset_processing(
 )
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{poll_interval}{Number of minutes to wait between status checks when
 \code{await_processing = TRUE}. Default is 0.5 (30 seconds).}
diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd
index 606c502..55f2344 100644
--- a/man/create_dataset.Rd
+++ b/man/create_dataset.Rd
@@ -31,19 +31,21 @@ spatial samples are always returned in Space Ranger format.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the API response as a list (invisibly), including the dataset \verb{$id}
+the dataset ID as a character string (invisibly)
 }
 \description{
 Creates a new user dataset without starting processing.
-The returned list includes the dataset \verb{$id} along with its current contents and status.
+Returns the new dataset's ID (invisibly), which you can pass to the other
+dataset functions such as \code{\link[=get_dataset_info]{get_dataset_info()}}, \code{\link[=add_dataset_samples]{add_dataset_samples()}}, and
+\code{\link[=start_dataset_processing]{start_dataset_processing()}}.
 }
 \examples{
 \dontrun{
 token <- get_auth("user@example.com", agree = TRUE)
-ds <- create_dataset(
+ds_id <- create_dataset(
   auth_token = token,
   samples = c("SCPCS000001", "SCPCS000002")
 )
-ds$id
+ds_id
 }
 }
diff --git a/man/dataset_status_from_detail.Rd b/man/dataset_status_from_detail.Rd
new file mode 100644
index 0000000..1c71ea0
--- /dev/null
+++ b/man/dataset_status_from_detail.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{dataset_status_from_detail}
+\alias{dataset_status_from_detail}
+\title{Map dataset detail status flags to a status string}
+\usage{
+dataset_status_from_detail(detail)
+}
+\arguments{
+\item{detail}{the dataset detail list returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}}
+}
+\value{
+a single character string: one of "pending", "processing",
+"succeeded", "failed", or "expired"
+}
+\description{
+Map dataset detail status flags to a status string
+}
+\keyword{internal}
diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd
index 39bb6cd..a996509 100644
--- a/man/download_dataset.Rd
+++ b/man/download_dataset.Rd
@@ -18,8 +18,9 @@ download_dataset(
 )
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{destination}{The path to the directory where the unzipped file directory
 should be saved. Default is "scpca_data".}
diff --git a/man/get_dataset_detail.Rd b/man/get_dataset_detail.Rd
index 2005b35..4f56d9d 100644
--- a/man/get_dataset_detail.Rd
+++ b/man/get_dataset_detail.Rd
@@ -7,8 +7,9 @@
 get_dataset_detail(dataset, auth_token)
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{auth_token}{an authorization token obtained from \code{\link[=get_auth]{get_auth()}};
 must match the token used to create the dataset.}
diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd
new file mode 100644
index 0000000..c73a65a
--- /dev/null
+++ b/man/get_dataset_info.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{get_dataset_info}
+\alias{get_dataset_info}
+\title{Get a summary of a custom ScPCA dataset}
+\usage{
+get_dataset_info(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"))
+}
+\arguments{
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by this function).}
+
+\item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the
+\code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
+}
+\value{
+a named list with the following elements:
+\itemize{
+\item \code{id}: the dataset UUID string
+\item \code{format}: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA")
+\item \code{status}: the processing status — one of "pending", "processing",
+"succeeded", "failed", or "expired" (see \code{\link[=get_dataset_status]{get_dataset_status()}})
+\item \code{n_samples}: the number of rows in \code{samples} (one per sample-modality
+combination; merged-single-cell projects are not counted)
+\item \code{n_projects}: the number of projects in the dataset
+\item \code{samples}: a data frame with one row per sample-modality combination and
+columns \code{scpca_sample_id}, \code{scpca_project_id}, \code{modality} (character:
+"single-cell" or "spatial"), and \code{includes_bulk} (logical)
+\item \code{merged_projects}: a character vector of project IDs whose single-cell
+data is merged; \code{character(0)} when none
+}
+}
+\description{
+Fetches a custom dataset and returns a structured summary of its contents,
+including its processing status and a per-sample table describing the modality for
+each sample.
+}
+\details{
+Projects with merged single-cell data (where individual sample IDs are not
+enumerated in the dataset record) are excluded from \code{samples} and listed in
+\code{merged_projects} instead.
+}
+\examples{
+\dontrun{
+ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+info <- get_dataset_info(ds_id)
+info$status
+info$samples
+}
+}
diff --git a/man/get_dataset_status.Rd b/man/get_dataset_status.Rd
index 4a56ef8..3f60353 100644
--- a/man/get_dataset_status.Rd
+++ b/man/get_dataset_status.Rd
@@ -7,8 +7,9 @@
 get_dataset_status(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"))
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
@@ -19,21 +20,21 @@ a single character string: one of "pending", "processing",
 }
 \description{
 Returns a single string describing where a dataset is in the processing
-lifecycle, by fetching the dataset detail and translating its status fields
-(\code{is_started}, \code{is_succeeded}, \code{is_failed}). A dataset that has been started
-but has neither succeeded nor failed is reported as "processing".
+lifecycle.
 }
 \details{
 Possible values are:
-\describe{
-\item{"pending"}{the dataset has not been started}
-\item{"processing"}{the dataset has been started but is not yet finished}
-\item{"succeeded"}{processing finished and the dataset is ready to download}
-\item{"failed"}{processing failed}
+\itemize{
+\item \code{"pending"}: the dataset has not been started
+\item \code{"processing"}: the dataset has been started but is not yet finished
+\item \code{"succeeded"}: processing finished and the dataset is ready to download
+\item \code{"expired"}: processing completed but the generated download has since
+expired and must be regenerated
+\item \code{"failed"}: processing failed
 }
 }
 \examples{
 \dontrun{
-get_dataset_status(ds)
+get_dataset_status(ds_id)
 }
 }
diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd
new file mode 100644
index 0000000..583bba4
--- /dev/null
+++ b/man/make_dataset_data_df.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{make_dataset_data_df}
+\alias{make_dataset_data_df}
+\title{Make a per-sample data frame from the \verb{$data} list}
+\usage{
+make_dataset_data_df(data)
+}
+\arguments{
+\item{data}{the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}}}
+}
+\value{
+a data frame with columns \code{scpca_sample_id}, \code{scpca_project_id},
+\code{modality}, and \code{includes_bulk}
+}
+\description{
+Transforms the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}} into a
+one-row-per-sample data frame. Projects with merged single-cell data
+(\code{SINGLE_CELL = "MERGED"}) are excluded.
+}
+\keyword{internal}
diff --git a/man/modify_dataset_samples.Rd b/man/modify_dataset_samples.Rd
index 61604ec..6cfc56d 100644
--- a/man/modify_dataset_samples.Rd
+++ b/man/modify_dataset_samples.Rd
@@ -36,7 +36,7 @@ projects keep their current value. Default is FALSE.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 \code{add_dataset_samples()} adds the given samples and/or all samples from the
diff --git a/man/replace_dataset_data.Rd b/man/replace_dataset_data.Rd
index 15afdf5..4b88c6e 100644
--- a/man/replace_dataset_data.Rd
+++ b/man/replace_dataset_data.Rd
@@ -26,7 +26,7 @@ all samples from each project are included.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 Replaces the samples and/or projects in an existing dataset with a new
@@ -40,6 +40,6 @@ A dataset that has already started processing cannot be updated.
 }
 \examples{
 \dontrun{
-replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002"))
+replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002"))
 }
 }
diff --git a/man/resolve_dataset_id.Rd b/man/resolve_dataset_id.Rd
index ee7e54c..2588243 100644
--- a/man/resolve_dataset_id.Rd
+++ b/man/resolve_dataset_id.Rd
@@ -10,11 +10,12 @@ resolve_dataset_id(dataset)
 \item{dataset}{a dataset UUID string, or a list with an \verb{$id} element}
 }
 \value{
-the dataset ID as a length-1 character string
+the dataset ID as a character string
 }
 \description{
-Accepts either a dataset UUID string or a list with an \verb{$id} element (such as
-the return value of \code{\link[=create_dataset]{create_dataset()}} or \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns
-the ID string, after checking that it is a valid UUID.
+Accepts either a dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}) or a list with an \verb{$id} element (such as the value returned
+by \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns the ID string, after checking that it is
+a valid UUID.
 }
 \keyword{internal}
diff --git a/man/set_dataset_email.Rd b/man/set_dataset_email.Rd
index cc28f59..a245d01 100644
--- a/man/set_dataset_email.Rd
+++ b/man/set_dataset_email.Rd
@@ -15,7 +15,7 @@ set_dataset_email(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"))
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 Updates the email address the ScPCA Portal will use to notify you when the
@@ -27,6 +27,6 @@ A dataset that has already been started cannot be modified.
 }
 \examples{
 \dontrun{
-set_dataset_email(ds, email = "user@example.com")
+set_dataset_email(ds_id, email = "user@example.com")
 }
 }
diff --git a/man/start_dataset_processing.Rd b/man/start_dataset_processing.Rd
index b61909e..fd7a61b 100644
--- a/man/start_dataset_processing.Rd
+++ b/man/start_dataset_processing.Rd
@@ -11,8 +11,9 @@ start_dataset_processing(
 )
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{email}{optional email address for the download notification. When
 supplied, it is set as part of the same request that starts processing.}
@@ -21,7 +22,7 @@ supplied, it is set as part of the same request that starts processing.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 Starts processing of an existing custom dataset so that its files can be
@@ -29,12 +30,18 @@ built for download, by sending a PUT request that sets \code{start = TRUE}.
 Optionally sets the notification email as part of the same request.
 }
 \details{
-Once processing has started a dataset is locked and can no longer be
-modified; attempting to modify or re-start it will raise an error.
+Before sending the request the current dataset status is checked via
+\code{\link[=get_dataset_status]{get_dataset_status()}}:
+\itemize{
+\item A \code{"pending"} or \code{"expired"} dataset is started normally.
+\item A \code{"failed"} dataset is retried with a warning.
+\item A \code{"processing"} or \code{"succeeded"} dataset is already underway or done;
+a message is emitted and no request is sent.
+}
 }
 \examples{
 \dontrun{
-ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
-start_dataset_processing(ds, email = "user@example.com")
+ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+start_dataset_processing(ds_id, email = "user@example.com")
 }
 }

From 0c9d05a2883fd63476e00613c8f94d26da620311 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 09:55:21 -0400
Subject: [PATCH 08/19] Revert test removals

---
 tests/testthat/test-datasets.R | 257 ++++++++++++++++++++++++++++++---
 1 file changed, 236 insertions(+), 21 deletions(-)

diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index b4cbfde..6b640eb 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -32,6 +32,115 @@ test_that("get_ccdl_datasets combines results across pages", {
   })
 })
 
+test_that("get_ccdl_datasets passes project_id as ccdl_project_id query parameter", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets(project_id = "SCPCP000001")
+  expect_match(captured_req$url, "ccdl_project_id=SCPCP000001")
+})
+
+test_that("get_ccdl_datasets passes modality as ccdl_modality query parameter", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets(modality = "SINGLE_CELL")
+  expect_match(captured_req$url, "ccdl_modality=SINGLE_CELL")
+})
+
+test_that("get_ccdl_datasets passes format as format query parameter", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets(format = "ANN_DATA")
+  expect_match(captured_req$url, "format=ANN_DATA")
+})
+
+test_that("get_ccdl_datasets passes merged as ccdl_is_merged query parameter", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets(merged = TRUE)
+  expect_match(captured_req$url, "ccdl_is_merged=TRUE")
+})
+
+test_that("get_ccdl_datasets passes include_multiplexed as includes_files_multiplexed query parameter", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets(include_multiplexed = TRUE)
+  expect_match(captured_req$url, "includes_files_multiplexed=TRUE")
+
+  get_ccdl_datasets(include_multiplexed = FALSE)
+  expect_match(captured_req$url, "includes_files_multiplexed=FALSE")
+})
+
+test_that("get_ccdl_datasets passes metadata_only as ccdl_name=ALL_METADATA query parameter", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets(metadata_only = TRUE)
+  expect_match(captured_req$url, "ccdl_name=ALL_METADATA")
+})
+
+test_that("get_ccdl_datasets includes api-key header when auth_token is provided", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets(auth_token = "test-token-abc")
+  expect_equal(
+    httr2::req_get_headers(captured_req, "reveal")$`api-key`,
+    "test-token-abc"
+  )
+})
+
+test_that("get_ccdl_datasets does not include api-key header when auth_token is empty", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    req_perform_iterative = function(req, ...) {
+      captured_req <<- req
+      list()
+    }
+  )
+
+  get_ccdl_datasets()
+  expect_null(httr2::req_get_headers(captured_req, "reveal")$`api-key`)
+})
 
 # build_dataset_data tests
 
@@ -158,7 +267,6 @@ test_that("create_dataset errors when spatial format is requested", {
 })
 
 test_that("create_dataset POSTs with start = FALSE", {
-  captured_req <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -170,8 +278,8 @@ test_that("create_dataset POSTs with start = FALSE", {
       )
     },
     req_perform = \(req, ...) {
-      captured_req <<- req
-      json_response(c(req$body$data, list(id = "new-dataset-uuid")))
+      body <- req$body$data
+      json_response(c(body, list(id = "new-dataset-uuid")))
     }
   )
 
@@ -182,11 +290,10 @@ test_that("create_dataset POSTs with start = FALSE", {
     },
     "new-dataset-uuid"
   )
-  expect_false(captured_req$body$data$start)
-  expect_equal(result, "new-dataset-uuid")
+  expect_false(result$start)
 })
 
-test_that("create_dataset returns id invisibly and messages with dataset id", {
+test_that("create_dataset returns response invisibly and messages with dataset id", {
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -212,27 +319,85 @@ test_that("create_dataset returns id invisibly and messages with dataset id", {
     },
     "new-dataset-uuid"
   )
-  expect_equal(result, "new-dataset-uuid")
+  expect_equal(result$id, "new-dataset-uuid")
 })
 
 test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", {
   withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token")
-  captured_key <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) list(),
     req_perform = \(req, ...) {
-      captured_key <<- httr2::req_get_headers(req, "reveal")$`api-key`
-      json_response(list(id = "new-dataset-uuid"))
+      json_response(list(
+        id = "new-dataset-uuid",
+        api_key = httr2::req_get_headers(req, "reveal")$`api-key`
+      ))
     }
   )
 
   # called without auth_token; the token should come from the environment
-  suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce"))
-  expect_equal(captured_key, "env-token")
+  result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce"))
+  expect_equal(result$api_key, "env-token")
 })
 
 # get_dataset_detail tests
 
+test_that("get_dataset_detail returns dataset with data and status fields", {
+  local_mocked_bindings(
+    req_perform = \(req, ...) {
+      json_response(list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        ),
+        is_started = FALSE,
+        is_succeeded = FALSE,
+        total_sample_count = 2,
+        computed_file = NULL
+      ))
+    }
+  )
+
+  result <- get_dataset_detail(DATASET_ID, auth_token = "test-token")
+
+  expect_type(result, "list")
+  expect_equal(result$id, DATASET_ID)
+  expect_equal(result$format, "SINGLE_CELL_EXPERIMENT")
+  expect_false(result$is_started)
+  expect_false(result$is_succeeded)
+})
+
+test_that("get_dataset_detail returns data field with project and sample structure", {
+  local_mocked_bindings(
+    req_perform = \(req, ...) {
+      json_response(list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        )
+      ))
+    }
+  )
+
+  result <- get_dataset_detail(DATASET_ID, auth_token = "test-token")
+
+  expect_type(result$data, "list")
+  expect_true("SCPCP000001" %in% names(result$data))
+  expect_contains(
+    result$data$SCPCP000001$SINGLE_CELL,
+    c("SCPCS000001", "SCPCS000002")
+  )
+})
+
 test_that("get_dataset_detail includes api-key header when auth_token is provided", {
   local_mocked_bindings(
     req_perform = \(req, ...) {
@@ -285,6 +450,32 @@ test_that("get_ccdl_datasets handles 403 errors with an authorization message",
   )
 })
 
+test_that("get_dataset_detail accepts a list with $id in place of a string", {
+  local_mocked_bindings(
+    req_perform = \(req, ...) {
+      json_response(list(id = DATASET_ID, data = list()))
+    }
+  )
+
+  dataset_list <- list(id = DATASET_ID, data = list())
+  result <- get_dataset_detail(dataset_list, auth_token = "test-token")
+  expect_equal(result$id, DATASET_ID)
+})
+
+test_that("get_dataset_detail errors when list has no $id element", {
+  expect_error(
+    get_dataset_detail(list(data = list()), auth_token = "test-token"),
+    "dataset must be an id string or contain an \\$id element"
+  )
+})
+
+test_that("get_dataset_detail errors when dataset is not a string or list", {
+  expect_error(
+    get_dataset_detail(123, auth_token = "test-token"),
+    "dataset must be an id string or contain an \\$id element"
+  )
+})
+
 # get_dataset_status tests
 
 test_that("get_dataset_status maps detail status fields to a status string", {
@@ -584,7 +775,7 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", {
     },
     req_perform = \(req, ...) {
       captured_req <<- req
-      json_response(list(id = DATASET_ID, data = req$body$data))
+      json_response(req$body$data)
     }
   )
 
@@ -596,8 +787,8 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", {
 
   expect_equal(captured_req$method, "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_null(captured_req$body$data$format)
-  expect_equal(result, DATASET_ID)
+  expect_null(result$format)
+  expect_true("SCPCP000001" %in% names(result$data))
 })
 
 # set_dataset_email tests
@@ -607,7 +798,7 @@ test_that("set_dataset_email PUTs a new email", {
   local_mocked_bindings(
     req_perform = \(req, ...) {
       captured_req <<- req
-      json_response(list(id = DATASET_ID, email = req$body$data$email))
+      json_response(req$body$data)
     }
   )
 
@@ -618,8 +809,7 @@ test_that("set_dataset_email PUTs a new email", {
   )
   expect_equal(captured_req$method, "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_equal(captured_req$body$data$email, "user@example.com")
-  expect_equal(result, DATASET_ID)
+  expect_equal(result$email, "user@example.com")
 })
 
 test_that("set_dataset_email errors when email is not a single string", {
@@ -656,7 +846,7 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", {
     get_dataset_status = \(dataset, auth_token) "pending",
     req_perform = \(req, ...) {
       captured_req <<- req
-      json_response(list(id = DATASET_ID))
+      json_response(req$body$data)
     }
   )
 
@@ -672,7 +862,30 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", {
   )
   expect_equal(captured_req$method, "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_equal(result, DATASET_ID)
+  expect_true(result$start)
+  expect_null(result$email)
+})
+
+test_that("start_dataset_processing includes email in the same request when provided", {
+  captured_req <- NULL
+  local_mocked_bindings(
+    get_dataset_status = \(dataset, auth_token) "pending",
+    req_perform = \(req, ...) {
+      captured_req <<- req
+      json_response(req$body$data)
+    }
+  )
+
+  result <- suppressMessages(
+    start_dataset_processing(
+      DATASET_ID,
+      email = "user@example.com",
+      auth_token = "token"
+    )
+  )
+  expect_equal(captured_req$method, "PUT")
+  expect_true(result$start)
+  expect_equal(result$email, "user@example.com")
 })
 
 test_that("start_dataset_processing errors when email is not a single string", {
@@ -707,7 +920,7 @@ test_that("start_dataset_processing emits a message and sends no request when al
     result <- start_dataset_processing(DATASET_ID, auth_token = "token"),
     "is already processing"
   )
-  expect_equal(result, DATASET_ID)
+  expect_null(result)
   expect_false(put_called)
 })
 
@@ -746,6 +959,7 @@ test_that("start_dataset_processing warns and retries when previously failed", {
     "previously failed to process"
   )
   expect_equal(captured_req$method, "PUT")
+  expect_true(captured_req$body$data$start)
 })
 
 test_that("start_dataset_processing restarts an expired dataset", {
@@ -762,6 +976,7 @@ test_that("start_dataset_processing restarts an expired dataset", {
     start_dataset_processing(DATASET_ID, auth_token = "token")
   )
   expect_equal(captured_req$method, "PUT")
+  expect_true(captured_req$body$data$start)
 })
 
 test_that("start_dataset_processing surfaces a locked-dataset error on a 409 race", {

From e1f96cbb2da053edeafba631fb285f44de6eada8 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 10:00:10 -0400
Subject: [PATCH 09/19] ignore claude directory for rbuild

---
 .Rbuildignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.Rbuildignore b/.Rbuildignore
index d3bd575..a90605e 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -2,6 +2,7 @@
 ^LICENSE\.md$
 ^ScPCAr\.Rproj$
 ^\.Rproj\.user$
+^\.claude$
 ^\.github$
 ^\.pre-commit-config\.yaml$
 ^_pkgdown\.yml$

From fcc337f7e3477137a0d538f9b90317406e87a90d Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 10:12:06 -0400
Subject: [PATCH 10/19] standardize testing with more detail

---
 tests/testthat/test-datasets.R | 55 +++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index 6b640eb..bc70cea 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -267,6 +267,7 @@ test_that("create_dataset errors when spatial format is requested", {
 })
 
 test_that("create_dataset POSTs with start = FALSE", {
+  captured_req <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -278,8 +279,8 @@ test_that("create_dataset POSTs with start = FALSE", {
       )
     },
     req_perform = \(req, ...) {
-      body <- req$body$data
-      json_response(c(body, list(id = "new-dataset-uuid")))
+      captured_req <<- req
+      json_response(list(id = "new-dataset-uuid"))
     }
   )
 
@@ -290,10 +291,12 @@ test_that("create_dataset POSTs with start = FALSE", {
     },
     "new-dataset-uuid"
   )
-  expect_false(result$start)
+  expect_equal(httr2::req_get_method(captured_req), "POST")
+  expect_false(captured_req$body$data$start)
+  expect_equal(result, "new-dataset-uuid")
 })
 
-test_that("create_dataset returns response invisibly and messages with dataset id", {
+test_that("create_dataset returns the dataset id invisibly and messages with dataset id", {
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -319,24 +322,24 @@ test_that("create_dataset returns response invisibly and messages with dataset i
     },
     "new-dataset-uuid"
   )
-  expect_equal(result$id, "new-dataset-uuid")
+  expect_equal(result, "new-dataset-uuid")
 })
 
 test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", {
   withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token")
+  captured_req <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) list(),
     req_perform = \(req, ...) {
-      json_response(list(
-        id = "new-dataset-uuid",
-        api_key = httr2::req_get_headers(req, "reveal")$`api-key`
-      ))
+      captured_req <<- req
+      json_response(list(id = "new-dataset-uuid"))
     }
   )
 
   # called without auth_token; the token should come from the environment
   result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce"))
-  expect_equal(result$api_key, "env-token")
+  expect_equal(httr2::req_get_headers(captured_req, "reveal")$`api-key`, "env-token")
+  expect_equal(result, "new-dataset-uuid")
 })
 
 # get_dataset_detail tests
@@ -785,10 +788,11 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", {
     samples = "SCPCS000001"
   )
 
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_null(result$format)
-  expect_true("SCPCP000001" %in% names(result$data))
+  expect_null(captured_req$body$data$format)
+  expect_true("SCPCP000001" %in% names(captured_req$body$data$data))
+  expect_equal(result, DATASET_ID)
 })
 
 # set_dataset_email tests
@@ -807,9 +811,10 @@ test_that("set_dataset_email PUTs a new email", {
     auth_token = "token",
     email = "user@example.com"
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_equal(result$email, "user@example.com")
+  expect_equal(captured_req$body$data$email, "user@example.com")
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("set_dataset_email errors when email is not a single string", {
@@ -860,10 +865,11 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", {
     },
     "processing started"
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_true(result$start)
-  expect_null(result$email)
+  expect_true(captured_req$body$data$start)
+  expect_null(captured_req$body$data$email)
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("start_dataset_processing includes email in the same request when provided", {
@@ -883,9 +889,10 @@ test_that("start_dataset_processing includes email in the same request when prov
       auth_token = "token"
     )
   )
-  expect_equal(captured_req$method, "PUT")
-  expect_true(result$start)
-  expect_equal(result$email, "user@example.com")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
+  expect_true(captured_req$body$data$start)
+  expect_equal(captured_req$body$data$email, "user@example.com")
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("start_dataset_processing errors when email is not a single string", {
@@ -920,7 +927,7 @@ test_that("start_dataset_processing emits a message and sends no request when al
     result <- start_dataset_processing(DATASET_ID, auth_token = "token"),
     "is already processing"
   )
-  expect_null(result)
+  expect_equal(result, DATASET_ID)
   expect_false(put_called)
 })
 
@@ -958,7 +965,7 @@ test_that("start_dataset_processing warns and retries when previously failed", {
     ),
     "previously failed to process"
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_true(captured_req$body$data$start)
 })
 
@@ -975,7 +982,7 @@ test_that("start_dataset_processing restarts an expired dataset", {
   suppressMessages(
     start_dataset_processing(DATASET_ID, auth_token = "token")
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_true(captured_req$body$data$start)
 })
 

From d7014df7a689ef8350bc1bea87a0856f7bbf07a2 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 10:34:14 -0400
Subject: [PATCH 11/19] fix modality test: can't have one sample with both
 single cell and spatial!

---
 tests/testthat/test-datasets.R | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index bc70cea..81f5b80 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -570,7 +570,7 @@ test_that("get_dataset_info returns structured summary with samples data frame",
           ),
           SCPCP000002 = list(
             SINGLE_CELL = list("SCPCS000003"),
-            SPATIAL = list("SCPCS000003"),
+            SPATIAL = list("SCPCS000004"),
             includes_bulk = TRUE
           )
         )
@@ -584,7 +584,7 @@ test_that("get_dataset_info returns structured summary with samples data frame",
   expect_equal(info$format, "SINGLE_CELL_EXPERIMENT")
   expect_equal(info$status, "pending")
   expect_equal(info$n_projects, 2)
-  # SCPCS000003 appears in both modalities: 2 SC rows + 1 spatial = 4 rows total
+  # 4 sample rows: 2 single-cell in the first project, 1 single-cell + 1 spatial in the second
   expect_equal(info$n_samples, 4)
   expect_equal(info$merged_projects, character(0))
   expect_s3_class(info$samples, "data.frame")
@@ -592,9 +592,15 @@ test_that("get_dataset_info returns structured summary with samples data frame",
     colnames(info$samples),
     c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk")
   )
-  # SCPCS000003 has two rows — one per modality
-  rows_003 <- info$samples[info$samples$scpca_sample_id == "SCPCS000003", ]
-  expect_setequal(rows_003$modality, c("single-cell", "spatial"))
+  # single-cell and spatial samples are distinct, each with its own modality row
+  expect_equal(
+    info$samples$modality[info$samples$scpca_sample_id == "SCPCS000003"],
+    "single-cell"
+  )
+  expect_equal(
+    info$samples$modality[info$samples$scpca_sample_id == "SCPCS000004"],
+    "spatial"
+  )
   # SCPCP000001 samples should not have includes_bulk
   rows_p1 <- info$samples[info$samples$scpca_project_id == "SCPCP000001", ]
   expect_false(all(rows_p1$includes_bulk))

From e609cd05056264741331ca5635b0bb7ae3049e2b Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 11:21:12 -0400
Subject: [PATCH 12/19] get full sample size

---
 R/datasets.R                   | 34 +++++++++++++++++----------
 tests/testthat/test-datasets.R | 42 +++++++++++++++++++---------------
 2 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 2665f50..ea53bd2 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -631,13 +631,12 @@ remove_dataset_samples <- function(
 #' @importFrom dplyr .data
 #'
 #' @returns a data frame with columns `scpca_sample_id`, `scpca_project_id`,
-#'   `modality`, and `includes_bulk`
+#'   and `modality`
 make_dataset_data_df <- function(data) {
   empty <- data.frame(
     scpca_sample_id = character(),
     scpca_project_id = character(),
-    modality = character(),
-    includes_bulk = logical()
+    modality = character()
   )
   if (length(data) == 0) {
     return(empty)
@@ -645,7 +644,6 @@ make_dataset_data_df <- function(data) {
 
   result <- data |>
     purrr::imap(\(project, project_id) {
-      includes_bulk <- isTRUE(project$includes_bulk)
       single_cell_ids <- project$SINGLE_CELL
       # Datasets created outside this package may be merged.
       # projects are excluded here and surfaced via `merged_projects` in
@@ -665,8 +663,7 @@ make_dataset_data_df <- function(data) {
         modality = rep(
           c("single-cell", "spatial"),
           times = c(length(sc_ids), length(sp_ids))
-        ),
-        includes_bulk = includes_bulk
+        )
       )
     }) |>
     purrr::list_rbind() |>
@@ -697,14 +694,20 @@ make_dataset_data_df <- function(data) {
 #'   * `format`: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA")
 #'   * `status`: the processing status — one of "pending", "processing",
 #'     "succeeded", "failed", or "expired" (see [get_dataset_status()])
-#'   * `n_samples`: the number of rows in `samples` (one per sample-modality
-#'     combination; merged-single-cell projects are not counted)
+#'   * `n_samples`: the total number of samples in the dataset, taken from the
+#'     API's `total_sample_count`. This includes samples in merged projects,
+#'     which are not enumerated in `samples`, so `n_samples` can exceed
+#'     `nrow(samples)`.
 #'   * `n_projects`: the number of projects in the dataset
 #'   * `samples`: a data frame with one row per sample-modality combination and
-#'     columns `scpca_sample_id`, `scpca_project_id`, `modality` (character:
-#'     "single-cell" or "spatial"), and `includes_bulk` (logical)
+#'     columns `scpca_sample_id`, `scpca_project_id`, and `modality` (character:
+#'     "single-cell" or "spatial")
 #'   * `merged_projects`: a character vector of project IDs whose single-cell
 #'     data is merged; `character(0)` when none
+#'   * `bulk_projects`: a character vector of project IDs that include bulk
+#'     RNA-seq data; `character(0)` when none. Bulk inclusion is recorded per
+#'     project rather than per sample, so it is reported here rather than in
+#'     `samples`.
 #'
 #' @import httr2
 #' @export
@@ -725,15 +728,22 @@ get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"
     purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |>
     names() |>
     as.character()
+  bulk_projects <- detail$data |>
+    purrr::keep(\(p) isTRUE(p$includes_bulk)) |>
+    names() |>
+    as.character()
 
   list(
     id = detail$id,
     format = detail$format,
     status = dataset_status_from_detail(detail),
-    n_samples = nrow(samples),
+    # total_sample_count comes from the API and counts all samples, including
+    # those in merged projects that are not enumerated in `samples`.
+    n_samples = detail$total_sample_count,
     n_projects = length(detail$data),
     samples = samples,
-    merged_projects = merged_projects
+    merged_projects = merged_projects,
+    bulk_projects = bulk_projects
   )
 }
 
diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index 81f5b80..8d559ea 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -573,7 +573,8 @@ test_that("get_dataset_info returns structured summary with samples data frame",
             SPATIAL = list("SCPCS000004"),
             includes_bulk = TRUE
           )
-        )
+        ),
+        total_sample_count = 4
       )
     }
   )
@@ -584,13 +585,13 @@ test_that("get_dataset_info returns structured summary with samples data frame",
   expect_equal(info$format, "SINGLE_CELL_EXPERIMENT")
   expect_equal(info$status, "pending")
   expect_equal(info$n_projects, 2)
-  # 4 sample rows: 2 single-cell in the first project, 1 single-cell + 1 spatial in the second
+  # n_samples comes from the API total_sample_count (here equal to the 4 enumerated rows)
   expect_equal(info$n_samples, 4)
   expect_equal(info$merged_projects, character(0))
   expect_s3_class(info$samples, "data.frame")
   expect_setequal(
     colnames(info$samples),
-    c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk")
+    c("scpca_sample_id", "scpca_project_id", "modality")
   )
   # single-cell and spatial samples are distinct, each with its own modality row
   expect_equal(
@@ -601,12 +602,8 @@ test_that("get_dataset_info returns structured summary with samples data frame",
     info$samples$modality[info$samples$scpca_sample_id == "SCPCS000004"],
     "spatial"
   )
-  # SCPCP000001 samples should not have includes_bulk
-  rows_p1 <- info$samples[info$samples$scpca_project_id == "SCPCP000001", ]
-  expect_false(all(rows_p1$includes_bulk))
-  # SCPCP000002 samples should have includes_bulk
-  rows_p2 <- info$samples[info$samples$scpca_project_id == "SCPCP000002", ]
-  expect_true(all(rows_p2$includes_bulk))
+  # bulk inclusion is reported per project, not per sample
+  expect_equal(info$bulk_projects, "SCPCP000002")
 })
 
 test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", {
@@ -616,7 +613,8 @@ test_that("get_dataset_info returns empty samples data frame with correct schema
         id = DATASET_ID,
         format = "ANN_DATA",
         is_started = FALSE,
-        data = list()
+        data = list(),
+        total_sample_count = 0
       )
     }
   )
@@ -626,9 +624,10 @@ test_that("get_dataset_info returns empty samples data frame with correct schema
   expect_equal(info$n_samples, 0)
   expect_equal(info$n_projects, 0)
   expect_equal(nrow(info$samples), 0)
+  expect_equal(info$bulk_projects, character(0))
   expect_setequal(
     colnames(info$samples),
-    c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk")
+    c("scpca_sample_id", "scpca_project_id", "modality")
   )
 })
 
@@ -650,17 +649,22 @@ test_that("get_dataset_info surfaces merged projects separately and excludes the
             SPATIAL = list(),
             includes_bulk = FALSE
           )
-        )
+        ),
+        # SCPCP000001 contributes 1 enumerated sample; the merged SCPCP000005
+        # contributes 3 samples that are not enumerated in `data`
+        total_sample_count = 4
       )
     }
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
 
-  # merged project excluded from samples and n_samples
-  expect_equal(info$n_samples, 1)
+  # merged project's samples are not enumerated in the samples table
+  expect_equal(nrow(info$samples), 1)
   expect_equal(info$samples$scpca_sample_id, "SCPCS000001")
-  # but counted in n_projects and surfaced in merged_projects
+  # but n_samples uses the API total_sample_count, which counts them
+  expect_equal(info$n_samples, 4)
+  # merged project counted in n_projects and surfaced in merged_projects
   expect_equal(info$n_projects, 2)
   expect_equal(info$merged_projects, "SCPCP000005")
 })
@@ -675,7 +679,8 @@ test_that("get_dataset_info derives status from detail without a second API call
         format = "ANN_DATA",
         is_started = TRUE,
         is_succeeded = TRUE,
-        data = list()
+        data = list(),
+        total_sample_count = 0
       )
     }
   )
@@ -704,14 +709,15 @@ test_that("get_dataset_info prunes projects where both modality lists are empty"
             SPATIAL = list(),
             includes_bulk = FALSE
           )
-        )
+        ),
+        total_sample_count = 1
       )
     }
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
 
-  expect_equal(info$n_samples, 1)
+  expect_equal(nrow(info$samples), 1)
   expect_equal(info$samples$scpca_project_id, "SCPCP000001")
   expect_false("SCPCP000002" %in% info$samples$scpca_project_id)
 })

From 8e277ff019af434740b7b9ce44cc2d2422795950 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 17:43:11 -0400
Subject: [PATCH 13/19] Give a more complete sample table

---
 R/datasets.R | 142 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 92 insertions(+), 50 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index ea53bd2..8503c55 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -619,24 +619,38 @@ remove_dataset_samples <- function(
 }
 
 
-#' Make a per-sample data frame from the `$data` list
-#'
-#' Transforms the project-keyed `$data` list from [get_dataset_detail()] into a
-#' one-row-per-sample data frame. Projects with merged single-cell data
-#' (`SINGLE_CELL = "MERGED"`) are excluded.
+#' Build the per-sample data frame for a dataset
+#'
+#' For each project in the dataset `$data` list, fetches the project's sample
+#' metadata with [get_project_samples()] and keeps only the samples the dataset includes:
+#' for a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`,
+#' and for a merged project, all of the project's single-cell samples.
+#' Each modality is reported only when it is requested for the sample:
+#' `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the
+#' sample is not included as single-cell),
+#' `has_spatial` marks spatial inclusion
+#' `has_bulk` reflects the project's `includes_bulk` request
+#' intersected with whether the sample actually has bulk data.
+#' `has_cite_seq` and `has_multiplexed` come from the sample records.
 #'
 #' @param data the project-keyed `$data` list from [get_dataset_detail()]
 #'
 #' @keywords internal
 #' @importFrom dplyr .data
 #'
-#' @returns a data frame with columns `scpca_sample_id`, `scpca_project_id`,
-#'   and `modality`
+#' @returns a data frame with one row per included sample and columns
+#'   `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character: "cell",
+#'   "nucleus", or `NA`), `has_spatial`, `has_bulk`, `has_cite_seq`, and
+#'   `has_multiplexed` (all logical)
 make_dataset_data_df <- function(data) {
-  empty <- data.frame(
+  empty <- tibble::tibble(
     scpca_sample_id = character(),
     scpca_project_id = character(),
-    modality = character()
+    seq_unit = character(),
+    has_spatial = logical(),
+    has_bulk = logical(),
+    has_cite_seq = logical(),
+    has_multiplexed = logical()
   )
   if (length(data) == 0) {
     return(empty)
@@ -644,27 +658,62 @@ make_dataset_data_df <- function(data) {
 
   result <- data |>
     purrr::imap(\(project, project_id) {
-      single_cell_ids <- project$SINGLE_CELL
-      # Datasets created outside this package may be merged.
-      # projects are excluded here and surfaced via `merged_projects` in
-      # get_dataset_info() instead.
-      if (identical(single_cell_ids, "MERGED")) {
-        return(NULL)
-      }
-      sc_ids <- as.character(single_cell_ids)
-      sp_ids <- as.character(project$SPATIAL)
-      if (length(sc_ids) == 0 && length(sp_ids) == 0) {
-        return(NULL)
+      merged <- identical(project$SINGLE_CELL, "MERGED")
+
+      # The project's sample metadata has the modality details we will need.
+      project_samples <- get_project_samples(project_id, simplify = FALSE)
+
+      # Get single cell samples for the project:
+      # - if merged from the projeect_samples metadata
+      # - if not merged, from the request list.
+      if (merged) {
+        single_cell_ids <- project_samples$scpca_sample_id[
+          project_samples$has_single_cell_data
+        ]
+      } else {
+        single_cell_ids <- as.character(project$SINGLE_CELL)
       }
 
-      data.frame(
-        scpca_sample_id = c(sc_ids, sp_ids),
-        scpca_project_id = project_id,
-        modality = rep(
-          c("single-cell", "spatial"),
-          times = c(length(sc_ids), length(sp_ids))
+      spatial_ids <- as.character(project$SPATIAL)
+      included_ids <- union(single_cell_ids, spatial_ids)
+      requested_bulk <- isTRUE(project$includes_bulk)
+
+      project_samples |>
+        # keep only the samples the dataset requests for this project
+        dplyr::filter(.data$scpca_sample_id %in% included_ids) |>
+        dplyr::mutate(
+          scpca_project_id = project_id,
+          # the single-cell sequencing unit (cell or nucleus), or NA when
+          # single-cell is not requested for the sample
+          seq_unit = purrr::map2_chr(
+            .data$seq_units,
+            .data$scpca_sample_id,
+            \(units, sample_id) {
+              if (!sample_id %in% single_cell_ids) {
+                return(NA_character_)
+              }
+              # get only the nucleus or cell (not spot or bulk)
+              # if both are present (unlikely), combine with a comma
+              intersect(c("cell", "nucleus"), as.character(units)) |>
+                paste(collapse = ",")
+            }
+          ),
+          # only modalities requested for the sample are reported; has_bulk also
+          # requires the sample to actually have bulk data
+          has_spatial = .data$scpca_sample_id %in% spatial_ids,
+          has_bulk = requested_bulk & .data$has_bulk_rna_seq,
+          has_cite_seq = .data$has_cite_seq_data,
+          has_multiplexed = .data$has_multiplexed_data
+        ) |>
+        dplyr::select(
+          "scpca_sample_id",
+          "scpca_project_id",
+          "seq_unit",
+          "has_spatial",
+          "has_bulk",
+          "has_cite_seq",
+          "has_multiplexed"
         )
-      )
     }) |>
     purrr::list_rbind() |>
     dplyr::arrange(.data$scpca_sample_id)
@@ -676,12 +725,14 @@ make_dataset_data_df <- function(data) {
 #' Get a summary of a custom ScPCA dataset
 #'
 #' Fetches a custom dataset and returns a structured summary of its contents,
-#' including its processing status and a per-sample table describing the modality for
-#' each sample.
+#' including its processing status and a per-sample table describing the modality
+#' of each sample.
 #'
-#' Projects with merged single-cell data (where individual sample IDs are not
-#' enumerated in the dataset record) are excluded from `samples` and listed in
-#' `merged_projects` instead.
+#' For each project, the included samples and their modality details are looked
+#' up from the project's sample records (one request per project), so merged
+#' projects (whose individual sample IDs are not enumerated in the dataset
+#' record) are expanded to all of their single-cell samples. Projects whose
+#' single-cell data is merged are also listed in `merged_projects`.
 #'
 #' @param dataset the dataset UUID string (such as the value returned by
 #'   [create_dataset()]), or a list with an `$id` element (such as the value
@@ -695,19 +746,16 @@ make_dataset_data_df <- function(data) {
 #'   * `status`: the processing status — one of "pending", "processing",
 #'     "succeeded", "failed", or "expired" (see [get_dataset_status()])
 #'   * `n_samples`: the total number of samples in the dataset, taken from the
-#'     API's `total_sample_count`. This includes samples in merged projects,
-#'     which are not enumerated in `samples`, so `n_samples` can exceed
-#'     `nrow(samples)`.
+#'     API's `total_sample_count`
 #'   * `n_projects`: the number of projects in the dataset
-#'   * `samples`: a data frame with one row per sample-modality combination and
-#'     columns `scpca_sample_id`, `scpca_project_id`, and `modality` (character:
-#'     "single-cell" or "spatial")
+#'   * `samples`: a data frame with one row per included sample and columns
+#'     `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character; the
+#'     single-cell sequencing unit "cell" or "nucleus", or `NA` when the sample
+#'     is not included as single-cell), `has_spatial`, `has_bulk` (whether the
+#'     dataset request includes bulk for that sample), `has_cite_seq`, and
+#'     `has_multiplexed` (all logical)
 #'   * `merged_projects`: a character vector of project IDs whose single-cell
 #'     data is merged; `character(0)` when none
-#'   * `bulk_projects`: a character vector of project IDs that include bulk
-#'     RNA-seq data; `character(0)` when none. Bulk inclusion is recorded per
-#'     project rather than per sample, so it is reported here rather than in
-#'     `samples`.
 #'
 #' @import httr2
 #' @export
@@ -728,22 +776,16 @@ get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"
     purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |>
     names() |>
     as.character()
-  bulk_projects <- detail$data |>
-    purrr::keep(\(p) isTRUE(p$includes_bulk)) |>
-    names() |>
-    as.character()
 
   list(
     id = detail$id,
     format = detail$format,
     status = dataset_status_from_detail(detail),
-    # total_sample_count comes from the API and counts all samples, including
-    # those in merged projects that are not enumerated in `samples`.
+    # total_sample_count comes from the API and counts all samples in the dataset.
     n_samples = detail$total_sample_count,
     n_projects = length(detail$data),
     samples = samples,
-    merged_projects = merged_projects,
-    bulk_projects = bulk_projects
+    merged_projects = merged_projects
   )
 }
 

From 5e91a291cdbb2c5d8971d2e1e189da3b6d5f39ac Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 17:43:36 -0400
Subject: [PATCH 14/19] update dataset tests

---
 tests/testthat/test-datasets.R | 189 ++++++++++++++++++++++++++++-----
 1 file changed, 163 insertions(+), 26 deletions(-)

diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index 8d559ea..af45212 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -554,7 +554,7 @@ test_that("get_dataset_status errors when auth_token is empty", {
 
 # get_dataset_info tests
 
-test_that("get_dataset_info returns structured summary with samples data frame", {
+test_that("get_dataset_info builds a per-sample table from project sample data", {
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
       list(
@@ -576,6 +576,32 @@ test_that("get_dataset_info returns structured summary with samples data frame",
         ),
         total_sample_count = 4
       )
+    },
+    get_project_samples = \(project_id, simplify = TRUE) {
+      if (project_id == "SCPCP000001") {
+        # SCPCS000099 belongs to the project but is not in the dataset request
+        tibble::tibble(
+          scpca_sample_id = c("SCPCS000001", "SCPCS000002", "SCPCS000099"),
+          scpca_project_id = project_id,
+          has_single_cell_data = TRUE,
+          has_spatial_data = FALSE,
+          has_bulk_rna_seq = FALSE,
+          has_cite_seq_data = FALSE,
+          has_multiplexed_data = FALSE,
+          seq_units = list("cell", "cell", "cell")
+        )
+      } else {
+        tibble::tibble(
+          scpca_sample_id = c("SCPCS000003", "SCPCS000004"),
+          scpca_project_id = project_id,
+          has_single_cell_data = c(TRUE, FALSE),
+          has_spatial_data = c(FALSE, TRUE),
+          has_bulk_rna_seq = c(TRUE, FALSE),
+          has_cite_seq_data = c(TRUE, FALSE),
+          has_multiplexed_data = c(FALSE, FALSE),
+          seq_units = list(c("cell", "bulk"), "spot")
+        )
+      }
     }
   )
 
@@ -585,25 +611,83 @@ test_that("get_dataset_info returns structured summary with samples data frame",
   expect_equal(info$format, "SINGLE_CELL_EXPERIMENT")
   expect_equal(info$status, "pending")
   expect_equal(info$n_projects, 2)
-  # n_samples comes from the API total_sample_count (here equal to the 4 enumerated rows)
   expect_equal(info$n_samples, 4)
   expect_equal(info$merged_projects, character(0))
+  expect_null(info$bulk_projects)
   expect_s3_class(info$samples, "data.frame")
   expect_setequal(
     colnames(info$samples),
-    c("scpca_sample_id", "scpca_project_id", "modality")
-  )
-  # single-cell and spatial samples are distinct, each with its own modality row
-  expect_equal(
-    info$samples$modality[info$samples$scpca_sample_id == "SCPCS000003"],
-    "single-cell"
+    c(
+      "scpca_sample_id",
+      "scpca_project_id",
+      "seq_unit",
+      "has_spatial",
+      "has_bulk",
+      "has_cite_seq",
+      "has_multiplexed"
+    )
   )
-  expect_equal(
-    info$samples$modality[info$samples$scpca_sample_id == "SCPCS000004"],
-    "spatial"
+  # one row per included sample; the unrequested SCPCS000099 is filtered out
+  expect_equal(nrow(info$samples), 4)
+  expect_false("SCPCS000099" %in% info$samples$scpca_sample_id)
+
+  field <- \(col, id) info$samples[[col]][info$samples$scpca_sample_id == id]
+  # seq_unit is the single-cell unit, or NA for a spatial-only sample
+  expect_equal(field("seq_unit", "SCPCS000001"), "cell")
+  expect_equal(field("seq_unit", "SCPCS000003"), "cell")
+  expect_true(is.na(field("seq_unit", "SCPCS000004")))
+
+  # only requested modalities are reported
+  expect_true(field("has_spatial", "SCPCS000004"))
+  expect_false(field("has_spatial", "SCPCS000001"))
+
+  expect_true(field("has_cite_seq", "SCPCS000003"))
+  expect_false(field("has_cite_seq", "SCPCS000001"))
+
+  # has_bulk reflects the request AND availability
+  expect_true(field("has_bulk", "SCPCS000003")) # requested + available
+  expect_false(field("has_bulk", "SCPCS000001")) # project did not request bulk
+  expect_false(field("has_bulk", "SCPCS000004")) # requested but sample has none
+  expect_false(any(info$samples$has_multiplexed))
+})
+
+test_that("get_dataset_info combines modalities for a sample included as single-cell and spatial", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001"),
+            SPATIAL = list("SCPCS000001"),
+            includes_bulk = FALSE
+          )
+        ),
+        total_sample_count = 1
+      )
+    },
+    get_project_samples = \(project_id, simplify = TRUE) {
+      tibble::tibble(
+        scpca_sample_id = "SCPCS000001",
+        scpca_project_id = project_id,
+        has_single_cell_data = TRUE,
+        has_spatial_data = TRUE,
+        has_bulk_rna_seq = FALSE,
+        has_cite_seq_data = FALSE,
+        has_multiplexed_data = FALSE,
+        seq_units = list(c("cell", "spot"))
+      )
+    }
   )
-  # bulk inclusion is reported per project, not per sample
-  expect_equal(info$bulk_projects, "SCPCP000002")
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+
+  # one row for the sample: single-cell unit plus spatial
+  expect_equal(nrow(info$samples), 1)
+  expect_equal(info$samples$seq_unit, "cell")
+  expect_true(info$samples$has_spatial)
 })
 
 test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", {
@@ -624,14 +708,22 @@ test_that("get_dataset_info returns empty samples data frame with correct schema
   expect_equal(info$n_samples, 0)
   expect_equal(info$n_projects, 0)
   expect_equal(nrow(info$samples), 0)
-  expect_equal(info$bulk_projects, character(0))
+  expect_null(info$bulk_projects)
   expect_setequal(
     colnames(info$samples),
-    c("scpca_sample_id", "scpca_project_id", "modality")
+    c(
+      "scpca_sample_id",
+      "scpca_project_id",
+      "seq_unit",
+      "has_spatial",
+      "has_bulk",
+      "has_cite_seq",
+      "has_multiplexed"
+    )
   )
 })
 
-test_that("get_dataset_info surfaces merged projects separately and excludes them from samples", {
+test_that("get_dataset_info expands merged projects to all their single-cell samples", {
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
       list(
@@ -650,23 +742,55 @@ test_that("get_dataset_info surfaces merged projects separately and excludes the
             includes_bulk = FALSE
           )
         ),
-        # SCPCP000001 contributes 1 enumerated sample; the merged SCPCP000005
-        # contributes 3 samples that are not enumerated in `data`
         total_sample_count = 4
       )
+    },
+    get_project_samples = \(project_id, simplify = TRUE) {
+      if (project_id == "SCPCP000001") {
+        tibble::tibble(
+          scpca_sample_id = "SCPCS000001",
+          scpca_project_id = project_id,
+          has_single_cell_data = TRUE,
+          has_spatial_data = FALSE,
+          has_bulk_rna_seq = FALSE,
+          has_cite_seq_data = FALSE,
+          has_multiplexed_data = FALSE,
+          seq_units = list("cell")
+        )
+      } else {
+        # merged project: all single-cell samples are included; the
+        # non-single-cell SCPCS000053 is not
+        tibble::tibble(
+          scpca_sample_id = c("SCPCS000050", "SCPCS000051", "SCPCS000052", "SCPCS000053"),
+          scpca_project_id = project_id,
+          has_single_cell_data = c(TRUE, TRUE, TRUE, FALSE),
+          has_spatial_data = c(FALSE, FALSE, FALSE, TRUE),
+          has_bulk_rna_seq = FALSE,
+          has_cite_seq_data = FALSE,
+          has_multiplexed_data = FALSE,
+          seq_units = list("cell", "cell", "nucleus", "spot")
+        )
+      }
     }
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
 
-  # merged project's samples are not enumerated in the samples table
-  expect_equal(nrow(info$samples), 1)
-  expect_equal(info$samples$scpca_sample_id, "SCPCS000001")
-  # but n_samples uses the API total_sample_count, which counts them
-  expect_equal(info$n_samples, 4)
-  # merged project counted in n_projects and surfaced in merged_projects
-  expect_equal(info$n_projects, 2)
+  # merged project still surfaced in merged_projects
   expect_equal(info$merged_projects, "SCPCP000005")
+  # its single-cell samples are expanded into the table; SCPCS000053 is excluded
+  expect_setequal(
+    info$samples$scpca_sample_id,
+    c("SCPCS000001", "SCPCS000050", "SCPCS000051", "SCPCS000052")
+  )
+  expect_false("SCPCS000053" %in% info$samples$scpca_sample_id)
+  # the nucleus seq_unit is reported for that sample
+  expect_equal(
+    info$samples$seq_unit[info$samples$scpca_sample_id == "SCPCS000052"],
+    "nucleus"
+  )
+  expect_equal(info$n_projects, 2)
+  expect_equal(info$n_samples, 4)
 })
 
 test_that("get_dataset_info derives status from detail without a second API call", {
@@ -691,7 +815,7 @@ test_that("get_dataset_info derives status from detail without a second API call
   expect_equal(info$status, "succeeded")
 })
 
-test_that("get_dataset_info prunes projects where both modality lists are empty", {
+test_that("get_dataset_info prunes projects where nothing is requested", {
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
       list(
@@ -712,6 +836,19 @@ test_that("get_dataset_info prunes projects where both modality lists are empty"
         ),
         total_sample_count = 1
       )
+    },
+    # SCPCP000002 requests nothing, so only SCPCP000001 is expected to be queried (not enforced by this mock)
+    get_project_samples = \(project_id, simplify = TRUE) {
+      tibble::tibble(
+        scpca_sample_id = "SCPCS000001",
+        scpca_project_id = project_id,
+        has_single_cell_data = TRUE,
+        has_spatial_data = FALSE,
+        has_bulk_rna_seq = FALSE,
+        has_cite_seq_data = FALSE,
+        has_multiplexed_data = FALSE,
+        seq_units = list("cell")
+      )
     }
   )
 

From 8ce07981a15f028f2e8ce4dd593e3a5bea3aa5c3 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 17:59:11 -0400
Subject: [PATCH 15/19] update docs

---
 R/datasets.R                | 14 ++++++++------
 man/get_dataset_info.Rd     | 29 +++++++++++++++++++----------
 man/make_dataset_data_df.Rd | 22 ++++++++++++++++------
 3 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 8503c55..766520d 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -748,12 +748,14 @@ make_dataset_data_df <- function(data) {
 #'   * `n_samples`: the total number of samples in the dataset, taken from the
 #'     API's `total_sample_count`
 #'   * `n_projects`: the number of projects in the dataset
-#'   * `samples`: a data frame with one row per included sample and columns
-#'     `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character; the
-#'     single-cell sequencing unit "cell" or "nucleus", or `NA` when the sample
-#'     is not included as single-cell), `has_spatial`, `has_bulk` (whether the
-#'     dataset request includes bulk for that sample), `has_cite_seq`, and
-#'     `has_multiplexed` (all logical)
+#'   * `samples`: a data frame with one row per included sample and the following columns:
+#'     - `scpca_sample_id`
+#'     - `scpca_project_id`
+#'     - `seq_unit` ("cell" or "nucleus", or `NA` if the sample is not included as single-cell)
+#'     - `has_spatial`
+#'     - `has_bulk`
+#'     - `has_cite_seq`
+#'     - `has_multiplexed`
 #'   * `merged_projects`: a character vector of project IDs whose single-cell
 #'     data is merged; `character(0)` when none
 #'
diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd
index c73a65a..d4d9753 100644
--- a/man/get_dataset_info.Rd
+++ b/man/get_dataset_info.Rd
@@ -21,25 +21,34 @@ a named list with the following elements:
 \item \code{format}: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA")
 \item \code{status}: the processing status — one of "pending", "processing",
 "succeeded", "failed", or "expired" (see \code{\link[=get_dataset_status]{get_dataset_status()}})
-\item \code{n_samples}: the number of rows in \code{samples} (one per sample-modality
-combination; merged-single-cell projects are not counted)
+\item \code{n_samples}: the total number of samples in the dataset, taken from the
+API's \code{total_sample_count}
 \item \code{n_projects}: the number of projects in the dataset
-\item \code{samples}: a data frame with one row per sample-modality combination and
-columns \code{scpca_sample_id}, \code{scpca_project_id}, \code{modality} (character:
-"single-cell" or "spatial"), and \code{includes_bulk} (logical)
+\item \code{samples}: a data frame with one row per included sample and the following columns:
+\itemize{
+\item \code{scpca_sample_id}
+\item \code{scpca_project_id}
+\item \code{seq_unit} ("cell" or "nucleus", or \code{NA} if the sample is not included as single-cell)
+\item \code{has_spatial}
+\item \code{has_bulk}
+\item \code{has_cite_seq}
+\item \code{has_multiplexed}
+}
 \item \code{merged_projects}: a character vector of project IDs whose single-cell
 data is merged; \code{character(0)} when none
 }
 }
 \description{
 Fetches a custom dataset and returns a structured summary of its contents,
-including its processing status and a per-sample table describing the modality for
-each sample.
+including its processing status and a per-sample table describing the modality
+of each sample.
 }
 \details{
-Projects with merged single-cell data (where individual sample IDs are not
-enumerated in the dataset record) are excluded from \code{samples} and listed in
-\code{merged_projects} instead.
+For each project, the included samples and their modality details are looked
+up from the project's sample records (one request per project), so merged
+projects (whose individual sample IDs are not enumerated in the dataset
+record) are expanded to all of their single-cell samples. Projects whose
+single-cell data is merged are also listed in \code{merged_projects}.
 }
 \examples{
 \dontrun{
diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd
index 583bba4..73d59d9 100644
--- a/man/make_dataset_data_df.Rd
+++ b/man/make_dataset_data_df.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/datasets.R
 \name{make_dataset_data_df}
 \alias{make_dataset_data_df}
-\title{Make a per-sample data frame from the \verb{$data} list}
+\title{Build the per-sample data frame for a dataset}
 \usage{
 make_dataset_data_df(data)
 }
@@ -10,12 +10,22 @@ make_dataset_data_df(data)
 \item{data}{the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}}}
 }
 \value{
-a data frame with columns \code{scpca_sample_id}, \code{scpca_project_id},
-\code{modality}, and \code{includes_bulk}
+a data frame with one row per included sample and columns
+\code{scpca_sample_id}, \code{scpca_project_id}, \code{seq_unit} (character: "cell",
+"nucleus", or \code{NA}), \code{has_spatial}, \code{has_bulk}, \code{has_cite_seq}, and
+\code{has_multiplexed} (all logical)
 }
 \description{
-Transforms the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}} into a
-one-row-per-sample data frame. Projects with merged single-cell data
-(\code{SINGLE_CELL = "MERGED"}) are excluded.
+For each project in the dataset \verb{$data} list, fetches the project's sample
+metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples the dataset includes:
+for a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL},
+and for a merged project, all of the project's single-cell samples.
+Each modality is reported only when it is requested for the sample:
+\code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the
+sample is not included as single-cell),
+\code{has_spatial} marks spatial inclusion
+\code{has_bulk} reflects the project's \code{includes_bulk} request
+intersected with whether the sample actually has bulk data.
+\code{has_cite_seq} and \code{has_multiplexed} come from the sample records.
 }
 \keyword{internal}

From 3c8a7b4c91e41c9c0803e6e588e7d67374bfd76e Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Tue, 9 Jun 2026 21:18:48 -0400
Subject: [PATCH 16/19] re-expand add and remove tests.

---
 tests/testthat/test-datasets.R | 116 ++++++++++++++++++++++++++++++---
 1 file changed, 107 insertions(+), 9 deletions(-)

diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index af45212..8adbce9 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -1227,29 +1227,127 @@ test_that("remove_from_dataset_data drops whole projects", {
   expect_equal(names(result), "SCPCP000001")
 })
 
+test_that("remove_from_dataset_data errors when removing a sample from a merged project", {
+  existing <- list(
+    SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE)
+  )
+
+  expect_error(
+    remove_from_dataset_data(existing, samples = "SCPCS000001"),
+    "merged single-cell data"
+  )
+})
+
+test_that("remove_from_dataset_data can drop a merged project wholesale", {
+  existing <- list(
+    SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE),
+    SCPCP000002 = list(SINGLE_CELL = list("SCPCS000003"), SPATIAL = list(), includes_bulk = FALSE)
+  )
+
+  result <- remove_from_dataset_data(existing, projects = "SCPCP000001")
+  expect_equal(names(result), "SCPCP000002")
+})
+
 # add_dataset_samples / remove_dataset_samples tests
 
-test_that("add_dataset_samples PUTs", {
+test_that("add_dataset_samples PUTs the merged data", {
+  captured_req <- NULL
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
-      list(id = DATASET_ID, data = list())
+      list(
+        id = DATASET_ID,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        )
+      )
     },
-    build_dataset_data = \(...) list(),
-    req_perform = \(req, ...) json_response(list(id = DATASET_ID))
+    # additions: one sample for the existing project, plus a brand-new project
+    build_dataset_data = \(samples = NULL, projects = NULL, include_bulk = FALSE) {
+      list(
+        SCPCP000001 = list(
+          SINGLE_CELL = list("SCPCS000002"),
+          SPATIAL = list(),
+          includes_bulk = include_bulk
+        ),
+        SCPCP000002 = list(
+          SINGLE_CELL = list("SCPCS000003"),
+          SPATIAL = list(),
+          includes_bulk = include_bulk
+        )
+      )
+    },
+    req_perform = \(req, ...) {
+      captured_req <<- req
+      json_response(req$body$data)
+    }
+  )
+
+  result <- add_dataset_samples(
+    DATASET_ID,
+    auth_token = "token",
+    samples = c("SCPCS000002", "SCPCS000003"),
+    include_bulk = TRUE
   )
 
-  result <- add_dataset_samples(DATASET_ID, auth_token = "token", samples = "SCPCS000002")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
+  expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
+
+  sent_data <- captured_req$body$data$data
+  expect_setequal(names(sent_data), c("SCPCP000001", "SCPCP000002"))
+  # existing project gains the new sample as a union of old and added IDs
+  expect_setequal(
+    as.character(sent_data$SCPCP000001$SINGLE_CELL),
+    c("SCPCS000001", "SCPCS000002")
+  )
+  # include_bulk applies to the newly added project but not the existing one
+  expect_false(sent_data$SCPCP000001$includes_bulk)
+  expect_true(sent_data$SCPCP000002$includes_bulk)
   expect_equal(result, DATASET_ID)
 })
 
-test_that("remove_dataset_samples PUTs", {
+test_that("remove_dataset_samples PUTs the reduced data", {
+  captured_req <- NULL
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
-      list(id = DATASET_ID, data = list())
+      list(
+        id = DATASET_ID,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          ),
+          SCPCP000002 = list(
+            SINGLE_CELL = list("SCPCS000003"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        )
+      )
     },
-    req_perform = \(req, ...) json_response(list(id = DATASET_ID))
+    req_perform = \(req, ...) {
+      captured_req <<- req
+      json_response(req$body$data)
+    }
   )
 
-  result <- remove_dataset_samples(DATASET_ID, auth_token = "token", projects = "SCPCP000002")
+  result <- remove_dataset_samples(
+    DATASET_ID,
+    auth_token = "token",
+    samples = "SCPCS000002",
+    projects = "SCPCP000002"
+  )
+
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
+  expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
+
+  # SCPCP000002 dropped wholesale; SCPCP000001 keeps only the un-removed sample
+  sent_data <- captured_req$body$data$data
+  expect_equal(names(sent_data), "SCPCP000001")
+  expect_equal(as.character(sent_data$SCPCP000001$SINGLE_CELL), "SCPCS000001")
   expect_equal(result, DATASET_ID)
 })

From 4ca9ef61f1a6d56bec183fc9d4845b7e6139133b Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Thu, 11 Jun 2026 10:11:07 -0400
Subject: [PATCH 17/19] Apply suggestions from code review

Co-authored-by: Stephanie J. Spielman <stephanie.spielman@gmail.com>
---
 R/datasets.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 766520d..27c2088 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -622,8 +622,8 @@ remove_dataset_samples <- function(
 #' Build the per-sample data frame for a dataset
 #'
 #' For each project in the dataset `$data` list, fetches the project's sample
-#' metadata with [get_project_samples()] and keeps only the samples the dataset includes:
-#' for a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`,
+#' metadata with [get_project_samples()] and keeps only the samples that the dataset includes:
+#' For a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`,
 #' and for a merged project, all of the project's single-cell samples.
 #' Each modality is reported only when it is requested for the sample:
 #' `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the
@@ -664,7 +664,7 @@ make_dataset_data_df <- function(data) {
       project_samples <- get_project_samples(project_id, simplify = FALSE)
 
       # Get single cell samples for the project:
-      # - if merged from the projeect_samples metadata
+      # - if merged from the project_samples metadata
       # - if not merged, from the request list.
       if (merged) {
         single_cell_ids <- project_samples$scpca_sample_id[

From 477c187b9d18ecbc32a2445df7a81428fbde245b Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Thu, 11 Jun 2026 10:24:44 -0400
Subject: [PATCH 18/19] change $samples to $sample_info in get_dataset_info

update tests to match (and be more conservative on the slot name)
---
 R/datasets.R                   | 18 ++++++++--------
 tests/testthat/test-datasets.R | 39 +++++++++++++++++++---------------
 2 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 27c2088..69fe444 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -626,12 +626,12 @@ remove_dataset_samples <- function(
 #' For a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`,
 #' and for a merged project, all of the project's single-cell samples.
 #' Each modality is reported only when it is requested for the sample:
-#' `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the
+#' - `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the
 #' sample is not included as single-cell),
-#' `has_spatial` marks spatial inclusion
-#' `has_bulk` reflects the project's `includes_bulk` request
-#' intersected with whether the sample actually has bulk data.
-#' `has_cite_seq` and `has_multiplexed` come from the sample records.
+#' - `has_spatial` marks spatial inclusion.
+#' - `has_bulk` reflects the project's `includes_bulk` request
+#'    intersected with whether the sample actually has bulk data.
+#' - `has_cite_seq` and `has_multiplexed` come from the sample records.
 #'
 #' @param data the project-keyed `$data` list from [get_dataset_detail()]
 #'
@@ -748,7 +748,7 @@ make_dataset_data_df <- function(data) {
 #'   * `n_samples`: the total number of samples in the dataset, taken from the
 #'     API's `total_sample_count`
 #'   * `n_projects`: the number of projects in the dataset
-#'   * `samples`: a data frame with one row per included sample and the following columns:
+#'   * `sample_info`: a data frame with one row per included sample and the following columns:
 #'     - `scpca_sample_id`
 #'     - `scpca_project_id`
 #'     - `seq_unit` ("cell" or "nucleus", or `NA` if the sample is not included as single-cell)
@@ -767,13 +767,13 @@ make_dataset_data_df <- function(data) {
 #' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
 #' info <- get_dataset_info(ds_id)
 #' info$status
-#' info$samples
+#' info$sample_info
 #' }
 get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) {
   auth_token <- resolve_auth_token(auth_token)
   detail <- get_dataset_detail(dataset, auth_token)
 
-  samples <- make_dataset_data_df(detail$data)
+  samples_df <- make_dataset_data_df(detail$data)
   merged_projects <- detail$data |>
     purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |>
     names() |>
@@ -786,7 +786,7 @@ get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"
     # total_sample_count comes from the API and counts all samples in the dataset.
     n_samples = detail$total_sample_count,
     n_projects = length(detail$data),
-    samples = samples,
+    sample_info = samples_df,
     merged_projects = merged_projects
   )
 }
diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index 8adbce9..e3c6ef4 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -606,6 +606,7 @@ test_that("get_dataset_info builds a per-sample table from project sample data",
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
 
   expect_equal(info$id, DATASET_ID)
   expect_equal(info$format, "SINGLE_CELL_EXPERIMENT")
@@ -614,9 +615,9 @@ test_that("get_dataset_info builds a per-sample table from project sample data",
   expect_equal(info$n_samples, 4)
   expect_equal(info$merged_projects, character(0))
   expect_null(info$bulk_projects)
-  expect_s3_class(info$samples, "data.frame")
+  expect_s3_class(sample_info, "data.frame")
   expect_setequal(
-    colnames(info$samples),
+    colnames(sample_info),
     c(
       "scpca_sample_id",
       "scpca_project_id",
@@ -628,10 +629,10 @@ test_that("get_dataset_info builds a per-sample table from project sample data",
     )
   )
   # one row per included sample; the unrequested SCPCS000099 is filtered out
-  expect_equal(nrow(info$samples), 4)
-  expect_false("SCPCS000099" %in% info$samples$scpca_sample_id)
+  expect_equal(nrow(sample_info), 4)
+  expect_false("SCPCS000099" %in% sample_info$scpca_sample_id)
 
-  field <- \(col, id) info$samples[[col]][info$samples$scpca_sample_id == id]
+  field <- \(col, id) sample_info[[col]][sample_info$scpca_sample_id == id]
   # seq_unit is the single-cell unit, or NA for a spatial-only sample
   expect_equal(field("seq_unit", "SCPCS000001"), "cell")
   expect_equal(field("seq_unit", "SCPCS000003"), "cell")
@@ -648,7 +649,7 @@ test_that("get_dataset_info builds a per-sample table from project sample data",
   expect_true(field("has_bulk", "SCPCS000003")) # requested + available
   expect_false(field("has_bulk", "SCPCS000001")) # project did not request bulk
   expect_false(field("has_bulk", "SCPCS000004")) # requested but sample has none
-  expect_false(any(info$samples$has_multiplexed))
+  expect_false(any(sample_info$has_multiplexed))
 })
 
 test_that("get_dataset_info combines modalities for a sample included as single-cell and spatial", {
@@ -683,11 +684,12 @@ test_that("get_dataset_info combines modalities for a sample included as single-
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
 
   # one row for the sample: single-cell unit plus spatial
-  expect_equal(nrow(info$samples), 1)
-  expect_equal(info$samples$seq_unit, "cell")
-  expect_true(info$samples$has_spatial)
+  expect_equal(nrow(sample_info), 1)
+  expect_equal(sample_info$seq_unit, "cell")
+  expect_true(sample_info$has_spatial)
 })
 
 test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", {
@@ -704,13 +706,14 @@ test_that("get_dataset_info returns empty samples data frame with correct schema
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
 
   expect_equal(info$n_samples, 0)
   expect_equal(info$n_projects, 0)
-  expect_equal(nrow(info$samples), 0)
+  expect_equal(nrow(sample_info), 0)
   expect_null(info$bulk_projects)
   expect_setequal(
-    colnames(info$samples),
+    colnames(sample_info),
     c(
       "scpca_sample_id",
       "scpca_project_id",
@@ -775,18 +778,19 @@ test_that("get_dataset_info expands merged projects to all their single-cell sam
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
 
   # merged project still surfaced in merged_projects
   expect_equal(info$merged_projects, "SCPCP000005")
   # its single-cell samples are expanded into the table; SCPCS000053 is excluded
   expect_setequal(
-    info$samples$scpca_sample_id,
+    sample_info$scpca_sample_id,
     c("SCPCS000001", "SCPCS000050", "SCPCS000051", "SCPCS000052")
   )
-  expect_false("SCPCS000053" %in% info$samples$scpca_sample_id)
+  expect_false("SCPCS000053" %in% sample_info$scpca_sample_id)
   # the nucleus seq_unit is reported for that sample
   expect_equal(
-    info$samples$seq_unit[info$samples$scpca_sample_id == "SCPCS000052"],
+    sample_info$seq_unit[sample_info$scpca_sample_id == "SCPCS000052"],
     "nucleus"
   )
   expect_equal(info$n_projects, 2)
@@ -853,10 +857,11 @@ test_that("get_dataset_info prunes projects where nothing is requested", {
   )
 
   info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
 
-  expect_equal(nrow(info$samples), 1)
-  expect_equal(info$samples$scpca_project_id, "SCPCP000001")
-  expect_false("SCPCP000002" %in% info$samples$scpca_project_id)
+  expect_equal(nrow(sample_info), 1)
+  expect_equal(sample_info$scpca_project_id, "SCPCP000001")
+  expect_false("SCPCP000002" %in% sample_info$scpca_project_id)
 })
 
 test_that("get_dataset_info errors when auth_token is empty", {

From 47048423f0ecf5f8f9e3224e3e8e0dfa7107de97 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Thu, 11 Jun 2026 10:32:18 -0400
Subject: [PATCH 19/19] docs updates

---
 R/datasets.R                | 13 +++++++------
 man/get_dataset_info.Rd     |  4 ++--
 man/make_dataset_data_df.Rd | 21 ++++++++++++---------
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 69fe444..e4a7c9b 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -625,13 +625,14 @@ remove_dataset_samples <- function(
 #' metadata with [get_project_samples()] and keeps only the samples that the dataset includes:
 #' For a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`,
 #' and for a merged project, all of the project's single-cell samples.
-#' Each modality is reported only when it is requested for the sample:
+#' Each modality flag is reported as `TRUE` only for samples that are both included in the dataset
+#' and actually have that modality available:
 #' - `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the
-#' sample is not included as single-cell),
-#' - `has_spatial` marks spatial inclusion
-#' - `has_bulk` reflects the project's `includes_bulk` request
-#'    intersected with whether the sample actually has bulk data.
-#' - `has_cite_seq` and `has_multiplexed` come from the sample records.
+#' sample is not included as single-cell)
+#' - `has_spatial` marks spatial inclusion, if requested, for the sample or project
+#' - `has_bulk` indicates that the sample is present in the bulk data table, if bulk data was requested for its project
+#' - `has_cite_seq` and `has_multiplexed` come from the sample records
+#'    and do not depend on the specific request
 #'
 #' @param data the project-keyed `$data` list from [get_dataset_detail()]
 #'
diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd
index d4d9753..c032595 100644
--- a/man/get_dataset_info.Rd
+++ b/man/get_dataset_info.Rd
@@ -24,7 +24,7 @@ a named list with the following elements:
 \item \code{n_samples}: the total number of samples in the dataset, taken from the
 API's \code{total_sample_count}
 \item \code{n_projects}: the number of projects in the dataset
-\item \code{samples}: a data frame with one row per included sample and the following columns:
+\item \code{sample_info}: a data frame with one row per included sample and the following columns:
 \itemize{
 \item \code{scpca_sample_id}
 \item \code{scpca_project_id}
@@ -55,6 +55,6 @@ single-cell data is merged are also listed in \code{merged_projects}.
 ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
 info <- get_dataset_info(ds_id)
 info$status
-info$samples
+info$sample_info
 }
 }
diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd
index 73d59d9..4c55760 100644
--- a/man/make_dataset_data_df.Rd
+++ b/man/make_dataset_data_df.Rd
@@ -17,15 +17,18 @@ a data frame with one row per included sample and columns
 }
 \description{
 For each project in the dataset \verb{$data} list, fetches the project's sample
-metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples the dataset includes:
-for a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL},
+metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples that the dataset includes:
+For a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL},
 and for a merged project, all of the project's single-cell samples.
-Each modality is reported only when it is requested for the sample:
-\code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the
-sample is not included as single-cell),
-\code{has_spatial} marks spatial inclusion
-\code{has_bulk} reflects the project's \code{includes_bulk} request
-intersected with whether the sample actually has bulk data.
-\code{has_cite_seq} and \code{has_multiplexed} come from the sample records.
+Each modality flag is reported as \code{TRUE} only for samples that are both included in the dataset
+and actually have that modality available:
+\itemize{
+\item \code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the
+sample is not included as single-cell)
+\item \code{has_spatial} marks spatial inclusion, if requested, for the sample or project
+\item \code{has_bulk} indicates that the sample is present in the bulk data table, if bulk data was requested for its project
+\item \code{has_cite_seq} and \code{has_multiplexed} come from the sample records
+and do not depend on the specific request
+}
 }
 \keyword{internal}