diff --git a/.Rbuildignore b/.Rbuildignore
index d3bd575..a90605e 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -2,6 +2,7 @@
 ^LICENSE\.md$
 ^ScPCAr\.Rproj$
 ^\.Rproj\.user$
+^\.claude$
 ^\.github$
 ^\.pre-commit-config\.yaml$
 ^_pkgdown\.yml$
diff --git a/NAMESPACE b/NAMESPACE
index 28593d4..66eb4ea 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(download_dataset)
 export(download_project)
 export(download_sample)
 export(get_auth)
+export(get_dataset_info)
 export(get_dataset_status)
 export(get_project_info)
 export(get_project_libraries)
diff --git a/R/datasets.R b/R/datasets.R
index db66829..e4a7c9b 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -47,15 +47,16 @@ build_dataset_data <- function(samples = NULL, projects = NULL, include_bulk = F
 
 #' Resolve a dataset identifier to its ID string
 #'
-#' Accepts either a dataset UUID string or a list with an `$id` element (such as
-#' the return value of [create_dataset()] or [get_dataset_detail()]) and returns
-#' the ID string, after checking that it is a valid UUID.
+#' Accepts either a dataset UUID string (such as the value returned by
+#' [create_dataset()]) or a list with an `$id` element (such as the value returned
+#' by [get_dataset_detail()]) and returns the ID string, after checking that it is
+#' a valid UUID.
 #'
 #' @param dataset a dataset UUID string, or a list with an `$id` element
 #'
 #' @keywords internal
 #'
-#' @returns the dataset ID as a length-1 character string
+#' @returns the dataset ID as a character string
 resolve_dataset_id <- function(dataset) {
   if (is.list(dataset)) {
     stopifnot("dataset must be an id string or contain an $id element" = !is.null(dataset$id))
@@ -112,7 +113,9 @@ update_dataset <- function(dataset_id, body, auth_token) {
 #' Create a custom dataset on the ScPCA Portal
 #'
 #' Creates a new user dataset without starting processing.
-#' The returned list includes the dataset `$id` along with its current contents and status.
+#' Returns the new dataset's ID (invisibly), which you can pass to the other
+#' dataset functions such as [get_dataset_info()], [add_dataset_samples()], and
+#' [start_dataset_processing()].
 #'
 #' @param samples optional character vector of ScPCA sample IDs (e.g. "SCPCS000001")
 #' @param projects optional character vector of ScPCA project IDs (e.g. "SCPCP000001");
@@ -125,7 +128,7 @@ update_dataset <- function(dataset_id, body, auth_token) {
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the API response as a list (invisibly), including the dataset `$id`
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @import httr2
 #' @export
@@ -133,11 +136,11 @@ update_dataset <- function(dataset_id, body, auth_token) {
 #' @examples
 #' \dontrun{
 #' token <- get_auth("user@example.com", agree = TRUE)
-#' ds <- create_dataset(
+#' ds_id <- create_dataset(
 #'   auth_token = token,
 #'   samples = c("SCPCS000001", "SCPCS000002")
 #' )
-#' ds$id
+#' ds_id
 #' }
 create_dataset <- function(
   samples = NULL,
@@ -179,7 +182,7 @@ create_dataset <- function(
     resp_body_json()
 
   message(glue::glue("ScPCA dataset {response$id} created."))
-  invisible(response)
+  invisible(response$id)
 }
 
 
@@ -193,8 +196,9 @@ create_dataset <- function(
 #' it is also used by the dataset modification functions to fetch current
 #' contents before updating.
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param auth_token an authorization token obtained from [get_auth()];
 #'  must match the token used to create the dataset.
 #'
@@ -218,6 +222,27 @@ get_dataset_detail <- function(dataset, auth_token) {
     resp_body_json()
 }
 
+#' Map dataset detail status flags to a status string
+#'
+#' @param detail the dataset detail list returned by [get_dataset_detail()]
+#'
+#' @keywords internal
+#'
+#' @returns a single character string: one of "pending", "processing",
+#'   "succeeded", "failed", or "expired"
+dataset_status_from_detail <- function(detail) {
+  # Flags are checked in priority order: the first flag that is TRUE decides
+  # the status, so "failed" and "expired" take precedence over "succeeded".
+  if (isTRUE(detail$is_failed)) return("failed")
+  if (isTRUE(detail$is_expired)) return("expired")
+  if (isTRUE(detail$is_succeeded)) return("succeeded")
+
+  # a dataset that is processing or has been started is reported as "processing"
+  in_progress <- isTRUE(detail$is_processing) || isTRUE(detail$is_started)
+  if (in_progress) return("processing")
+
+  "pending"
+}
 
 #' Get the processing status of a custom dataset
 #'
@@ -233,8 +258,9 @@ get_dataset_detail <- function(dataset, auth_token) {
 #'   expired and must be regenerated
 #' * `"failed"`: processing failed
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element,
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
@@ -246,22 +272,12 @@ get_dataset_detail <- function(dataset, auth_token) {
 #'
 #' @examples
 #' \dontrun{
-#' get_dataset_status(ds)
+#' get_dataset_status(ds_id)
 #' }
 get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) {
   auth_token <- resolve_auth_token(auth_token)
   detail <- get_dataset_detail(dataset, auth_token)
-  if (isTRUE(detail$is_failed)) {
-    "failed"
-  } else if (isTRUE(detail$is_expired)) {
-    "expired"
-  } else if (isTRUE(detail$is_succeeded)) {
-    "succeeded"
-  } else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) {
-    "processing"
-  } else {
-    "pending"
-  }
+  dataset_status_from_detail(detail)
 }
 
 
@@ -283,13 +299,13 @@ get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKE
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly)
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002"))
+#' replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002"))
 #' }
 replace_dataset_data <- function(
   dataset,
@@ -308,8 +324,8 @@ replace_dataset_data <- function(
 
   data <- build_dataset_data(samples = samples, projects = projects, include_bulk = include_bulk)
 
-  response <- update_dataset(dataset_id, list(data = data), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(data = data), auth_token = auth_token)
+  invisible(dataset_id)
 }
 
 
@@ -326,13 +342,13 @@ replace_dataset_data <- function(
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly)
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' set_dataset_email(ds, email = "user@example.com")
+#' set_dataset_email(ds_id, email = "user@example.com")
 #' }
 set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) {
   auth_token <- resolve_auth_token(auth_token)
@@ -343,8 +359,8 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT
   )
   dataset_id <- resolve_dataset_id(dataset)
 
-  response <- update_dataset(dataset_id, list(email = email), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(email = email), auth_token = auth_token)
+  invisible(dataset_id)
 }
 
 
@@ -362,24 +378,23 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT
 #' * A `"processing"` or `"succeeded"` dataset is already underway or done;
 #'   a message is emitted and no request is sent.
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element,
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param email optional email address for the download notification. When
 #'   supplied, it is set as part of the same request that starts processing.
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly) when a request is
-#'   sent, or `NULL` (invisibly) when the dataset is already processing or
-#'   completed.
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @import httr2
 #' @export
 #'
 #' @examples
 #' \dontrun{
-#' ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
-#' start_dataset_processing(ds, email = "user@example.com")
+#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+#' start_dataset_processing(ds_id, email = "user@example.com")
 #' }
 start_dataset_processing <- function(
   dataset,
@@ -401,11 +416,11 @@ start_dataset_processing <- function(
   # continue without message for "pending" or "expired"
   if (status == "processing") {
     message(glue::glue("ScPCA dataset {dataset_id} is already processing."))
-    return(invisible(NULL))
+    return(invisible(dataset_id))
   }
   if (status == "succeeded") {
     message(glue::glue("ScPCA dataset {dataset_id} has already completed processing."))
-    return(invisible(NULL))
+    return(invisible(dataset_id))
   }
   if (status == "failed") {
     warning(
@@ -419,9 +434,9 @@ start_dataset_processing <- function(
     body$email <- email
   }
 
-  response <- update_dataset(dataset_id, body, auth_token = auth_token)
+  update_dataset(dataset_id, body, auth_token = auth_token)
   message(glue::glue("ScPCA dataset {dataset_id} processing started."))
-  invisible(response)
+  invisible(dataset_id)
 }
 
 
@@ -540,7 +555,7 @@ remove_from_dataset_data <- function(existing, samples = NULL, projects = NULL)
 #' @param auth_token an authorization token from [get_auth()]. Defaults to the
 #'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
 #'
-#' @returns the updated dataset detail as a list (invisibly)
+#' @returns the dataset ID as a character string (invisibly)
 #'
 #' @rdname modify_dataset_samples
 #' @export
@@ -576,8 +591,8 @@ add_dataset_samples <- function(
   )
   new_data <- merge_dataset_data(current$data, additions, include_bulk = include_bulk)
 
-  response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
+  invisible(dataset_id)
 }
 
 
@@ -599,8 +614,182 @@ remove_dataset_samples <- function(
   current <- get_dataset_detail(dataset_id, auth_token = auth_token)
   new_data <- remove_from_dataset_data(current$data, samples = samples, projects = projects)
 
-  response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
-  invisible(response)
+  update_dataset(dataset_id, list(data = new_data), auth_token = auth_token)
+  invisible(dataset_id)
+}
+
+
+#' Build the per-sample data frame for a dataset
+#'
+#' For each project in the dataset `$data` list, fetches the project's sample
+#' metadata with [get_project_samples()] and keeps only the samples that the dataset includes:
+#' for a "regular" project, the IDs listed under `SINGLE_CELL`/`SPATIAL`;
+#' for a merged project, all of the project's single-cell samples plus any IDs listed under `SPATIAL`.
+#' The remaining columns describe each included sample; some reflect what the dataset
+#' requests and others what the sample records report:
+#' - `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the
+#' sample is not included as single-cell)
+#' - `has_spatial` marks spatial inclusion, if requested, for the sample or project
+#' - `has_bulk` indicates that the sample is present in the bulk data table, if requested for a project.
+#' - `has_cite_seq` and `has_multiplexed` come from the sample records
+#'    and do not depend on the specific request
+#'
+#' @param data the project-keyed `$data` list from [get_dataset_detail()]
+#'
+#' @keywords internal
+#' @importFrom dplyr .data
+#'
+#' @returns a data frame with one row per included sample and columns
+#'   `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character: "cell",
+#'   "nucleus", or `NA`), `has_spatial`, `has_bulk`, `has_cite_seq`, and
+#'   `has_multiplexed` (all logical)
+make_dataset_data_df <- function(data) {
+  empty <- tibble::tibble(
+    scpca_sample_id = character(),
+    scpca_project_id = character(),
+    seq_unit = character(),
+    has_spatial = logical(),
+    has_bulk = logical(),
+    has_cite_seq = logical(),
+    has_multiplexed = logical()
+  )
+  if (length(data) == 0) {
+    return(empty)
+  }
+
+  result <- data |>
+    purrr::imap(\(project, project_id) {
+      merged <- identical(project$SINGLE_CELL, "MERGED")
+
+      # The project's sample metadata has the modality details we will need.
+      project_samples <- get_project_samples(project_id, simplify = FALSE)
+
+      # Get the single-cell sample IDs for the project:
+      # - if merged, from the project_samples metadata (%in% TRUE drops NA flags)
+      # - if not merged, from the request list.
+      if (merged) {
+        single_cell_ids <- project_samples$scpca_sample_id[
+          project_samples$has_single_cell_data %in% TRUE
+        ]
+      } else {
+        single_cell_ids <- as.character(project$SINGLE_CELL)
+      }
+
+      spatial_ids <- as.character(project$SPATIAL)
+      included_ids <- union(single_cell_ids, spatial_ids)
+      requested_bulk <- isTRUE(project$includes_bulk)
+
+      project_samples |>
+        # keep only the samples the dataset requests for this project
+        dplyr::filter(.data$scpca_sample_id %in% included_ids) |>
+        dplyr::mutate(
+          scpca_project_id = project_id,
+          # the single-cell sequencing unit (cell or nucleus), or NA when
+          # single-cell is not requested for the sample
+          seq_unit = purrr::map2_chr(
+            .data$seq_units,
+            .data$scpca_sample_id,
+            \(units, sample_id) {
+              if (!sample_id %in% single_cell_ids) {
+                return(NA_character_)
+              }
+              # keep only the cell or nucleus unit (not spot or bulk), comma-separated
+              # if both are present (unlikely); return NA rather than "" if neither is
+              found <- intersect(c("cell", "nucleus"), as.character(units))
+              if (length(found) == 0) NA_character_ else paste(found, collapse = ",")
+            }
+          ),
+          # has_spatial reflects the request; has_bulk needs both the request and
+          # actual bulk data; cite-seq/multiplexed come straight from the records
+          has_spatial = .data$scpca_sample_id %in% spatial_ids,
+          has_bulk = requested_bulk & .data$has_bulk_rna_seq,
+          has_cite_seq = .data$has_cite_seq_data,
+          has_multiplexed = .data$has_multiplexed_data
+        ) |>
+        dplyr::select(
+          "scpca_sample_id",
+          "scpca_project_id",
+          "seq_unit",
+          "has_spatial",
+          "has_bulk",
+          "has_cite_seq",
+          "has_multiplexed"
+        )
+    }) |>
+    purrr::list_rbind() |>
+    dplyr::arrange(.data$scpca_sample_id)
+
+  if (nrow(result) == 0) empty else result
+}
+
+
+#' Get a summary of a custom ScPCA dataset
+#'
+#' Fetches a custom dataset and returns a structured summary of its contents,
+#' including its processing status and a per-sample table describing the modality
+#' of each sample.
+#'
+#' For each project, the included samples and their modality details are looked
+#' up from the project's sample records (one request per project), so merged
+#' projects (whose individual sample IDs are not enumerated in the dataset
+#' record) are expanded to all of their single-cell samples. Projects whose
+#' single-cell data is merged are also listed in `merged_projects`.
+#'
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by this function).
+#' @param auth_token an authorization token from [get_auth()]. Defaults to the
+#'   `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically.
+#'
+#' @returns a named list with the following elements:
+#'   * `id`: the dataset UUID string
+#'   * `format`: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA")
+#'   * `status`: the processing status — one of "pending", "processing",
+#'     "succeeded", "failed", or "expired" (see [get_dataset_status()])
+#'   * `n_samples`: the total number of samples in the dataset, taken from the
+#'     API's `total_sample_count`
+#'   * `n_projects`: the number of projects in the dataset
+#'   * `sample_info`: a data frame with one row per included sample and the following columns:
+#'     - `scpca_sample_id`
+#'     - `scpca_project_id`
+#'     - `seq_unit` ("cell" or "nucleus", or `NA` if the sample is not included as single-cell)
+#'     - `has_spatial`
+#'     - `has_bulk`
+#'     - `has_cite_seq`
+#'     - `has_multiplexed`
+#'   * `merged_projects`: a character vector of project IDs whose single-cell
+#'     data is merged; `character(0)` when none
+#'
+#' @import httr2
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+#' info <- get_dataset_info(ds_id)
+#' info$status
+#' info$sample_info
+#' }
+get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) {
+  token <- resolve_auth_token(auth_token)
+  dataset_detail <- get_dataset_detail(dataset, token)
+  project_data <- dataset_detail$data
+
+  # per-sample table first, then the IDs of projects requested as merged
+  sample_table <- make_dataset_data_df(project_data)
+  merged_only <- purrr::keep(project_data, \(proj) identical(proj$SINGLE_CELL, "MERGED"))
+  merged_ids <- as.character(names(merged_only))
+
+  list(
+    id = dataset_detail$id,
+    format = dataset_detail$format,
+    status = dataset_status_from_detail(dataset_detail),
+    # sample count as reported by the API (`total_sample_count`), not recomputed here
+    n_samples = dataset_detail$total_sample_count,
+    n_projects = length(project_data),
+    sample_info = sample_table,
+    merged_projects = merged_ids
+  )
 }
 
 
diff --git a/R/downloads.R b/R/downloads.R
index 1a865b6..e7171a7 100644
--- a/R/downloads.R
+++ b/R/downloads.R
@@ -433,8 +433,9 @@ parse_download_file <- function(scpca_url) {
 #' from the dataset's download filename (which includes the dataset ID, format,
 #' and date).
 #'
-#' @param dataset the dataset UUID string, or a list with an `$id` element,
-#'   such as the return value of [create_dataset()].
+#' @param dataset the dataset UUID string (such as the value returned by
+#'   [create_dataset()]), or a list with an `$id` element (such as the value
+#'   returned by [get_dataset_detail()]).
 #' @param destination The path to the directory where the unzipped file directory
 #'   should be saved. Default is "scpca_data".
 #' @param overwrite Whether to overwrite files in existing directories if they
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 73870af..986e2a4 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -32,6 +32,7 @@ reference:
     contents:
       - create_dataset
       - get_dataset_status
+      - get_dataset_info
       - download_dataset
       - add_dataset_samples
       - replace_dataset_data
diff --git a/man/await_dataset_processing.Rd b/man/await_dataset_processing.Rd
index 1109953..599a465 100644
--- a/man/await_dataset_processing.Rd
+++ b/man/await_dataset_processing.Rd
@@ -13,8 +13,9 @@ await_dataset_processing(
 )
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{poll_interval}{Number of minutes to wait between status checks when
 \code{await_processing = TRUE}. Default is 0.5 (30 seconds).}
diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd
index 606c502..55f2344 100644
--- a/man/create_dataset.Rd
+++ b/man/create_dataset.Rd
@@ -31,19 +31,21 @@ spatial samples are always returned in Space Ranger format.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the API response as a list (invisibly), including the dataset \verb{$id}
+the dataset ID as a character string (invisibly)
 }
 \description{
 Creates a new user dataset without starting processing.
-The returned list includes the dataset \verb{$id} along with its current contents and status.
+Returns the new dataset's ID (invisibly), which you can pass to the other
+dataset functions such as \code{\link[=get_dataset_info]{get_dataset_info()}}, \code{\link[=add_dataset_samples]{add_dataset_samples()}}, and
+\code{\link[=start_dataset_processing]{start_dataset_processing()}}.
 }
 \examples{
 \dontrun{
 token <- get_auth("user@example.com", agree = TRUE)
-ds <- create_dataset(
+ds_id <- create_dataset(
   auth_token = token,
   samples = c("SCPCS000001", "SCPCS000002")
 )
-ds$id
+ds_id
 }
 }
diff --git a/man/dataset_status_from_detail.Rd b/man/dataset_status_from_detail.Rd
new file mode 100644
index 0000000..1c71ea0
--- /dev/null
+++ b/man/dataset_status_from_detail.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{dataset_status_from_detail}
+\alias{dataset_status_from_detail}
+\title{Map dataset detail status flags to a status string}
+\usage{
+dataset_status_from_detail(detail)
+}
+\arguments{
+\item{detail}{the dataset detail list returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}}
+}
+\value{
+a single character string: one of "pending", "processing",
+"succeeded", "failed", or "expired"
+}
+\description{
+Map dataset detail status flags to a status string
+}
+\keyword{internal}
diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd
index 39bb6cd..a996509 100644
--- a/man/download_dataset.Rd
+++ b/man/download_dataset.Rd
@@ -18,8 +18,9 @@ download_dataset(
 )
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{destination}{The path to the directory where the unzipped file directory
 should be saved. Default is "scpca_data".}
diff --git a/man/get_dataset_detail.Rd b/man/get_dataset_detail.Rd
index 2005b35..4f56d9d 100644
--- a/man/get_dataset_detail.Rd
+++ b/man/get_dataset_detail.Rd
@@ -7,8 +7,9 @@
 get_dataset_detail(dataset, auth_token)
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{auth_token}{an authorization token obtained from \code{\link[=get_auth]{get_auth()}};
 must match the token used to create the dataset.}
diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd
new file mode 100644
index 0000000..c032595
--- /dev/null
+++ b/man/get_dataset_info.Rd
@@ -0,0 +1,60 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{get_dataset_info}
+\alias{get_dataset_info}
+\title{Get a summary of a custom ScPCA dataset}
+\usage{
+get_dataset_info(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"))
+}
+\arguments{
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by this function).}
+
+\item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the
+\code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
+}
+\value{
+a named list with the following elements:
+\itemize{
+\item \code{id}: the dataset UUID string
+\item \code{format}: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA")
+\item \code{status}: the processing status — one of "pending", "processing",
+"succeeded", "failed", or "expired" (see \code{\link[=get_dataset_status]{get_dataset_status()}})
+\item \code{n_samples}: the total number of samples in the dataset, taken from the
+API's \code{total_sample_count}
+\item \code{n_projects}: the number of projects in the dataset
+\item \code{sample_info}: a data frame with one row per included sample and the following columns:
+\itemize{
+\item \code{scpca_sample_id}
+\item \code{scpca_project_id}
+\item \code{seq_unit} ("cell" or "nucleus", or \code{NA} if the sample is not included as single-cell)
+\item \code{has_spatial}
+\item \code{has_bulk}
+\item \code{has_cite_seq}
+\item \code{has_multiplexed}
+}
+\item \code{merged_projects}: a character vector of project IDs whose single-cell
+data is merged; \code{character(0)} when none
+}
+}
+\description{
+Fetches a custom dataset and returns a structured summary of its contents,
+including its processing status and a per-sample table describing the modality
+of each sample.
+}
+\details{
+For each project, the included samples and their modality details are looked
+up from the project's sample records (one request per project), so merged
+projects (whose individual sample IDs are not enumerated in the dataset
+record) are expanded to all of their single-cell samples. Projects whose
+single-cell data is merged are also listed in \code{merged_projects}.
+}
+\examples{
+\dontrun{
+ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+info <- get_dataset_info(ds_id)
+info$status
+info$sample_info
+}
+}
diff --git a/man/get_dataset_status.Rd b/man/get_dataset_status.Rd
index 4a56ef8..3f60353 100644
--- a/man/get_dataset_status.Rd
+++ b/man/get_dataset_status.Rd
@@ -7,8 +7,9 @@
 get_dataset_status(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"))
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
@@ -19,21 +20,21 @@ a single character string: one of "pending", "processing",
 }
 \description{
 Returns a single string describing where a dataset is in the processing
-lifecycle, by fetching the dataset detail and translating its status fields
-(\code{is_started}, \code{is_succeeded}, \code{is_failed}). A dataset that has been started
-but has neither succeeded nor failed is reported as "processing".
+lifecycle.
 }
 \details{
 Possible values are:
-\describe{
-\item{"pending"}{the dataset has not been started}
-\item{"processing"}{the dataset has been started but is not yet finished}
-\item{"succeeded"}{processing finished and the dataset is ready to download}
-\item{"failed"}{processing failed}
+\itemize{
+\item \code{"pending"}: the dataset has not been started
+\item \code{"processing"}: the dataset has been started but is not yet finished
+\item \code{"succeeded"}: processing finished and the dataset is ready to download
+\item \code{"expired"}: processing completed but the generated download has since
+expired and must be regenerated
+\item \code{"failed"}: processing failed
 }
 }
 \examples{
 \dontrun{
-get_dataset_status(ds)
+get_dataset_status(ds_id)
 }
 }
diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd
new file mode 100644
index 0000000..4c55760
--- /dev/null
+++ b/man/make_dataset_data_df.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{make_dataset_data_df}
+\alias{make_dataset_data_df}
+\title{Build the per-sample data frame for a dataset}
+\usage{
+make_dataset_data_df(data)
+}
+\arguments{
+\item{data}{the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}}}
+}
+\value{
+a data frame with one row per included sample and columns
+\code{scpca_sample_id}, \code{scpca_project_id}, \code{seq_unit} (character: "cell",
+"nucleus", or \code{NA}), \code{has_spatial}, \code{has_bulk}, \code{has_cite_seq}, and
+\code{has_multiplexed} (all logical)
+}
+\description{
+For each project in the dataset \verb{$data} list, fetches the project's sample
+metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples that the dataset includes:
+For a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL},
+and for a merged project, all of the project's single-cell samples.
+Each modality flag is reported only as TRUE for samples that are both included in the dataset
+and actually have that modality available:
+\itemize{
+\item \code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the
+sample is not included as single-cell)
+\item \code{has_spatial} marks spatial inclusion, if requested, for the sample or project
+\item \code{has_bulk} indicates that the sample is present in the bulk data table, if requested for a project.
+\item \code{has_cite_seq} and \code{has_multiplexed} come from the sample records
+and do not depend on the specific request
+}
+}
+\keyword{internal}
diff --git a/man/modify_dataset_samples.Rd b/man/modify_dataset_samples.Rd
index 61604ec..6cfc56d 100644
--- a/man/modify_dataset_samples.Rd
+++ b/man/modify_dataset_samples.Rd
@@ -36,7 +36,7 @@ projects keep their current value. Default is FALSE.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 \code{add_dataset_samples()} adds the given samples and/or all samples from the
diff --git a/man/replace_dataset_data.Rd b/man/replace_dataset_data.Rd
index 15afdf5..4b88c6e 100644
--- a/man/replace_dataset_data.Rd
+++ b/man/replace_dataset_data.Rd
@@ -26,7 +26,7 @@ all samples from each project are included.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 Replaces the samples and/or projects in an existing dataset with a new
@@ -40,6 +40,6 @@ A dataset that has already started processing cannot be updated.
 }
 \examples{
 \dontrun{
-replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002"))
+replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002"))
 }
 }
diff --git a/man/resolve_dataset_id.Rd b/man/resolve_dataset_id.Rd
index ee7e54c..2588243 100644
--- a/man/resolve_dataset_id.Rd
+++ b/man/resolve_dataset_id.Rd
@@ -10,11 +10,12 @@ resolve_dataset_id(dataset)
 \item{dataset}{a dataset UUID string, or a list with an \verb{$id} element}
 }
 \value{
-the dataset ID as a length-1 character string
+the dataset ID as a character string
 }
 \description{
-Accepts either a dataset UUID string or a list with an \verb{$id} element (such as
-the return value of \code{\link[=create_dataset]{create_dataset()}} or \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns
-the ID string, after checking that it is a valid UUID.
+Accepts either a dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}) or a list with an \verb{$id} element (such as the value returned
+by \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns the ID string, after checking that it is
+a valid UUID.
 }
 \keyword{internal}
diff --git a/man/set_dataset_email.Rd b/man/set_dataset_email.Rd
index cc28f59..a245d01 100644
--- a/man/set_dataset_email.Rd
+++ b/man/set_dataset_email.Rd
@@ -15,7 +15,7 @@ set_dataset_email(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN"))
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 Updates the email address the ScPCA Portal will use to notify you when the
@@ -27,6 +27,6 @@ A dataset that has already been started cannot be modified.
 }
 \examples{
 \dontrun{
-set_dataset_email(ds, email = "user@example.com")
+set_dataset_email(ds_id, email = "user@example.com")
 }
 }
diff --git a/man/start_dataset_processing.Rd b/man/start_dataset_processing.Rd
index b61909e..fd7a61b 100644
--- a/man/start_dataset_processing.Rd
+++ b/man/start_dataset_processing.Rd
@@ -11,8 +11,9 @@ start_dataset_processing(
 )
 }
 \arguments{
-\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element,
-such as the return value of \code{\link[=create_dataset]{create_dataset()}}.}
+\item{dataset}{the dataset UUID string (such as the value returned by
+\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value
+returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).}
 
 \item{email}{optional email address for the download notification. When
 supplied, it is set as part of the same request that starts processing.}
@@ -21,7 +22,7 @@ supplied, it is set as part of the same request that starts processing.}
 \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.}
 }
 \value{
-the updated dataset detail as a list (invisibly)
+the dataset ID as a character string (invisibly)
 }
 \description{
 Starts processing of an existing custom dataset so that its files can be
@@ -29,12 +30,18 @@ built for download, by sending a PUT request that sets \code{start = TRUE}.
 Optionally sets the notification email as part of the same request.
 }
 \details{
-Once processing has started a dataset is locked and can no longer be
-modified; attempting to modify or re-start it will raise an error.
+Before sending the request, the current dataset status is checked via
+\code{\link[=get_dataset_status]{get_dataset_status()}}:
+\itemize{
+\item A \code{"pending"} or \code{"expired"} dataset is started normally.
+\item A \code{"failed"} dataset is retried with a warning.
+\item A \code{"processing"} or \code{"succeeded"} dataset is already underway or done;
+a message is emitted and no request is sent.
+}
 }
 \examples{
 \dontrun{
-ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
-start_dataset_processing(ds, email = "user@example.com")
+ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002"))
+start_dataset_processing(ds_id, email = "user@example.com")
 }
 }
diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R
index f0d9857..e3c6ef4 100644
--- a/tests/testthat/test-datasets.R
+++ b/tests/testthat/test-datasets.R
@@ -267,6 +267,7 @@ test_that("create_dataset errors when spatial format is requested", {
 })
 
 test_that("create_dataset POSTs with start = FALSE", {
+  captured_req <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -278,8 +279,8 @@ test_that("create_dataset POSTs with start = FALSE", {
       )
     },
     req_perform = \(req, ...) {
-      body <- req$body$data
-      json_response(c(body, list(id = "new-dataset-uuid")))
+      captured_req <<- req
+      json_response(list(id = "new-dataset-uuid"))
     }
   )
 
@@ -290,10 +291,12 @@ test_that("create_dataset POSTs with start = FALSE", {
     },
     "new-dataset-uuid"
   )
-  expect_false(result$start)
+  expect_equal(httr2::req_get_method(captured_req), "POST")
+  expect_false(captured_req$body$data$start)
+  expect_equal(result, "new-dataset-uuid")
 })
 
-test_that("create_dataset returns response invisibly and messages with dataset id", {
+test_that("create_dataset returns the dataset id invisibly and messages with dataset id", {
   local_mocked_bindings(
     build_dataset_data = \(...) {
       list(
@@ -319,24 +322,24 @@ test_that("create_dataset returns response invisibly and messages with dataset i
     },
     "new-dataset-uuid"
   )
-  expect_equal(result$id, "new-dataset-uuid")
+  expect_equal(result, "new-dataset-uuid")
 })
 
 test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", {
   withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token")
+  captured_req <- NULL
   local_mocked_bindings(
     build_dataset_data = \(...) list(),
     req_perform = \(req, ...) {
-      json_response(list(
-        id = "new-dataset-uuid",
-        api_key = httr2::req_get_headers(req, "reveal")$`api-key`
-      ))
+      captured_req <<- req
+      json_response(list(id = "new-dataset-uuid"))
     }
   )
 
   # called without auth_token; the token should come from the environment
   result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce"))
-  expect_equal(result$api_key, "env-token")
+  expect_equal(httr2::req_get_headers(captured_req, "reveal")$`api-key`, "env-token")
+  expect_equal(result, "new-dataset-uuid")
 })
 
 # get_dataset_detail tests
@@ -549,6 +552,326 @@ test_that("get_dataset_status errors when auth_token is empty", {
 })
 
 
+# get_dataset_info tests
+
+test_that("get_dataset_info builds a per-sample table from project sample data", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        is_succeeded = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          ),
+          SCPCP000002 = list(
+            SINGLE_CELL = list("SCPCS000003"),
+            SPATIAL = list("SCPCS000004"),
+            includes_bulk = TRUE
+          )
+        ),
+        total_sample_count = 4
+      )
+    },
+    get_project_samples = \(project_id, simplify = TRUE) {
+      if (project_id == "SCPCP000001") {
+        # SCPCS000099 belongs to the project but is not in the dataset request
+        tibble::tibble(
+          scpca_sample_id = c("SCPCS000001", "SCPCS000002", "SCPCS000099"),
+          scpca_project_id = project_id,
+          has_single_cell_data = TRUE,
+          has_spatial_data = FALSE,
+          has_bulk_rna_seq = FALSE,
+          has_cite_seq_data = FALSE,
+          has_multiplexed_data = FALSE,
+          seq_units = list("cell", "cell", "cell")
+        )
+      } else {
+        tibble::tibble(
+          scpca_sample_id = c("SCPCS000003", "SCPCS000004"),
+          scpca_project_id = project_id,
+          has_single_cell_data = c(TRUE, FALSE),
+          has_spatial_data = c(FALSE, TRUE),
+          has_bulk_rna_seq = c(TRUE, FALSE),
+          has_cite_seq_data = c(TRUE, FALSE),
+          has_multiplexed_data = c(FALSE, FALSE),
+          seq_units = list(c("cell", "bulk"), "spot")
+        )
+      }
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
+
+  expect_equal(info$id, DATASET_ID)
+  expect_equal(info$format, "SINGLE_CELL_EXPERIMENT")
+  expect_equal(info$status, "pending")
+  expect_equal(info$n_projects, 2)
+  expect_equal(info$n_samples, 4)
+  expect_equal(info$merged_projects, character(0))
+  expect_null(info$bulk_projects)
+  expect_s3_class(sample_info, "data.frame")
+  expect_setequal(
+    colnames(sample_info),
+    c(
+      "scpca_sample_id",
+      "scpca_project_id",
+      "seq_unit",
+      "has_spatial",
+      "has_bulk",
+      "has_cite_seq",
+      "has_multiplexed"
+    )
+  )
+  # one row per included sample; the unrequested SCPCS000099 is filtered out
+  expect_equal(nrow(sample_info), 4)
+  expect_false("SCPCS000099" %in% sample_info$scpca_sample_id)
+
+  field <- \(col, id) sample_info[[col]][sample_info$scpca_sample_id == id]
+  # seq_unit is the single-cell unit, or NA for a spatial-only sample
+  expect_equal(field("seq_unit", "SCPCS000001"), "cell")
+  expect_equal(field("seq_unit", "SCPCS000003"), "cell")
+  expect_true(is.na(field("seq_unit", "SCPCS000004")))
+
+  # only requested modalities are reported
+  expect_true(field("has_spatial", "SCPCS000004"))
+  expect_false(field("has_spatial", "SCPCS000001"))
+
+  expect_true(field("has_cite_seq", "SCPCS000003"))
+  expect_false(field("has_cite_seq", "SCPCS000001"))
+
+  # has_bulk reflects the request AND availability
+  expect_true(field("has_bulk", "SCPCS000003")) # requested + available
+  expect_false(field("has_bulk", "SCPCS000001")) # project did not request bulk
+  expect_false(field("has_bulk", "SCPCS000004")) # requested but sample has none
+  expect_false(any(sample_info$has_multiplexed))
+})
+
+test_that("get_dataset_info combines modalities for a sample included as single-cell and spatial", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001"),
+            SPATIAL = list("SCPCS000001"),
+            includes_bulk = FALSE
+          )
+        ),
+        total_sample_count = 1
+      )
+    },
+    get_project_samples = \(project_id, simplify = TRUE) {
+      tibble::tibble(
+        scpca_sample_id = "SCPCS000001",
+        scpca_project_id = project_id,
+        has_single_cell_data = TRUE,
+        has_spatial_data = TRUE,
+        has_bulk_rna_seq = FALSE,
+        has_cite_seq_data = FALSE,
+        has_multiplexed_data = FALSE,
+        seq_units = list(c("cell", "spot"))
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
+
+  # one row for the sample: single-cell unit plus spatial
+  expect_equal(nrow(sample_info), 1)
+  expect_equal(sample_info$seq_unit, "cell")
+  expect_true(sample_info$has_spatial)
+})
+
+test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "ANN_DATA",
+        is_started = FALSE,
+        data = list(),
+        total_sample_count = 0
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
+
+  expect_equal(info$n_samples, 0)
+  expect_equal(info$n_projects, 0)
+  expect_equal(nrow(sample_info), 0)
+  expect_null(info$bulk_projects)
+  expect_setequal(
+    colnames(sample_info),
+    c(
+      "scpca_sample_id",
+      "scpca_project_id",
+      "seq_unit",
+      "has_spatial",
+      "has_bulk",
+      "has_cite_seq",
+      "has_multiplexed"
+    )
+  )
+})
+
+test_that("get_dataset_info expands merged projects to all their single-cell samples", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          ),
+          SCPCP000005 = list(
+            SINGLE_CELL = "MERGED",
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        ),
+        total_sample_count = 4
+      )
+    },
+    get_project_samples = \(project_id, simplify = TRUE) {
+      if (project_id == "SCPCP000001") {
+        tibble::tibble(
+          scpca_sample_id = "SCPCS000001",
+          scpca_project_id = project_id,
+          has_single_cell_data = TRUE,
+          has_spatial_data = FALSE,
+          has_bulk_rna_seq = FALSE,
+          has_cite_seq_data = FALSE,
+          has_multiplexed_data = FALSE,
+          seq_units = list("cell")
+        )
+      } else {
+        # merged project: all single-cell samples are included; the
+        # non-single-cell SCPCS000053 is not
+        tibble::tibble(
+          scpca_sample_id = c("SCPCS000050", "SCPCS000051", "SCPCS000052", "SCPCS000053"),
+          scpca_project_id = project_id,
+          has_single_cell_data = c(TRUE, TRUE, TRUE, FALSE),
+          has_spatial_data = c(FALSE, FALSE, FALSE, TRUE),
+          has_bulk_rna_seq = FALSE,
+          has_cite_seq_data = FALSE,
+          has_multiplexed_data = FALSE,
+          seq_units = list("cell", "cell", "nucleus", "spot")
+        )
+      }
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
+
+  # merged project still surfaced in merged_projects
+  expect_equal(info$merged_projects, "SCPCP000005")
+  # its single-cell samples are expanded into the table; SCPCS000053 is excluded
+  expect_setequal(
+    sample_info$scpca_sample_id,
+    c("SCPCS000001", "SCPCS000050", "SCPCS000051", "SCPCS000052")
+  )
+  expect_false("SCPCS000053" %in% sample_info$scpca_sample_id)
+  # the nucleus seq_unit is reported for that sample
+  expect_equal(
+    sample_info$seq_unit[sample_info$scpca_sample_id == "SCPCS000052"],
+    "nucleus"
+  )
+  expect_equal(info$n_projects, 2)
+  expect_equal(info$n_samples, 4)
+})
+
+test_that("get_dataset_info derives status from detail without a second API call", {
+  call_count <- 0
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      call_count <<- call_count + 1
+      list(
+        id = DATASET_ID,
+        format = "ANN_DATA",
+        is_started = TRUE,
+        is_succeeded = TRUE,
+        data = list(),
+        total_sample_count = 0
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+
+  expect_equal(call_count, 1)
+  expect_equal(info$status, "succeeded")
+})
+
+test_that("get_dataset_info prunes projects where nothing is requested", {
+  local_mocked_bindings(
+    get_dataset_detail = \(dataset, auth_token) {
+      list(
+        id = DATASET_ID,
+        format = "SINGLE_CELL_EXPERIMENT",
+        is_started = FALSE,
+        data = list(
+          SCPCP000001 = list(
+            SINGLE_CELL = list("SCPCS000001"),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          ),
+          SCPCP000002 = list(
+            SINGLE_CELL = list(),
+            SPATIAL = list(),
+            includes_bulk = FALSE
+          )
+        ),
+        total_sample_count = 1
+      )
+    },
+    # only SCPCP000001 should be queried; SCPCP000002 requests nothing
+    get_project_samples = \(project_id, simplify = TRUE) {
+      tibble::tibble(
+        scpca_sample_id = "SCPCS000001",
+        scpca_project_id = project_id,
+        has_single_cell_data = TRUE,
+        has_spatial_data = FALSE,
+        has_bulk_rna_seq = FALSE,
+        has_cite_seq_data = FALSE,
+        has_multiplexed_data = FALSE,
+        seq_units = list("cell")
+      )
+    }
+  )
+
+  info <- get_dataset_info(DATASET_ID, auth_token = "token")
+  sample_info <- info[["sample_info"]]
+
+  expect_equal(nrow(sample_info), 1)
+  expect_equal(sample_info$scpca_project_id, "SCPCP000001")
+  expect_false("SCPCP000002" %in% sample_info$scpca_project_id)
+})
+
+test_that("get_dataset_info errors when auth_token is empty", {
+  expect_error(
+    get_dataset_info(DATASET_ID, auth_token = ""),
+    "Authorization token must be provided"
+  )
+})
+
+
 test_that("get_ccdl_dataset_detail returns dataset fields including download_url", {
   with_mock_dir("ccdl_dataset_detail", {
     result <- get_ccdl_dataset_detail("abc123", auth_token = "test-token")
@@ -619,10 +942,11 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", {
     samples = "SCPCS000001"
   )
 
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_null(result$format)
-  expect_true("SCPCP000001" %in% names(result$data))
+  expect_null(captured_req$body$data$format)
+  expect_true("SCPCP000001" %in% names(captured_req$body$data$data))
+  expect_equal(result, DATASET_ID)
 })
 
 # set_dataset_email tests
@@ -641,9 +965,10 @@ test_that("set_dataset_email PUTs a new email", {
     auth_token = "token",
     email = "user@example.com"
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_equal(result$email, "user@example.com")
+  expect_equal(captured_req$body$data$email, "user@example.com")
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("set_dataset_email errors when email is not a single string", {
@@ -694,10 +1019,11 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", {
     },
     "processing started"
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
-  expect_true(result$start)
-  expect_null(result$email)
+  expect_true(captured_req$body$data$start)
+  expect_null(captured_req$body$data$email)
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("start_dataset_processing includes email in the same request when provided", {
@@ -717,9 +1043,10 @@ test_that("start_dataset_processing includes email in the same request when prov
       auth_token = "token"
     )
   )
-  expect_equal(captured_req$method, "PUT")
-  expect_true(result$start)
-  expect_equal(result$email, "user@example.com")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
+  expect_true(captured_req$body$data$start)
+  expect_equal(captured_req$body$data$email, "user@example.com")
+  expect_equal(result, DATASET_ID)
 })
 
 test_that("start_dataset_processing errors when email is not a single string", {
@@ -754,7 +1081,7 @@ test_that("start_dataset_processing emits a message and sends no request when al
     result <- start_dataset_processing(DATASET_ID, auth_token = "token"),
     "is already processing"
   )
-  expect_null(result)
+  expect_equal(result, DATASET_ID)
   expect_false(put_called)
 })
 
@@ -772,7 +1099,7 @@ test_that("start_dataset_processing emits a message and sends no request when al
     result <- start_dataset_processing(DATASET_ID, auth_token = "token"),
     "has already completed processing"
   )
-  expect_null(result)
+  expect_equal(result, DATASET_ID)
   expect_false(put_called)
 })
 
@@ -792,7 +1119,7 @@ test_that("start_dataset_processing warns and retries when previously failed", {
     ),
     "previously failed to process"
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_true(captured_req$body$data$start)
 })
 
@@ -809,7 +1136,7 @@ test_that("start_dataset_processing restarts an expired dataset", {
   suppressMessages(
     start_dataset_processing(DATASET_ID, auth_token = "token")
   )
-  expect_equal(captured_req$method, "PUT")
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
   expect_true(captured_req$body$data$start)
 })
 
@@ -905,9 +1232,30 @@ test_that("remove_from_dataset_data drops whole projects", {
   expect_equal(names(result), "SCPCP000001")
 })
 
+test_that("remove_from_dataset_data errors when removing a sample from a merged project", {
+  existing <- list(
+    SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE)
+  )
+
+  expect_error(
+    remove_from_dataset_data(existing, samples = "SCPCS000001"),
+    "merged single-cell data"
+  )
+})
+
+test_that("remove_from_dataset_data can drop a merged project wholesale", {
+  existing <- list(
+    SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE),
+    SCPCP000002 = list(SINGLE_CELL = list("SCPCS000003"), SPATIAL = list(), includes_bulk = FALSE)
+  )
+
+  result <- remove_from_dataset_data(existing, projects = "SCPCP000001")
+  expect_equal(names(result), "SCPCP000002")
+})
+
 # add_dataset_samples / remove_dataset_samples tests
 
-test_that("add_dataset_samples merges new samples into existing data and PUTs", {
+test_that("add_dataset_samples PUTs the merged data", {
   captured_req <- NULL
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
@@ -922,12 +1270,18 @@ test_that("add_dataset_samples merges new samples into existing data and PUTs",
         )
       )
     },
+    # additions: one sample for the existing project, plus a brand-new project
     build_dataset_data = \(samples = NULL, projects = NULL, include_bulk = FALSE) {
       list(
         SCPCP000001 = list(
           SINGLE_CELL = list("SCPCS000002"),
           SPATIAL = list(),
           includes_bulk = include_bulk
+        ),
+        SCPCP000002 = list(
+          SINGLE_CELL = list("SCPCS000003"),
+          SPATIAL = list(),
+          includes_bulk = include_bulk
         )
       )
     },
@@ -940,16 +1294,27 @@ test_that("add_dataset_samples merges new samples into existing data and PUTs",
   result <- add_dataset_samples(
     DATASET_ID,
     auth_token = "token",
-    samples = "SCPCS000002"
+    samples = c("SCPCS000002", "SCPCS000003"),
+    include_bulk = TRUE
   )
-  expect_equal(captured_req$method, "PUT")
+
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
+  expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
+
+  sent_data <- captured_req$body$data$data
+  expect_setequal(names(sent_data), c("SCPCP000001", "SCPCP000002"))
+  # existing project gains the new sample as a union of old and added IDs
   expect_setequal(
-    as.character(result$data$SCPCP000001$SINGLE_CELL),
+    as.character(sent_data$SCPCP000001$SINGLE_CELL),
     c("SCPCS000001", "SCPCS000002")
   )
+  # include_bulk applies to the newly added project but not the existing one
+  expect_false(sent_data$SCPCP000001$includes_bulk)
+  expect_true(sent_data$SCPCP000002$includes_bulk)
+  expect_equal(result, DATASET_ID)
 })
 
-test_that("remove_dataset_samples removes a project and PUTs", {
+test_that("remove_dataset_samples PUTs the reduced data", {
   captured_req <- NULL
   local_mocked_bindings(
     get_dataset_detail = \(dataset, auth_token) {
@@ -957,7 +1322,7 @@ test_that("remove_dataset_samples removes a project and PUTs", {
         id = DATASET_ID,
         data = list(
           SCPCP000001 = list(
-            SINGLE_CELL = list("SCPCS000001"),
+            SINGLE_CELL = list("SCPCS000001", "SCPCS000002"),
             SPATIAL = list(),
             includes_bulk = FALSE
           ),
@@ -978,8 +1343,16 @@ test_that("remove_dataset_samples removes a project and PUTs", {
   result <- remove_dataset_samples(
     DATASET_ID,
     auth_token = "token",
+    samples = "SCPCS000002",
     projects = "SCPCP000002"
   )
-  expect_equal(captured_req$method, "PUT")
-  expect_equal(names(result$data), "SCPCP000001")
+
+  expect_equal(httr2::req_get_method(captured_req), "PUT")
+  expect_match(captured_req$url, paste0("datasets/", DATASET_ID))
+
+  # SCPCP000002 dropped wholesale; SCPCP000001 keeps only the un-removed sample
+  sent_data <- captured_req$body$data$data
+  expect_equal(names(sent_data), "SCPCP000001")
+  expect_equal(as.character(sent_data$SCPCP000001$SINGLE_CELL), "SCPCS000001")
+  expect_equal(result, DATASET_ID)
 })
diff --git a/tests/testthat/test-projects.R b/tests/testthat/test-projects.R
index a4725d6..a1bb578 100644
--- a/tests/testthat/test-projects.R
+++ b/tests/testthat/test-projects.R
@@ -21,29 +21,29 @@ test_that("scpca_projects returns simplified data frame by default", {
     expect_s3_class(projects_df$created_at, "POSIXct")
     expect_s3_class(projects_df$updated_at, "POSIXct")
   })
+})
 
-  test_that("scpca_projects returns full data frame when simplify = FALSE", {
-    with_mock_dir("scpca_projects", {
-      projects_df <- scpca_projects(simplify = FALSE)
+test_that("scpca_projects returns full data frame when simplify = FALSE", {
+  with_mock_dir("scpca_projects", {
+    projects_df <- scpca_projects(simplify = FALSE)
 
-      # Check that it returns a data frame
-      expect_s3_class(projects_df, "data.frame")
+    # Check that it returns a data frame
+    expect_s3_class(projects_df, "data.frame")
 
-      # Check that we have rows and columns
-      expect_gt(nrow(projects_df), 0)
-      expect_gt(ncol(projects_df), 0)
+    # Check that we have rows and columns
+    expect_gt(nrow(projects_df), 0)
+    expect_gt(ncol(projects_df), 0)
 
-      # Check that list columns are present (not simplified)
-      list_columns <- sapply(projects_df, is.list)
-      expect_true(any(list_columns))
+    # Check that list columns are present (not simplified)
+    list_columns <- sapply(projects_df, is.list)
+    expect_true(any(list_columns))
 
-      # Check for expected key columns
-      expect_contains(colnames(projects_df), "scpca_project_id")
+    # Check for expected key columns
+    expect_contains(colnames(projects_df), "scpca_project_id")
 
-      # Check that date columns are properly converted
-      expect_s3_class(projects_df$created_at, "POSIXct")
-      expect_s3_class(projects_df$updated_at, "POSIXct")
-    })
+    # Check that date columns are properly converted
+    expect_s3_class(projects_df$created_at, "POSIXct")
+    expect_s3_class(projects_df$updated_at, "POSIXct")
   })
 })