From 3c584bbac0a6847e0e0a49205f270fcc00afe61f Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Wed, 3 Jun 2026 22:21:33 -0400 Subject: [PATCH 01/19] Add separate function for dataset status internal --- R/datasets.R | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/R/datasets.R b/R/datasets.R index f4e24a8..9b9fcf6 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -218,6 +218,27 @@ get_dataset_detail <- function(dataset, auth_token) { resp_body_json() } +#' Map dataset detail status flags to a status string +#' +#' @param detail the dataset detail list returned by [get_dataset_detail()] +#' +#' @keywords internal +#' +#' @returns a single character string: one of "pending", "processing", +#' "succeeded", "failed", or "expired" +dataset_status_from_detail <- function(detail) { + if (isTRUE(detail$is_failed)) { + "failed" + } else if (isTRUE(detail$is_expired)) { + "expired" + } else if (isTRUE(detail$is_succeeded)) { + "succeeded" + } else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) { + "processing" + } else { + "pending" + } +} #' Get the processing status of a custom dataset #' From ab1e3835f372014b1b476c00b3876d744b5ce90c Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Wed, 3 Jun 2026 22:22:20 -0400 Subject: [PATCH 02/19] use new dataset status --- R/datasets.R | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 9b9fcf6..835621e 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -267,22 +267,12 @@ dataset_status_from_detail <- function(detail) { #' #' @examples #' \dontrun{ -#' get_dataset_status(ds) +#' get_dataset_status(ds_id) #' } get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) { auth_token <- resolve_auth_token(auth_token) detail <- get_dataset_detail(dataset, auth_token) - if (isTRUE(detail$is_failed)) { - "failed" - } else if (isTRUE(detail$is_expired)) { - "expired" - } else if (isTRUE(detail$is_succeeded)) { - 
"succeeded" - } else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) { - "processing" - } else { - "pending" - } + dataset_status_from_detail(detail) } From b0a136422b2fc525d83074112c60d23d97ddec81 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Wed, 3 Jun 2026 22:28:42 -0400 Subject: [PATCH 03/19] Use only the dataset id for external return values. --- R/datasets.R | 78 +++++++++++++++++++++++++++------------------------ R/downloads.R | 5 ++-- 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 835621e..fe35d3f 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -47,15 +47,16 @@ build_dataset_data <- function(samples = NULL, projects = NULL, include_bulk = F #' Resolve a dataset identifier to its ID string #' -#' Accepts either a dataset UUID string or a list with an `$id` element (such as -#' the return value of [create_dataset()] or [get_dataset_detail()]) and returns -#' the ID string, after checking that it is a valid UUID. +#' Accepts either a dataset UUID string (such as the value returned by +#' [create_dataset()]) or a list with an `$id` element (such as the value returned +#' by [get_dataset_detail()]) and returns the ID string, after checking that it is +#' a valid UUID. #' #' @param dataset a dataset UUID string, or a list with an `$id` element #' #' @keywords internal #' -#' @returns the dataset ID as a length-1 character string +#' @returns the dataset ID as a character string resolve_dataset_id <- function(dataset) { if (is.list(dataset)) { stopifnot("dataset must be an id string or contain an $id element" = !is.null(dataset$id)) @@ -112,7 +113,9 @@ update_dataset <- function(dataset_id, body, auth_token) { #' Create a custom dataset on the ScPCA Portal #' #' Creates a new user dataset without starting processing. -#' The returned list includes the dataset `$id` along with its current contents and status. 
+#' Returns the new dataset's ID (invisibly), which you can pass to the other +#' dataset functions such as [get_dataset_info()], [add_dataset_samples()], and +#' [start_dataset_processing()]. #' #' @param samples optional character vector of ScPCA sample IDs (e.g. "SCPCS000001") #' @param projects optional character vector of ScPCA project IDs (e.g. "SCPCP000001"); @@ -125,7 +128,7 @@ update_dataset <- function(dataset_id, body, auth_token) { #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the API response as a list (invisibly), including the dataset `$id` +#' @returns the dataset ID as a character string (invisibly) #' #' @import httr2 #' @export @@ -133,11 +136,11 @@ update_dataset <- function(dataset_id, body, auth_token) { #' @examples #' \dontrun{ #' token <- get_auth("user@example.com", agree = TRUE) -#' ds <- create_dataset( +#' ds_id <- create_dataset( #' auth_token = token, #' samples = c("SCPCS000001", "SCPCS000002") #' ) -#' ds$id +#' ds_id #' } create_dataset <- function( samples = NULL, @@ -179,7 +182,7 @@ create_dataset <- function( resp_body_json() message(glue::glue("ScPCA dataset {response$id} created.")) - invisible(response) + invisible(response$id) } @@ -193,8 +196,9 @@ create_dataset <- function( #' it is also used by the dataset modification functions to fetch current #' contents before updating. #' -#' @param dataset the dataset UUID string, or a list with an `$id` element -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param auth_token an authorization token obtained from [get_auth()]; #' must match the token used to create the dataset. 
#' @@ -254,8 +258,9 @@ dataset_status_from_detail <- function(detail) { #' expired and must be regenerated #' * `"failed"`: processing failed #' -#' @param dataset the dataset UUID string, or a list with an `$id` element, -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' @@ -294,13 +299,13 @@ get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKE #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the updated dataset detail as a list (invisibly) +#' @returns the dataset ID as a character string (invisibly) #' #' @export #' #' @examples #' \dontrun{ -#' replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002")) +#' replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002")) #' } replace_dataset_data <- function( dataset, @@ -319,8 +324,8 @@ replace_dataset_data <- function( data <- build_dataset_data(samples = samples, projects = projects, include_bulk = include_bulk) - response <- update_dataset(dataset_id, list(data = data), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(data = data), auth_token = auth_token) + invisible(dataset_id) } @@ -337,13 +342,13 @@ replace_dataset_data <- function( #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. 
#' -#' @returns the updated dataset detail as a list (invisibly) +#' @returns the dataset ID as a character string (invisibly) #' #' @export #' #' @examples #' \dontrun{ -#' set_dataset_email(ds, email = "user@example.com") +#' set_dataset_email(ds_id, email = "user@example.com") #' } set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) { auth_token <- resolve_auth_token(auth_token) @@ -354,8 +359,8 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT ) dataset_id <- resolve_dataset_id(dataset) - response <- update_dataset(dataset_id, list(email = email), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(email = email), auth_token = auth_token) + invisible(dataset_id) } @@ -373,24 +378,23 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT #' * A `"processing"` or `"succeeded"` dataset is already underway or done; #' a message is emitted and no request is sent. #' -#' @param dataset the dataset UUID string, or a list with an `$id` element, -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param email optional email address for the download notification. When #' supplied, it is set as part of the same request that starts processing. #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the updated dataset detail as a list (invisibly) when a request is -#' sent, or `NULL` (invisibly) when the dataset is already processing or -#' completed. 
+#' @returns the dataset ID as a character string (invisibly) #' #' @import httr2 #' @export #' #' @examples #' \dontrun{ -#' ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) -#' start_dataset_processing(ds, email = "user@example.com") +#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +#' start_dataset_processing(ds_id, email = "user@example.com") #' } start_dataset_processing <- function( dataset, @@ -410,11 +414,11 @@ start_dataset_processing <- function( status <- get_dataset_status(dataset_id, auth_token = auth_token) if (status == "processing") { message(glue::glue("ScPCA dataset {dataset_id} is already processing.")) - return(invisible(NULL)) + return(invisible(dataset_id)) } if (status == "succeeded") { message(glue::glue("ScPCA dataset {dataset_id} has already completed processing.")) - return(invisible(NULL)) + return(invisible(dataset_id)) } if (status == "failed") { warning( @@ -428,9 +432,9 @@ start_dataset_processing <- function( body$email <- email } - response <- update_dataset(dataset_id, body, auth_token = auth_token) + update_dataset(dataset_id, body, auth_token = auth_token) message(glue::glue("ScPCA dataset {dataset_id} processing started.")) - invisible(response) + invisible(dataset_id) } @@ -549,7 +553,7 @@ remove_from_dataset_data <- function(existing, samples = NULL, projects = NULL) #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. 
#' -#' @returns the updated dataset detail as a list (invisibly) +#' @returns the dataset ID as a character string (invisibly) #' #' @rdname modify_dataset_samples #' @export @@ -585,8 +589,8 @@ add_dataset_samples <- function( ) new_data <- merge_dataset_data(current$data, additions, include_bulk = include_bulk) - response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) + invisible(dataset_id) } @@ -608,8 +612,8 @@ remove_dataset_samples <- function( current <- get_dataset_detail(dataset_id, auth_token = auth_token) new_data <- remove_from_dataset_data(current$data, samples = samples, projects = projects) - response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) + invisible(dataset_id) } diff --git a/R/downloads.R b/R/downloads.R index 1a865b6..e7171a7 100644 --- a/R/downloads.R +++ b/R/downloads.R @@ -433,8 +433,9 @@ parse_download_file <- function(scpca_url) { #' from the dataset's download filename (which includes the dataset ID, format, #' and date). #' -#' @param dataset the dataset UUID string, or a list with an `$id` element, -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param destination The path to the directory where the unzipped file directory #' should be saved. Default is "scpca_data". 
#' @param overwrite Whether to overwrite files in existing directories if they From 1e8735f66c90c5f847dd2899f43a31db1d0ca884 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Wed, 3 Jun 2026 22:33:46 -0400 Subject: [PATCH 04/19] Add get_dataset_info function (and data frame helper) --- R/datasets.R | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/R/datasets.R b/R/datasets.R index fe35d3f..7ce673f 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -617,6 +617,125 @@ remove_dataset_samples <- function( } +#' Make a per-sample data frame from the `$data` list +#' +#' Transforms the project-keyed `$data` list from [get_dataset_detail()] into a +#' data frame with one row per sample-modality combination. Projects with merged +#' single-cell data (`SINGLE_CELL = "MERGED"`) are excluded. +#' +#' @param data the project-keyed `$data` list from [get_dataset_detail()] +#' +#' @keywords internal +#' @importFrom dplyr .data +#' +#' @returns a data frame with columns `scpca_sample_id`, `scpca_project_id`, +#' `modality`, and `includes_bulk` +make_dataset_data_df <- function(data) { + empty <- data.frame( + scpca_sample_id = character(), + scpca_project_id = character(), + modality = character(), + includes_bulk = logical() + ) + if (length(data) == 0) { + return(empty) + } + + result <- data |> + purrr::imap(\(project, project_id) { + includes_bulk <- isTRUE(project$includes_bulk) + single_cell_ids <- project$SINGLE_CELL + # Datasets created outside this package may contain merged projects. + # Those projects are excluded here and surfaced via `merged_projects` in + # get_dataset_info() instead. 
+ if (identical(single_cell_ids, "MERGED")) { + return(NULL) + } + sc_ids <- as.character(single_cell_ids) + sp_ids <- as.character(project$SPATIAL) + if (length(sc_ids) == 0 && length(sp_ids) == 0) { + return(NULL) + } + + data.frame( + scpca_sample_id = c(sc_ids, sp_ids), + scpca_project_id = project_id, + modality = rep( + c("single-cell", "spatial"), + times = c(length(sc_ids), length(sp_ids)) + ), + includes_bulk = includes_bulk + ) + }) |> + purrr::list_rbind() |> + dplyr::arrange(.data$scpca_sample_id) + + if (nrow(result) == 0) empty else result +} + + +#' Get a summary of a custom ScPCA dataset +#' +#' Fetches a custom dataset and returns a structured summary of its contents, +#' including its processing status and a per-sample table describing the modality for +#' each sample. +#' +#' Projects with merged single-cell data (where individual sample IDs are not +#' enumerated in the dataset record) are excluded from `samples` and listed in +#' `merged_projects` instead. +#' +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by this function). +#' @param auth_token an authorization token from [get_auth()]. Defaults to the +#' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. +#' +#' @returns a named list with the following elements: +#' * `id`: the dataset UUID string +#' * `format`: the dataset file format (e.g. 
"SINGLE_CELL_EXPERIMENT", "ANN_DATA") +#' * `status`: the processing status — one of "pending", "processing", +#' "succeeded", "failed", or "expired" (see [get_dataset_status()]) +#' * `n_samples`: the number of rows in `samples` (one per sample-modality +#' combination; merged-single-cell projects are not counted) +#' * `n_projects`: the number of projects in the dataset +#' * `samples`: a data frame with one row per sample-modality combination and +#' columns `scpca_sample_id`, `scpca_project_id`, `modality` (character: +#' "single-cell" or "spatial"), and `includes_bulk` (logical) +#' * `merged_projects`: a character vector of project IDs whose single-cell +#' data is merged; `character(0)` when none +#' +#' @import httr2 +#' @export +#' +#' @examples +#' \dontrun{ +#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +#' info <- get_dataset_info(ds_id) +#' info$status +#' info$samples +#' } +get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) { + auth_token <- resolve_auth_token(auth_token) + detail <- get_dataset_detail(dataset, auth_token) + + samples <- make_dataset_data_df(detail$data) + merged_projects <- detail$data |> + purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |> + names() |> + as.character() + + list( + id = detail$id, + format = detail$format, + status = dataset_status_from_detail(detail), + n_samples = nrow(samples), + n_projects = length(detail$data), + samples = samples, + merged_projects = merged_projects + ) +} + + #' Get CCDL dataset objects from the ScPCA API #' #' @param project_id Optional ScPCA project ID to filter by (e.g. 
"SCPCP000001") From b13835a3104129c0f1812dc01bdd6dbd41970370 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Wed, 3 Jun 2026 22:47:30 -0400 Subject: [PATCH 05/19] Simplify and consolidate tests --- tests/testthat/test-datasets.R | 499 +++++++++++++-------------------- 1 file changed, 199 insertions(+), 300 deletions(-) diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index c46e63b..073d21e 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -32,115 +32,6 @@ test_that("get_ccdl_datasets combines results across pages", { }) }) -test_that("get_ccdl_datasets passes project_id as ccdl_project_id query parameter", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) { - captured_req <<- req - list() - } - ) - - get_ccdl_datasets(project_id = "SCPCP000001") - expect_match(captured_req$url, "ccdl_project_id=SCPCP000001") -}) - -test_that("get_ccdl_datasets passes modality as ccdl_modality query parameter", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) { - captured_req <<- req - list() - } - ) - - get_ccdl_datasets(modality = "SINGLE_CELL") - expect_match(captured_req$url, "ccdl_modality=SINGLE_CELL") -}) - -test_that("get_ccdl_datasets passes format as format query parameter", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) { - captured_req <<- req - list() - } - ) - - get_ccdl_datasets(format = "ANN_DATA") - expect_match(captured_req$url, "format=ANN_DATA") -}) - -test_that("get_ccdl_datasets passes merged as ccdl_is_merged query parameter", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) 
{ - captured_req <<- req - list() - } - ) - - get_ccdl_datasets(merged = TRUE) - expect_match(captured_req$url, "ccdl_is_merged=TRUE") -}) - -test_that("get_ccdl_datasets passes include_multiplexed as includes_files_multiplexed query parameter", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) { - captured_req <<- req - list() - } - ) - - get_ccdl_datasets(include_multiplexed = TRUE) - expect_match(captured_req$url, "includes_files_multiplexed=TRUE") - - get_ccdl_datasets(include_multiplexed = FALSE) - expect_match(captured_req$url, "includes_files_multiplexed=FALSE") -}) - -test_that("get_ccdl_datasets passes metadata_only as ccdl_name=ALL_METADATA query parameter", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) { - captured_req <<- req - list() - } - ) - - get_ccdl_datasets(metadata_only = TRUE) - expect_match(captured_req$url, "ccdl_name=ALL_METADATA") -}) - -test_that("get_ccdl_datasets includes api-key header when auth_token is provided", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) { - captured_req <<- req - list() - } - ) - - get_ccdl_datasets(auth_token = "test-token-abc") - expect_equal( - httr2::req_get_headers(captured_req, "reveal")$`api-key`, - "test-token-abc" - ) -}) - -test_that("get_ccdl_datasets does not include api-key header when auth_token is empty", { - captured_req <- NULL - local_mocked_bindings( - req_perform_iterative = function(req, ...) { - captured_req <<- req - list() - } - ) - - get_ccdl_datasets() - expect_null(httr2::req_get_headers(captured_req, "reveal")$`api-key`) -}) # build_dataset_data tests @@ -267,6 +158,7 @@ test_that("create_dataset errors when spatial format is requested", { }) test_that("create_dataset POSTs with start = FALSE", { + captured_req <- NULL local_mocked_bindings( build_dataset_data = \(...) 
{ list( @@ -278,8 +170,8 @@ test_that("create_dataset POSTs with start = FALSE", { ) }, req_perform = \(req, ...) { - body <- req$body$data - json_response(c(body, list(id = "new-dataset-uuid"))) + captured_req <<- req + json_response(c(req$body$data, list(id = "new-dataset-uuid"))) } ) @@ -290,10 +182,11 @@ test_that("create_dataset POSTs with start = FALSE", { }, "new-dataset-uuid" ) - expect_false(result$start) + expect_false(captured_req$body$data$start) + expect_equal(result, "new-dataset-uuid") }) -test_that("create_dataset returns response invisibly and messages with dataset id", { +test_that("create_dataset returns id invisibly and messages with dataset id", { local_mocked_bindings( build_dataset_data = \(...) { list( @@ -319,85 +212,27 @@ test_that("create_dataset returns response invisibly and messages with dataset i }, "new-dataset-uuid" ) - expect_equal(result$id, "new-dataset-uuid") + expect_equal(result, "new-dataset-uuid") }) test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", { withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token") + captured_key <- NULL local_mocked_bindings( build_dataset_data = \(...) list(), req_perform = \(req, ...) { - json_response(list( - id = "new-dataset-uuid", - api_key = httr2::req_get_headers(req, "reveal")$`api-key` - )) + captured_key <<- httr2::req_get_headers(req, "reveal")$`api-key` + json_response(list(id = "new-dataset-uuid")) } ) # called without auth_token; the token should come from the environment - result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce")) - expect_equal(result$api_key, "env-token") + suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce")) + expect_equal(captured_key, "env-token") }) # get_dataset_detail tests -test_that("get_dataset_detail returns dataset with data and status fields", { - local_mocked_bindings( - req_perform = \(req, ...) 
{ - json_response(list( - id = DATASET_ID, - format = "SINGLE_CELL_EXPERIMENT", - data = list( - SCPCP000001 = list( - SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), - SPATIAL = list(), - includes_bulk = FALSE - ) - ), - is_started = FALSE, - is_succeeded = FALSE, - total_sample_count = 2, - computed_file = NULL - )) - } - ) - - result <- get_dataset_detail(DATASET_ID, auth_token = "test-token") - - expect_type(result, "list") - expect_equal(result$id, DATASET_ID) - expect_equal(result$format, "SINGLE_CELL_EXPERIMENT") - expect_false(result$is_started) - expect_false(result$is_succeeded) -}) - -test_that("get_dataset_detail returns data field with project and sample structure", { - local_mocked_bindings( - req_perform = \(req, ...) { - json_response(list( - id = DATASET_ID, - format = "SINGLE_CELL_EXPERIMENT", - data = list( - SCPCP000001 = list( - SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), - SPATIAL = list(), - includes_bulk = FALSE - ) - ) - )) - } - ) - - result <- get_dataset_detail(DATASET_ID, auth_token = "test-token") - - expect_type(result$data, "list") - expect_true("SCPCP000001" %in% names(result$data)) - expect_contains( - result$data$SCPCP000001$SINGLE_CELL, - c("SCPCS000001", "SCPCS000002") - ) -}) - test_that("get_dataset_detail includes api-key header when auth_token is provided", { local_mocked_bindings( req_perform = \(req, ...) { @@ -450,32 +285,6 @@ test_that("get_ccdl_datasets handles 403 errors with an authorization message", ) }) -test_that("get_dataset_detail accepts a list with $id in place of a string", { - local_mocked_bindings( - req_perform = \(req, ...) 
{ - json_response(list(id = DATASET_ID, data = list())) - } - ) - - dataset_list <- list(id = DATASET_ID, data = list()) - result <- get_dataset_detail(dataset_list, auth_token = "test-token") - expect_equal(result$id, DATASET_ID) -}) - -test_that("get_dataset_detail errors when list has no $id element", { - expect_error( - get_dataset_detail(list(data = list()), auth_token = "test-token"), - "dataset must be an id string or contain an \\$id element" - ) -}) - -test_that("get_dataset_detail errors when dataset is not a string or list", { - expect_error( - get_dataset_detail(123, auth_token = "test-token"), - "dataset must be an id string or contain an \\$id element" - ) -}) - # get_dataset_status tests test_that("get_dataset_status maps detail status fields to a status string", { @@ -549,6 +358,172 @@ test_that("get_dataset_status errors when auth_token is empty", { }) +# get_dataset_info tests + +test_that("get_dataset_info returns structured summary with samples data frame", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + is_succeeded = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), + SPATIAL = list(), + includes_bulk = FALSE + ), + SCPCP000002 = list( + SINGLE_CELL = list("SCPCS000003"), + SPATIAL = list("SCPCS000003"), + includes_bulk = TRUE + ) + ) + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + + expect_equal(info$id, DATASET_ID) + expect_equal(info$format, "SINGLE_CELL_EXPERIMENT") + expect_equal(info$status, "pending") + expect_equal(info$n_projects, 2) + # SCPCS000003 appears in both modalities: 3 single-cell rows + 1 spatial row = 4 rows total + expect_equal(info$n_samples, 4) + expect_equal(info$merged_projects, character(0)) + expect_s3_class(info$samples, "data.frame") + expect_setequal( + colnames(info$samples), + c("scpca_sample_id", "scpca_project_id", "modality", 
"includes_bulk") + ) + # SCPCS000003 has two rows — one per modality + rows_003 <- info$samples[info$samples$scpca_sample_id == "SCPCS000003", ] + expect_setequal(rows_003$modality, c("single-cell", "spatial")) + # SCPCP000001 samples should not have includes_bulk + rows_p1 <- info$samples[info$samples$scpca_project_id == "SCPCP000001", ] + expect_false(all(rows_p1$includes_bulk)) + # SCPCP000002 samples should have includes_bulk + rows_p2 <- info$samples[info$samples$scpca_project_id == "SCPCP000002", ] + expect_true(all(rows_p2$includes_bulk)) +}) + +test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "ANN_DATA", + is_started = FALSE, + data = list() + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + + expect_equal(info$n_samples, 0) + expect_equal(info$n_projects, 0) + expect_equal(nrow(info$samples), 0) + expect_setequal( + colnames(info$samples), + c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk") + ) +}) + +test_that("get_dataset_info surfaces merged projects separately and excludes them from samples", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001"), + SPATIAL = list(), + includes_bulk = FALSE + ), + SCPCP000005 = list( + SINGLE_CELL = "MERGED", + SPATIAL = list(), + includes_bulk = FALSE + ) + ) + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + + # merged project excluded from samples and n_samples + expect_equal(info$n_samples, 1) + expect_equal(info$samples$scpca_sample_id, "SCPCS000001") + # but counted in n_projects and surfaced in merged_projects + expect_equal(info$n_projects, 2) + expect_equal(info$merged_projects, "SCPCP000005") +}) + 
+test_that("get_dataset_info derives status from detail without a second API call", { + call_count <- 0 + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + call_count <<- call_count + 1 + list( + id = DATASET_ID, + format = "ANN_DATA", + is_started = TRUE, + is_succeeded = TRUE, + data = list() + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + + expect_equal(call_count, 1) + expect_equal(info$status, "succeeded") +}) + +test_that("get_dataset_info prunes projects where both modality lists are empty", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001"), + SPATIAL = list(), + includes_bulk = FALSE + ), + SCPCP000002 = list( + SINGLE_CELL = list(), + SPATIAL = list(), + includes_bulk = FALSE + ) + ) + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + + expect_equal(info$n_samples, 1) + expect_equal(info$samples$scpca_project_id, "SCPCP000001") + expect_false("SCPCP000002" %in% info$samples$scpca_project_id) +}) + +test_that("get_dataset_info errors when auth_token is empty", { + expect_error( + get_dataset_info(DATASET_ID, auth_token = ""), + "Authorization token must be provided" + ) +}) + + test_that("get_ccdl_dataset_detail returns dataset fields including download_url", { with_mock_dir("ccdl_dataset_detail", { result <- get_ccdl_dataset_detail("abc123", auth_token = "test-token") @@ -609,7 +584,7 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", { }, req_perform = \(req, ...) 
{ captured_req <<- req - json_response(req$body$data) + json_response(list(id = DATASET_ID, data = req$body$data)) } ) @@ -621,8 +596,8 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", { expect_equal(captured_req$method, "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_null(result$format) - expect_true("SCPCP000001" %in% names(result$data)) + expect_null(captured_req$body$data$format) + expect_equal(result, DATASET_ID) }) # set_dataset_email tests @@ -632,7 +607,7 @@ test_that("set_dataset_email PUTs a new email", { local_mocked_bindings( req_perform = \(req, ...) { captured_req <<- req - json_response(req$body$data) + json_response(list(id = DATASET_ID, email = req$body$data$email)) } ) @@ -643,7 +618,8 @@ test_that("set_dataset_email PUTs a new email", { ) expect_equal(captured_req$method, "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_equal(result$email, "user@example.com") + expect_equal(captured_req$body$data$email, "user@example.com") + expect_equal(result, DATASET_ID) }) test_that("set_dataset_email errors when email is not a single string", { @@ -680,7 +656,7 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", { get_dataset_status = \(dataset, auth_token) "pending", req_perform = \(req, ...) { captured_req <<- req - json_response(req$body$data) + json_response(list(id = DATASET_ID)) } ) @@ -696,30 +672,7 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", { ) expect_equal(captured_req$method, "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_true(result$start) - expect_null(result$email) -}) - -test_that("start_dataset_processing includes email in the same request when provided", { - captured_req <- NULL - local_mocked_bindings( - get_dataset_status = \(dataset, auth_token) "pending", - req_perform = \(req, ...) 
{ - captured_req <<- req - json_response(req$body$data) - } - ) - - result <- suppressMessages( - start_dataset_processing( - DATASET_ID, - email = "user@example.com", - auth_token = "token" - ) - ) - expect_equal(captured_req$method, "PUT") - expect_true(result$start) - expect_equal(result$email, "user@example.com") + expect_equal(result, DATASET_ID) }) test_that("start_dataset_processing errors when email is not a single string", { @@ -754,7 +707,7 @@ test_that("start_dataset_processing emits a message and sends no request when al result <- start_dataset_processing(DATASET_ID, auth_token = "token"), "is already processing" ) - expect_null(result) + expect_equal(result, DATASET_ID) expect_false(put_called) }) @@ -772,7 +725,7 @@ test_that("start_dataset_processing emits a message and sends no request when al result <- start_dataset_processing(DATASET_ID, auth_token = "token"), "has already completed processing" ) - expect_null(result) + expect_equal(result, DATASET_ID) expect_false(put_called) }) @@ -793,7 +746,6 @@ test_that("start_dataset_processing warns and retries when previously failed", { "previously failed to process" ) expect_equal(captured_req$method, "PUT") - expect_true(captured_req$body$data$start) }) test_that("start_dataset_processing restarts an expired dataset", { @@ -810,7 +762,6 @@ test_that("start_dataset_processing restarts an expired dataset", { start_dataset_processing(DATASET_ID, auth_token = "token") ) expect_equal(captured_req$method, "PUT") - expect_true(captured_req$body$data$start) }) test_that("start_dataset_processing surfaces a locked-dataset error on a 409 race", { @@ -907,79 +858,27 @@ test_that("remove_from_dataset_data drops whole projects", { # add_dataset_samples / remove_dataset_samples tests -test_that("add_dataset_samples merges new samples into existing data and PUTs", { - captured_req <- NULL +test_that("add_dataset_samples PUTs", { local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { - list( - id = 
DATASET_ID, - data = list( - SCPCP000001 = list( - SINGLE_CELL = list("SCPCS000001"), - SPATIAL = list(), - includes_bulk = FALSE - ) - ) - ) + list(id = DATASET_ID, data = list()) }, - build_dataset_data = \(samples = NULL, projects = NULL, include_bulk = FALSE) { - list( - SCPCP000001 = list( - SINGLE_CELL = list("SCPCS000002"), - SPATIAL = list(), - includes_bulk = include_bulk - ) - ) - }, - req_perform = \(req, ...) { - captured_req <<- req - json_response(req$body$data) - } + build_dataset_data = \(...) list(), + req_perform = \(req, ...) json_response(list(id = DATASET_ID)) ) - result <- add_dataset_samples( - DATASET_ID, - auth_token = "token", - samples = "SCPCS000002" - ) - expect_equal(captured_req$method, "PUT") - expect_setequal( - as.character(result$data$SCPCP000001$SINGLE_CELL), - c("SCPCS000001", "SCPCS000002") - ) + result <- add_dataset_samples(DATASET_ID, auth_token = "token", samples = "SCPCS000002") + expect_equal(result, DATASET_ID) }) -test_that("remove_dataset_samples removes a project and PUTs", { - captured_req <- NULL +test_that("remove_dataset_samples PUTs", { local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { - list( - id = DATASET_ID, - data = list( - SCPCP000001 = list( - SINGLE_CELL = list("SCPCS000001"), - SPATIAL = list(), - includes_bulk = FALSE - ), - SCPCP000002 = list( - SINGLE_CELL = list("SCPCS000003"), - SPATIAL = list(), - includes_bulk = FALSE - ) - ) - ) + list(id = DATASET_ID, data = list()) }, - req_perform = \(req, ...) { - captured_req <<- req - json_response(req$body$data) - } + req_perform = \(req, ...) 
json_response(list(id = DATASET_ID)) ) - result <- remove_dataset_samples( - DATASET_ID, - auth_token = "token", - projects = "SCPCP000002" - ) - expect_equal(captured_req$method, "PUT") - expect_equal(names(result$data), "SCPCP000001") + result <- remove_dataset_samples(DATASET_ID, auth_token = "token", projects = "SCPCP000002") + expect_equal(result, DATASET_ID) }) From afdb1df7d005f3ec899e87b3331db6a2d72fc34d Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Wed, 3 Jun 2026 23:07:54 -0400 Subject: [PATCH 06/19] fix testing indentation error --- tests/testthat/test-projects.R | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/testthat/test-projects.R b/tests/testthat/test-projects.R index a4725d6..a1bb578 100644 --- a/tests/testthat/test-projects.R +++ b/tests/testthat/test-projects.R @@ -21,29 +21,29 @@ test_that("scpca_projects returns simplified data frame by default", { expect_s3_class(projects_df$created_at, "POSIXct") expect_s3_class(projects_df$updated_at, "POSIXct") }) +}) - test_that("scpca_projects returns full data frame when simplify = FALSE", { - with_mock_dir("scpca_projects", { - projects_df <- scpca_projects(simplify = FALSE) +test_that("scpca_projects returns full data frame when simplify = FALSE", { + with_mock_dir("scpca_projects", { + projects_df <- scpca_projects(simplify = FALSE) - # Check that it returns a data frame - expect_s3_class(projects_df, "data.frame") + # Check that it returns a data frame + expect_s3_class(projects_df, "data.frame") - # Check that we have rows and columns - expect_gt(nrow(projects_df), 0) - expect_gt(ncol(projects_df), 0) + # Check that we have rows and columns + expect_gt(nrow(projects_df), 0) + expect_gt(ncol(projects_df), 0) - # Check that list columns are present (not simplified) - list_columns <- sapply(projects_df, is.list) - expect_true(any(list_columns)) + # Check that list columns are present (not simplified) + list_columns <- 
sapply(projects_df, is.list) + expect_true(any(list_columns)) - # Check for expected key columns - expect_contains(colnames(projects_df), "scpca_project_id") + # Check for expected key columns + expect_contains(colnames(projects_df), "scpca_project_id") - # Check that date columns are properly converted - expect_s3_class(projects_df$created_at, "POSIXct") - expect_s3_class(projects_df$updated_at, "POSIXct") - }) + # Check that date columns are properly converted + expect_s3_class(projects_df$created_at, "POSIXct") + expect_s3_class(projects_df$updated_at, "POSIXct") }) }) From abb682dd107cb43ccd1bd5129c806a7711f98cd6 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Wed, 3 Jun 2026 23:09:30 -0400 Subject: [PATCH 07/19] document --- NAMESPACE | 1 + _pkgdown.yml | 1 + man/await_dataset_processing.Rd | 5 +-- man/create_dataset.Rd | 10 +++--- man/dataset_status_from_detail.Rd | 19 ++++++++++++ man/download_dataset.Rd | 5 +-- man/get_dataset_detail.Rd | 5 +-- man/get_dataset_info.Rd | 51 +++++++++++++++++++++++++++++++ man/get_dataset_status.Rd | 23 +++++++------- man/make_dataset_data_df.Rd | 21 +++++++++++++ man/modify_dataset_samples.Rd | 2 +- man/replace_dataset_data.Rd | 4 +-- man/resolve_dataset_id.Rd | 9 +++--- man/set_dataset_email.Rd | 4 +-- man/start_dataset_processing.Rd | 21 ++++++++----- 15 files changed, 144 insertions(+), 37 deletions(-) create mode 100644 man/dataset_status_from_detail.Rd create mode 100644 man/get_dataset_info.Rd create mode 100644 man/make_dataset_data_df.Rd diff --git a/NAMESPACE b/NAMESPACE index 28593d4..66eb4ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(download_dataset) export(download_project) export(download_sample) export(get_auth) +export(get_dataset_info) export(get_dataset_status) export(get_project_info) export(get_project_libraries) diff --git a/_pkgdown.yml b/_pkgdown.yml index 73870af..986e2a4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -32,6 +32,7 @@ reference: contents: - create_dataset - 
get_dataset_status + - get_dataset_info - download_dataset - add_dataset_samples - replace_dataset_data diff --git a/man/await_dataset_processing.Rd b/man/await_dataset_processing.Rd index 1109953..599a465 100644 --- a/man/await_dataset_processing.Rd +++ b/man/await_dataset_processing.Rd @@ -13,8 +13,9 @@ await_dataset_processing( ) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{poll_interval}{Number of minutes to wait between status checks when \code{await_processing = TRUE}. Default is 0.5 (30 seconds).} diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd index 606c502..55f2344 100644 --- a/man/create_dataset.Rd +++ b/man/create_dataset.Rd @@ -31,19 +31,21 @@ spatial samples are always returned in Space Ranger format.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the API response as a list (invisibly), including the dataset \verb{$id} +the dataset ID as a character string (invisibly) } \description{ Creates a new user dataset without starting processing. -The returned list includes the dataset \verb{$id} along with its current contents and status. +Returns the new dataset's ID (invisibly), which you can pass to the other +dataset functions such as \code{\link[=get_dataset_info]{get_dataset_info()}}, \code{\link[=add_dataset_samples]{add_dataset_samples()}}, and +\code{\link[=start_dataset_processing]{start_dataset_processing()}}. 
} \examples{ \dontrun{ token <- get_auth("user@example.com", agree = TRUE) -ds <- create_dataset( +ds_id <- create_dataset( auth_token = token, samples = c("SCPCS000001", "SCPCS000002") ) -ds$id +ds_id } } diff --git a/man/dataset_status_from_detail.Rd b/man/dataset_status_from_detail.Rd new file mode 100644 index 0000000..1c71ea0 --- /dev/null +++ b/man/dataset_status_from_detail.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{dataset_status_from_detail} +\alias{dataset_status_from_detail} +\title{Map dataset detail status flags to a status string} +\usage{ +dataset_status_from_detail(detail) +} +\arguments{ +\item{detail}{the dataset detail list returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}} +} +\value{ +a single character string: one of "pending", "processing", +"succeeded", "failed", or "expired" +} +\description{ +Map dataset detail status flags to a status string +} +\keyword{internal} diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd index 39bb6cd..a996509 100644 --- a/man/download_dataset.Rd +++ b/man/download_dataset.Rd @@ -18,8 +18,9 @@ download_dataset( ) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{destination}{The path to the directory where the unzipped file directory should be saved. 
Default is "scpca_data".} diff --git a/man/get_dataset_detail.Rd b/man/get_dataset_detail.Rd index 2005b35..4f56d9d 100644 --- a/man/get_dataset_detail.Rd +++ b/man/get_dataset_detail.Rd @@ -7,8 +7,9 @@ get_dataset_detail(dataset, auth_token) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{auth_token}{an authorization token obtained from \code{\link[=get_auth]{get_auth()}}; must match the token used to create the dataset.} diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd new file mode 100644 index 0000000..c73a65a --- /dev/null +++ b/man/get_dataset_info.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{get_dataset_info} +\alias{get_dataset_info} +\title{Get a summary of a custom ScPCA dataset} +\usage{ +get_dataset_info(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) +} +\arguments{ +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by this function).} + +\item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the +\code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} +} +\value{ +a named list with the following elements: +\itemize{ +\item \code{id}: the dataset UUID string +\item \code{format}: the dataset file format (e.g. 
"SINGLE_CELL_EXPERIMENT", "ANN_DATA") +\item \code{status}: the processing status — one of "pending", "processing", +"succeeded", "failed", or "expired" (see \code{\link[=get_dataset_status]{get_dataset_status()}}) +\item \code{n_samples}: the number of rows in \code{samples} (one per sample-modality +combination; merged-single-cell projects are not counted) +\item \code{n_projects}: the number of projects in the dataset +\item \code{samples}: a data frame with one row per sample-modality combination and +columns \code{scpca_sample_id}, \code{scpca_project_id}, \code{modality} (character: +"single-cell" or "spatial"), and \code{includes_bulk} (logical) +\item \code{merged_projects}: a character vector of project IDs whose single-cell +data is merged; \code{character(0)} when none +} +} +\description{ +Fetches a custom dataset and returns a structured summary of its contents, +including its processing status and a per-sample table describing the modality for +each sample. +} +\details{ +Projects with merged single-cell data (where individual sample IDs are not +enumerated in the dataset record) are excluded from \code{samples} and listed in +\code{merged_projects} instead. 
+} +\examples{ +\dontrun{ +ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +info <- get_dataset_info(ds_id) +info$status +info$samples +} +} diff --git a/man/get_dataset_status.Rd b/man/get_dataset_status.Rd index 4a56ef8..3f60353 100644 --- a/man/get_dataset_status.Rd +++ b/man/get_dataset_status.Rd @@ -7,8 +7,9 @@ get_dataset_status(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} @@ -19,21 +20,21 @@ a single character string: one of "pending", "processing", } \description{ Returns a single string describing where a dataset is in the processing -lifecycle, by fetching the dataset detail and translating its status fields -(\code{is_started}, \code{is_succeeded}, \code{is_failed}). A dataset that has been started -but has neither succeeded nor failed is reported as "processing". +lifecycle. 
} \details{ Possible values are: -\describe{ -\item{"pending"}{the dataset has not been started} -\item{"processing"}{the dataset has been started but is not yet finished} -\item{"succeeded"}{processing finished and the dataset is ready to download} -\item{"failed"}{processing failed} +\itemize{ +\item \code{"pending"}: the dataset has not been started +\item \code{"processing"}: the dataset has been started but is not yet finished +\item \code{"succeeded"}: processing finished and the dataset is ready to download +\item \code{"expired"}: processing completed but the generated download has since +expired and must be regenerated +\item \code{"failed"}: processing failed } } \examples{ \dontrun{ -get_dataset_status(ds) +get_dataset_status(ds_id) } } diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd new file mode 100644 index 0000000..583bba4 --- /dev/null +++ b/man/make_dataset_data_df.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{make_dataset_data_df} +\alias{make_dataset_data_df} +\title{Make a per-sample data frame from the \verb{$data} list} +\usage{ +make_dataset_data_df(data) +} +\arguments{ +\item{data}{the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}}} +} +\value{ +a data frame with columns \code{scpca_sample_id}, \code{scpca_project_id}, +\code{modality}, and \code{includes_bulk} +} +\description{ +Transforms the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}} into a +one-row-per-sample data frame. Projects with merged single-cell data +(\code{SINGLE_CELL = "MERGED"}) are excluded. +} +\keyword{internal} diff --git a/man/modify_dataset_samples.Rd b/man/modify_dataset_samples.Rd index 61604ec..6cfc56d 100644 --- a/man/modify_dataset_samples.Rd +++ b/man/modify_dataset_samples.Rd @@ -36,7 +36,7 @@ projects keep their current value. 
Default is FALSE.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ \code{add_dataset_samples()} adds the given samples and/or all samples from the diff --git a/man/replace_dataset_data.Rd b/man/replace_dataset_data.Rd index 15afdf5..4b88c6e 100644 --- a/man/replace_dataset_data.Rd +++ b/man/replace_dataset_data.Rd @@ -26,7 +26,7 @@ all samples from each project are included.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ Replaces the samples and/or projects in an existing dataset with a new @@ -40,6 +40,6 @@ A dataset that has already started processing cannot be updated. } \examples{ \dontrun{ -replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002")) +replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002")) } } diff --git a/man/resolve_dataset_id.Rd b/man/resolve_dataset_id.Rd index ee7e54c..2588243 100644 --- a/man/resolve_dataset_id.Rd +++ b/man/resolve_dataset_id.Rd @@ -10,11 +10,12 @@ resolve_dataset_id(dataset) \item{dataset}{a dataset UUID string, or a list with an \verb{$id} element} } \value{ -the dataset ID as a length-1 character string +the dataset ID as a character string } \description{ -Accepts either a dataset UUID string or a list with an \verb{$id} element (such as -the return value of \code{\link[=create_dataset]{create_dataset()}} or \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns -the ID string, after checking that it is a valid UUID. 
+Accepts either a dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}) or a list with an \verb{$id} element (such as the value returned +by \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns the ID string, after checking that it is +a valid UUID. } \keyword{internal} diff --git a/man/set_dataset_email.Rd b/man/set_dataset_email.Rd index cc28f59..a245d01 100644 --- a/man/set_dataset_email.Rd +++ b/man/set_dataset_email.Rd @@ -15,7 +15,7 @@ set_dataset_email(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ Updates the email address the ScPCA Portal will use to notify you when the @@ -27,6 +27,6 @@ A dataset that has already been started cannot be modified. } \examples{ \dontrun{ -set_dataset_email(ds, email = "user@example.com") +set_dataset_email(ds_id, email = "user@example.com") } } diff --git a/man/start_dataset_processing.Rd b/man/start_dataset_processing.Rd index b61909e..fd7a61b 100644 --- a/man/start_dataset_processing.Rd +++ b/man/start_dataset_processing.Rd @@ -11,8 +11,9 @@ start_dataset_processing( ) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{email}{optional email address for the download notification. 
When supplied, it is set as part of the same request that starts processing.} @@ -21,7 +22,7 @@ supplied, it is set as part of the same request that starts processing.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ Starts processing of an existing custom dataset so that its files can be @@ -29,12 +30,18 @@ built for download, by sending a PUT request that sets \code{start = TRUE}. Optionally sets the notification email as part of the same request. } \details{ -Once processing has started a dataset is locked and can no longer be -modified; attempting to modify or re-start it will raise an error. +Before sending the request the current dataset status is checked via +\code{\link[=get_dataset_status]{get_dataset_status()}}: +\itemize{ +\item A \code{"pending"} or \code{"expired"} dataset is started normally. +\item A \code{"failed"} dataset is retried with a warning. +\item A \code{"processing"} or \code{"succeeded"} dataset is already underway or done; +a message is emitted and no request is sent. 
+} } \examples{ \dontrun{ -ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) -start_dataset_processing(ds, email = "user@example.com") +ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +start_dataset_processing(ds_id, email = "user@example.com") } } From 0c9d05a2883fd63476e00613c8f94d26da620311 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 09:55:21 -0400 Subject: [PATCH 08/19] Revert test removals --- tests/testthat/test-datasets.R | 257 ++++++++++++++++++++++++++++++--- 1 file changed, 236 insertions(+), 21 deletions(-) diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index b4cbfde..6b640eb 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -32,6 +32,115 @@ test_that("get_ccdl_datasets combines results across pages", { }) }) +test_that("get_ccdl_datasets passes project_id as ccdl_project_id query parameter", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) { + captured_req <<- req + list() + } + ) + + get_ccdl_datasets(project_id = "SCPCP000001") + expect_match(captured_req$url, "ccdl_project_id=SCPCP000001") +}) + +test_that("get_ccdl_datasets passes modality as ccdl_modality query parameter", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) { + captured_req <<- req + list() + } + ) + + get_ccdl_datasets(modality = "SINGLE_CELL") + expect_match(captured_req$url, "ccdl_modality=SINGLE_CELL") +}) + +test_that("get_ccdl_datasets passes format as format query parameter", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) 
{ + captured_req <<- req + list() + } + ) + + get_ccdl_datasets(format = "ANN_DATA") + expect_match(captured_req$url, "format=ANN_DATA") +}) + +test_that("get_ccdl_datasets passes merged as ccdl_is_merged query parameter", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) { + captured_req <<- req + list() + } + ) + + get_ccdl_datasets(merged = TRUE) + expect_match(captured_req$url, "ccdl_is_merged=TRUE") +}) + +test_that("get_ccdl_datasets passes include_multiplexed as includes_files_multiplexed query parameter", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) { + captured_req <<- req + list() + } + ) + + get_ccdl_datasets(include_multiplexed = TRUE) + expect_match(captured_req$url, "includes_files_multiplexed=TRUE") + + get_ccdl_datasets(include_multiplexed = FALSE) + expect_match(captured_req$url, "includes_files_multiplexed=FALSE") +}) + +test_that("get_ccdl_datasets passes metadata_only as ccdl_name=ALL_METADATA query parameter", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) { + captured_req <<- req + list() + } + ) + + get_ccdl_datasets(metadata_only = TRUE) + expect_match(captured_req$url, "ccdl_name=ALL_METADATA") +}) + +test_that("get_ccdl_datasets includes api-key header when auth_token is provided", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) { + captured_req <<- req + list() + } + ) + + get_ccdl_datasets(auth_token = "test-token-abc") + expect_equal( + httr2::req_get_headers(captured_req, "reveal")$`api-key`, + "test-token-abc" + ) +}) + +test_that("get_ccdl_datasets does not include api-key header when auth_token is empty", { + captured_req <- NULL + local_mocked_bindings( + req_perform_iterative = function(req, ...) 
{ + captured_req <<- req + list() + } + ) + + get_ccdl_datasets() + expect_null(httr2::req_get_headers(captured_req, "reveal")$`api-key`) +}) # build_dataset_data tests @@ -158,7 +267,6 @@ test_that("create_dataset errors when spatial format is requested", { }) test_that("create_dataset POSTs with start = FALSE", { - captured_req <- NULL local_mocked_bindings( build_dataset_data = \(...) { list( @@ -170,8 +278,8 @@ test_that("create_dataset POSTs with start = FALSE", { ) }, req_perform = \(req, ...) { - captured_req <<- req - json_response(c(req$body$data, list(id = "new-dataset-uuid"))) + body <- req$body$data + json_response(c(body, list(id = "new-dataset-uuid"))) } ) @@ -182,11 +290,10 @@ test_that("create_dataset POSTs with start = FALSE", { }, "new-dataset-uuid" ) - expect_false(captured_req$body$data$start) - expect_equal(result, "new-dataset-uuid") + expect_false(result$start) }) -test_that("create_dataset returns id invisibly and messages with dataset id", { +test_that("create_dataset returns response invisibly and messages with dataset id", { local_mocked_bindings( build_dataset_data = \(...) { list( @@ -212,27 +319,85 @@ test_that("create_dataset returns id invisibly and messages with dataset id", { }, "new-dataset-uuid" ) - expect_equal(result, "new-dataset-uuid") + expect_equal(result$id, "new-dataset-uuid") }) test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", { withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token") - captured_key <- NULL local_mocked_bindings( build_dataset_data = \(...) list(), req_perform = \(req, ...) 
{ - captured_key <<- httr2::req_get_headers(req, "reveal")$`api-key` - json_response(list(id = "new-dataset-uuid")) + json_response(list( + id = "new-dataset-uuid", + api_key = httr2::req_get_headers(req, "reveal")$`api-key` + )) } ) # called without auth_token; the token should come from the environment - suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce")) - expect_equal(captured_key, "env-token") + result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce")) + expect_equal(result$api_key, "env-token") }) # get_dataset_detail tests +test_that("get_dataset_detail returns dataset with data and status fields", { + local_mocked_bindings( + req_perform = \(req, ...) { + json_response(list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), + SPATIAL = list(), + includes_bulk = FALSE + ) + ), + is_started = FALSE, + is_succeeded = FALSE, + total_sample_count = 2, + computed_file = NULL + )) + } + ) + + result <- get_dataset_detail(DATASET_ID, auth_token = "test-token") + + expect_type(result, "list") + expect_equal(result$id, DATASET_ID) + expect_equal(result$format, "SINGLE_CELL_EXPERIMENT") + expect_false(result$is_started) + expect_false(result$is_succeeded) +}) + +test_that("get_dataset_detail returns data field with project and sample structure", { + local_mocked_bindings( + req_perform = \(req, ...) 
{ + json_response(list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), + SPATIAL = list(), + includes_bulk = FALSE + ) + ) + )) + } + ) + + result <- get_dataset_detail(DATASET_ID, auth_token = "test-token") + + expect_type(result$data, "list") + expect_true("SCPCP000001" %in% names(result$data)) + expect_contains( + result$data$SCPCP000001$SINGLE_CELL, + c("SCPCS000001", "SCPCS000002") + ) +}) + test_that("get_dataset_detail includes api-key header when auth_token is provided", { local_mocked_bindings( req_perform = \(req, ...) { @@ -285,6 +450,32 @@ test_that("get_ccdl_datasets handles 403 errors with an authorization message", ) }) +test_that("get_dataset_detail accepts a list with $id in place of a string", { + local_mocked_bindings( + req_perform = \(req, ...) { + json_response(list(id = DATASET_ID, data = list())) + } + ) + + dataset_list <- list(id = DATASET_ID, data = list()) + result <- get_dataset_detail(dataset_list, auth_token = "test-token") + expect_equal(result$id, DATASET_ID) +}) + +test_that("get_dataset_detail errors when list has no $id element", { + expect_error( + get_dataset_detail(list(data = list()), auth_token = "test-token"), + "dataset must be an id string or contain an \\$id element" + ) +}) + +test_that("get_dataset_detail errors when dataset is not a string or list", { + expect_error( + get_dataset_detail(123, auth_token = "test-token"), + "dataset must be an id string or contain an \\$id element" + ) +}) + # get_dataset_status tests test_that("get_dataset_status maps detail status fields to a status string", { @@ -584,7 +775,7 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", { }, req_perform = \(req, ...) 
{ captured_req <<- req - json_response(list(id = DATASET_ID, data = req$body$data)) + json_response(req$body$data) } ) @@ -596,8 +787,8 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", { expect_equal(captured_req$method, "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_null(captured_req$body$data$format) - expect_equal(result, DATASET_ID) + expect_null(result$format) + expect_true("SCPCP000001" %in% names(result$data)) }) # set_dataset_email tests @@ -607,7 +798,7 @@ test_that("set_dataset_email PUTs a new email", { local_mocked_bindings( req_perform = \(req, ...) { captured_req <<- req - json_response(list(id = DATASET_ID, email = req$body$data$email)) + json_response(req$body$data) } ) @@ -618,8 +809,7 @@ test_that("set_dataset_email PUTs a new email", { ) expect_equal(captured_req$method, "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_equal(captured_req$body$data$email, "user@example.com") - expect_equal(result, DATASET_ID) + expect_equal(result$email, "user@example.com") }) test_that("set_dataset_email errors when email is not a single string", { @@ -656,7 +846,7 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", { get_dataset_status = \(dataset, auth_token) "pending", req_perform = \(req, ...) { captured_req <<- req - json_response(list(id = DATASET_ID)) + json_response(req$body$data) } ) @@ -672,7 +862,30 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", { ) expect_equal(captured_req$method, "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_equal(result, DATASET_ID) + expect_true(result$start) + expect_null(result$email) +}) + +test_that("start_dataset_processing includes email in the same request when provided", { + captured_req <- NULL + local_mocked_bindings( + get_dataset_status = \(dataset, auth_token) "pending", + req_perform = \(req, ...) 
{ + captured_req <<- req + json_response(req$body$data) + } + ) + + result <- suppressMessages( + start_dataset_processing( + DATASET_ID, + email = "user@example.com", + auth_token = "token" + ) + ) + expect_equal(captured_req$method, "PUT") + expect_true(result$start) + expect_equal(result$email, "user@example.com") }) test_that("start_dataset_processing errors when email is not a single string", { @@ -707,7 +920,7 @@ test_that("start_dataset_processing emits a message and sends no request when al result <- start_dataset_processing(DATASET_ID, auth_token = "token"), "is already processing" ) - expect_equal(result, DATASET_ID) + expect_null(result) expect_false(put_called) }) @@ -746,6 +959,7 @@ test_that("start_dataset_processing warns and retries when previously failed", { "previously failed to process" ) expect_equal(captured_req$method, "PUT") + expect_true(captured_req$body$data$start) }) test_that("start_dataset_processing restarts an expired dataset", { @@ -762,6 +976,7 @@ test_that("start_dataset_processing restarts an expired dataset", { start_dataset_processing(DATASET_ID, auth_token = "token") ) expect_equal(captured_req$method, "PUT") + expect_true(captured_req$body$data$start) }) test_that("start_dataset_processing surfaces a locked-dataset error on a 409 race", { From e1f96cbb2da053edeafba631fb285f44de6eada8 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 10:00:10 -0400 Subject: [PATCH 09/19] ignore claude directory for rbuild --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index d3bd575..a90605e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,6 +2,7 @@ ^LICENSE\.md$ ^ScPCAr\.Rproj$ ^\.Rproj\.user$ +^\.claude$ ^\.github$ ^\.pre-commit-config\.yaml$ ^_pkgdown\.yml$ From fcc337f7e3477137a0d538f9b90317406e87a90d Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 10:12:06 -0400 Subject: [PATCH 10/19] standardize testing with more detail --- 
tests/testthat/test-datasets.R | 55 +++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index 6b640eb..bc70cea 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -267,6 +267,7 @@ test_that("create_dataset errors when spatial format is requested", { }) test_that("create_dataset POSTs with start = FALSE", { + captured_req <- NULL local_mocked_bindings( build_dataset_data = \(...) { list( @@ -278,8 +279,8 @@ test_that("create_dataset POSTs with start = FALSE", { ) }, req_perform = \(req, ...) { - body <- req$body$data - json_response(c(body, list(id = "new-dataset-uuid"))) + captured_req <<- req + json_response(list(id = "new-dataset-uuid")) } ) @@ -290,10 +291,12 @@ test_that("create_dataset POSTs with start = FALSE", { }, "new-dataset-uuid" ) - expect_false(result$start) + expect_equal(httr2::req_get_method(captured_req), "POST") + expect_false(captured_req$body$data$start) + expect_equal(result, "new-dataset-uuid") }) -test_that("create_dataset returns response invisibly and messages with dataset id", { +test_that("create_dataset returns the dataset id invisibly and messages with dataset id", { local_mocked_bindings( build_dataset_data = \(...) { list( @@ -319,24 +322,24 @@ test_that("create_dataset returns response invisibly and messages with dataset i }, "new-dataset-uuid" ) - expect_equal(result$id, "new-dataset-uuid") + expect_equal(result, "new-dataset-uuid") }) test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", { withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token") + captured_req <- NULL local_mocked_bindings( build_dataset_data = \(...) list(), req_perform = \(req, ...) 
{ - json_response(list( - id = "new-dataset-uuid", - api_key = httr2::req_get_headers(req, "reveal")$`api-key` - )) + captured_req <<- req + json_response(list(id = "new-dataset-uuid")) } ) # called without auth_token; the token should come from the environment result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce")) - expect_equal(result$api_key, "env-token") + expect_equal(httr2::req_get_headers(captured_req, "reveal")$`api-key`, "env-token") + expect_equal(result, "new-dataset-uuid") }) # get_dataset_detail tests @@ -785,10 +788,11 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", { samples = "SCPCS000001" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_null(result$format) - expect_true("SCPCP000001" %in% names(result$data)) + expect_null(captured_req$body$data$format) + expect_true("SCPCP000001" %in% names(captured_req$body$data$data)) + expect_equal(result, DATASET_ID) }) # set_dataset_email tests @@ -807,9 +811,10 @@ test_that("set_dataset_email PUTs a new email", { auth_token = "token", email = "user@example.com" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_equal(result$email, "user@example.com") + expect_equal(captured_req$body$data$email, "user@example.com") + expect_equal(result, DATASET_ID) }) test_that("set_dataset_email errors when email is not a single string", { @@ -860,10 +865,11 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", { }, "processing started" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_true(result$start) - expect_null(result$email) + 
expect_true(captured_req$body$data$start) + expect_null(captured_req$body$data$email) + expect_equal(result, DATASET_ID) }) test_that("start_dataset_processing includes email in the same request when provided", { @@ -883,9 +889,10 @@ test_that("start_dataset_processing includes email in the same request when prov auth_token = "token" ) ) - expect_equal(captured_req$method, "PUT") - expect_true(result$start) - expect_equal(result$email, "user@example.com") + expect_equal(httr2::req_get_method(captured_req), "PUT") + expect_true(captured_req$body$data$start) + expect_equal(captured_req$body$data$email, "user@example.com") + expect_equal(result, DATASET_ID) }) test_that("start_dataset_processing errors when email is not a single string", { @@ -920,7 +927,7 @@ test_that("start_dataset_processing emits a message and sends no request when al result <- start_dataset_processing(DATASET_ID, auth_token = "token"), "is already processing" ) - expect_null(result) + expect_equal(result, DATASET_ID) expect_false(put_called) }) @@ -958,7 +965,7 @@ test_that("start_dataset_processing warns and retries when previously failed", { ), "previously failed to process" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_true(captured_req$body$data$start) }) @@ -975,7 +982,7 @@ test_that("start_dataset_processing restarts an expired dataset", { suppressMessages( start_dataset_processing(DATASET_ID, auth_token = "token") ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_true(captured_req$body$data$start) }) From d7014df7a689ef8350bc1bea87a0856f7bbf07a2 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 10:34:14 -0400 Subject: [PATCH 11/19] fix modality test: can't have one sample with both single cell and spatial!
--- tests/testthat/test-datasets.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index bc70cea..81f5b80 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -570,7 +570,7 @@ test_that("get_dataset_info returns structured summary with samples data frame", ), SCPCP000002 = list( SINGLE_CELL = list("SCPCS000003"), - SPATIAL = list("SCPCS000003"), + SPATIAL = list("SCPCS000004"), includes_bulk = TRUE ) ) @@ -584,7 +584,7 @@ test_that("get_dataset_info returns structured summary with samples data frame", expect_equal(info$format, "SINGLE_CELL_EXPERIMENT") expect_equal(info$status, "pending") expect_equal(info$n_projects, 2) - # SCPCS000003 appears in both modalities: 2 SC rows + 1 spatial = 4 rows total + # 4 sample rows: 2 single-cell in the first project, 1 single-cell + 1 spatial in the second expect_equal(info$n_samples, 4) expect_equal(info$merged_projects, character(0)) expect_s3_class(info$samples, "data.frame") @@ -592,9 +592,15 @@ test_that("get_dataset_info returns structured summary with samples data frame", colnames(info$samples), c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk") ) - # SCPCS000003 has two rows — one per modality - rows_003 <- info$samples[info$samples$scpca_sample_id == "SCPCS000003", ] - expect_setequal(rows_003$modality, c("single-cell", "spatial")) + # single-cell and spatial samples are distinct, each with its own modality row + expect_equal( + info$samples$modality[info$samples$scpca_sample_id == "SCPCS000003"], + "single-cell" + ) + expect_equal( + info$samples$modality[info$samples$scpca_sample_id == "SCPCS000004"], + "spatial" + ) # SCPCP000001 samples should not have includes_bulk rows_p1 <- info$samples[info$samples$scpca_project_id == "SCPCP000001", ] expect_false(all(rows_p1$includes_bulk)) From e609cd05056264741331ca5635b0bb7ae3049e2b Mon Sep 17 00:00:00 2001 From: Joshua 
Shapiro Date: Tue, 9 Jun 2026 11:21:12 -0400 Subject: [PATCH 12/19] get full sample size --- R/datasets.R | 34 +++++++++++++++++---------- tests/testthat/test-datasets.R | 42 +++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 2665f50..ea53bd2 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -631,13 +631,12 @@ remove_dataset_samples <- function( #' @importFrom dplyr .data #' #' @returns a data frame with columns `scpca_sample_id`, `scpca_project_id`, -#' `modality`, and `includes_bulk` +#' and `modality` make_dataset_data_df <- function(data) { empty <- data.frame( scpca_sample_id = character(), scpca_project_id = character(), - modality = character(), - includes_bulk = logical() + modality = character() ) if (length(data) == 0) { return(empty) @@ -645,7 +644,6 @@ make_dataset_data_df <- function(data) { result <- data |> purrr::imap(\(project, project_id) { - includes_bulk <- isTRUE(project$includes_bulk) single_cell_ids <- project$SINGLE_CELL # Datasets created outside this package may be merged. # projects are excluded here and surfaced via `merged_projects` in @@ -665,8 +663,7 @@ make_dataset_data_df <- function(data) { modality = rep( c("single-cell", "spatial"), times = c(length(sc_ids), length(sp_ids)) - ), - includes_bulk = includes_bulk + ) ) }) |> purrr::list_rbind() |> @@ -697,14 +694,20 @@ make_dataset_data_df <- function(data) { #' * `format`: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA") #' * `status`: the processing status — one of "pending", "processing", #' "succeeded", "failed", or "expired" (see [get_dataset_status()]) -#' * `n_samples`: the number of rows in `samples` (one per sample-modality -#' combination; merged-single-cell projects are not counted) +#' * `n_samples`: the total number of samples in the dataset, taken from the +#' API's `total_sample_count`. 
This includes samples in merged projects, +#' which are not enumerated in `samples`, so `n_samples` can exceed +#' `nrow(samples)`. #' * `n_projects`: the number of projects in the dataset #' * `samples`: a data frame with one row per sample-modality combination and -#' columns `scpca_sample_id`, `scpca_project_id`, `modality` (character: -#' "single-cell" or "spatial"), and `includes_bulk` (logical) +#' columns `scpca_sample_id`, `scpca_project_id`, and `modality` (character: +#' "single-cell" or "spatial") #' * `merged_projects`: a character vector of project IDs whose single-cell #' data is merged; `character(0)` when none +#' * `bulk_projects`: a character vector of project IDs that include bulk +#' RNA-seq data; `character(0)` when none. Bulk inclusion is recorded per +#' project rather than per sample, so it is reported here rather than in +#' `samples`. #' #' @import httr2 #' @export @@ -725,15 +728,22 @@ get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN" purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |> names() |> as.character() + bulk_projects <- detail$data |> + purrr::keep(\(p) isTRUE(p$includes_bulk)) |> + names() |> + as.character() list( id = detail$id, format = detail$format, status = dataset_status_from_detail(detail), - n_samples = nrow(samples), + # total_sample_count comes from the API and counts all samples, including + # those in merged projects that are not enumerated in `samples`. 
+ n_samples = detail$total_sample_count, n_projects = length(detail$data), samples = samples, - merged_projects = merged_projects + merged_projects = merged_projects, + bulk_projects = bulk_projects ) } diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index 81f5b80..8d559ea 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -573,7 +573,8 @@ test_that("get_dataset_info returns structured summary with samples data frame", SPATIAL = list("SCPCS000004"), includes_bulk = TRUE ) - ) + ), + total_sample_count = 4 ) } ) @@ -584,13 +585,13 @@ test_that("get_dataset_info returns structured summary with samples data frame", expect_equal(info$format, "SINGLE_CELL_EXPERIMENT") expect_equal(info$status, "pending") expect_equal(info$n_projects, 2) - # 4 sample rows: 2 single-cell in the first project, 1 single-cell + 1 spatial in the second + # n_samples comes from the API total_sample_count (here equal to the 4 enumerated rows) expect_equal(info$n_samples, 4) expect_equal(info$merged_projects, character(0)) expect_s3_class(info$samples, "data.frame") expect_setequal( colnames(info$samples), - c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk") + c("scpca_sample_id", "scpca_project_id", "modality") ) # single-cell and spatial samples are distinct, each with its own modality row expect_equal( @@ -601,12 +602,8 @@ test_that("get_dataset_info returns structured summary with samples data frame", info$samples$modality[info$samples$scpca_sample_id == "SCPCS000004"], "spatial" ) - # SCPCP000001 samples should not have includes_bulk - rows_p1 <- info$samples[info$samples$scpca_project_id == "SCPCP000001", ] - expect_false(all(rows_p1$includes_bulk)) - # SCPCP000002 samples should have includes_bulk - rows_p2 <- info$samples[info$samples$scpca_project_id == "SCPCP000002", ] - expect_true(all(rows_p2$includes_bulk)) + # bulk inclusion is reported per project, not per sample + expect_equal(info$bulk_projects, 
"SCPCP000002") }) test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", { @@ -616,7 +613,8 @@ test_that("get_dataset_info returns empty samples data frame with correct schema id = DATASET_ID, format = "ANN_DATA", is_started = FALSE, - data = list() + data = list(), + total_sample_count = 0 ) } ) @@ -626,9 +624,10 @@ test_that("get_dataset_info returns empty samples data frame with correct schema expect_equal(info$n_samples, 0) expect_equal(info$n_projects, 0) expect_equal(nrow(info$samples), 0) + expect_equal(info$bulk_projects, character(0)) expect_setequal( colnames(info$samples), - c("scpca_sample_id", "scpca_project_id", "modality", "includes_bulk") + c("scpca_sample_id", "scpca_project_id", "modality") ) }) @@ -650,17 +649,22 @@ test_that("get_dataset_info surfaces merged projects separately and excludes the SPATIAL = list(), includes_bulk = FALSE ) - ) + ), + # SCPCP000001 contributes 1 enumerated sample; the merged SCPCP000005 + # contributes 3 samples that are not enumerated in `data` + total_sample_count = 4 ) } ) info <- get_dataset_info(DATASET_ID, auth_token = "token") - # merged project excluded from samples and n_samples - expect_equal(info$n_samples, 1) + # merged project's samples are not enumerated in the samples table + expect_equal(nrow(info$samples), 1) expect_equal(info$samples$scpca_sample_id, "SCPCS000001") - # but counted in n_projects and surfaced in merged_projects + # but n_samples uses the API total_sample_count, which counts them + expect_equal(info$n_samples, 4) + # merged project counted in n_projects and surfaced in merged_projects expect_equal(info$n_projects, 2) expect_equal(info$merged_projects, "SCPCP000005") }) @@ -675,7 +679,8 @@ test_that("get_dataset_info derives status from detail without a second API call format = "ANN_DATA", is_started = TRUE, is_succeeded = TRUE, - data = list() + data = list(), + total_sample_count = 0 ) } ) @@ -704,14 +709,15 @@ test_that("get_dataset_info 
prunes projects where both modality lists are empty" SPATIAL = list(), includes_bulk = FALSE ) - ) + ), + total_sample_count = 1 ) } ) info <- get_dataset_info(DATASET_ID, auth_token = "token") - expect_equal(info$n_samples, 1) + expect_equal(nrow(info$samples), 1) expect_equal(info$samples$scpca_project_id, "SCPCP000001") expect_false("SCPCP000002" %in% info$samples$scpca_project_id) }) From 8e277ff019af434740b7b9ce44cc2d2422795950 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 17:43:11 -0400 Subject: [PATCH 13/19] Give a more complete sample table --- R/datasets.R | 142 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 92 insertions(+), 50 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index ea53bd2..8503c55 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -619,24 +619,38 @@ remove_dataset_samples <- function( } -#' Make a per-sample data frame from the `$data` list -#' -#' Transforms the project-keyed `$data` list from [get_dataset_detail()] into a -#' one-row-per-sample data frame. Projects with merged single-cell data -#' (`SINGLE_CELL = "MERGED"`) are excluded. +#' Build the per-sample data frame for a dataset +#' +#' For each project in the dataset `$data` list, fetches the project's sample +#' metadata with [get_project_samples()] and keeps only the samples the dataset includes: +#' for a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`, +#' and for a merged project, all of the project's single-cell samples. +#' Each modality is reported only when it is requested for the sample: +#' `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the +#' sample is not included as single-cell), +#' `has_spatial` marks spatial inclusion +#' `has_bulk` reflects the project's `includes_bulk` request +#' intersected with whether the sample actually has bulk data. +#' `has_cite_seq` and `has_multiplexed` come from the sample records. 
#' #' @param data the project-keyed `$data` list from [get_dataset_detail()] #' #' @keywords internal #' @importFrom dplyr .data #' -#' @returns a data frame with columns `scpca_sample_id`, `scpca_project_id`, -#' and `modality` +#' @returns a data frame with one row per included sample and columns +#' `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character: "cell", +#' "nucleus", or `NA`), `has_spatial`, `has_bulk`, `has_cite_seq`, and +#' `has_multiplexed` (all logical) make_dataset_data_df <- function(data) { - empty <- data.frame( + empty <- tibble::tibble( scpca_sample_id = character(), scpca_project_id = character(), - modality = character() + seq_unit = character(), + has_spatial = logical(), + has_bulk = logical(), + has_cite_seq = logical(), + has_multiplexed = logical() ) if (length(data) == 0) { return(empty) @@ -644,27 +658,62 @@ make_dataset_data_df <- function(data) { result <- data |> purrr::imap(\(project, project_id) { - single_cell_ids <- project$SINGLE_CELL - # Datasets created outside this package may be merged. - # projects are excluded here and surfaced via `merged_projects` in - # get_dataset_info() instead. - if (identical(single_cell_ids, "MERGED")) { - return(NULL) - } - sc_ids <- as.character(single_cell_ids) - sp_ids <- as.character(project$SPATIAL) - if (length(sc_ids) == 0 && length(sp_ids) == 0) { - return(NULL) + merged <- identical(project$SINGLE_CELL, "MERGED") + + # The project's sample metadata has the modality details we will need. + project_samples <- get_project_samples(project_id, simplify = FALSE) + + # Get single cell samples for the project: + # - if merged from the projeect_samples metadata + # - if not merged, from the request list. 
+ if (merged) { + single_cell_ids <- project_samples$scpca_sample_id[ + project_samples$has_single_cell_data + ] + } else { + single_cell_ids <- as.character(project$SINGLE_CELL) } - data.frame( - scpca_sample_id = c(sc_ids, sp_ids), - scpca_project_id = project_id, - modality = rep( - c("single-cell", "spatial"), - times = c(length(sc_ids), length(sp_ids)) + spatial_ids <- as.character(project$SPATIAL) + included_ids <- union(single_cell_ids, spatial_ids) + requested_bulk <- isTRUE(project$includes_bulk) + + project_samples |> + # keep only the samples the dataset requests for this project + dplyr::filter(.data$scpca_sample_id %in% included_ids) |> + dplyr::mutate( + scpca_project_id = project_id, + # the single-cell sequencing unit (cell or nucleus), or NA when + # single-cell is not requested for the sample + seq_unit = purrr::map2_chr( + .data$seq_units, + .data$scpca_sample_id, + \(units, sample_id) { + if (!sample_id %in% single_cell_ids) { + return(NA_character_) + } + # get only the nucleus or cell (not spot or bulk) + # if both are present (unlikely), combine with a comma + intersect(c("cell", "nucleus"), as.character(units)) |> + paste(collapse = ",") + } + ), + # only modalities requested for the sample are reported; has_bulk also + # requires the sample to actually have bulk data + has_spatial = .data$scpca_sample_id %in% spatial_ids, + has_bulk = requested_bulk & .data$has_bulk_rna_seq, + has_cite_seq = .data$has_cite_seq_data, + has_multiplexed = .data$has_multiplexed_data + ) |> + dplyr::select( + "scpca_sample_id", + "scpca_project_id", + "seq_unit", + "has_spatial", + "has_bulk", + "has_cite_seq", + "has_multiplexed" ) - ) }) |> purrr::list_rbind() |> dplyr::arrange(.data$scpca_sample_id) @@ -676,12 +725,14 @@ make_dataset_data_df <- function(data) { #' Get a summary of a custom ScPCA dataset #' #' Fetches a custom dataset and returns a structured summary of its contents, -#' including its processing status and a per-sample table describing the 
modality for -#' each sample. +#' including its processing status and a per-sample table describing the modality +#' of each sample. #' -#' Projects with merged single-cell data (where individual sample IDs are not -#' enumerated in the dataset record) are excluded from `samples` and listed in -#' `merged_projects` instead. +#' For each project, the included samples and their modality details are looked +#' up from the project's sample records (one request per project), so merged +#' projects (whose individual sample IDs are not enumerated in the dataset +#' record) are expanded to all of their single-cell samples. Projects whose +#' single-cell data is merged are also listed in `merged_projects`. #' #' @param dataset the dataset UUID string (such as the value returned by #' [create_dataset()]), or a list with an `$id` element (such as the value @@ -695,19 +746,16 @@ make_dataset_data_df <- function(data) { #' * `status`: the processing status — one of "pending", "processing", #' "succeeded", "failed", or "expired" (see [get_dataset_status()]) #' * `n_samples`: the total number of samples in the dataset, taken from the -#' API's `total_sample_count`. This includes samples in merged projects, -#' which are not enumerated in `samples`, so `n_samples` can exceed -#' `nrow(samples)`. 
+#' API's `total_sample_count` #' * `n_projects`: the number of projects in the dataset -#' * `samples`: a data frame with one row per sample-modality combination and -#' columns `scpca_sample_id`, `scpca_project_id`, and `modality` (character: -#' "single-cell" or "spatial") +#' * `samples`: a data frame with one row per included sample and columns +#' `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character; the +#' single-cell sequencing unit "cell" or "nucleus", or `NA` when the sample +#' is not included as single-cell), `has_spatial`, `has_bulk` (whether the +#' dataset request includes bulk for that sample), `has_cite_seq`, and +#' `has_multiplexed` (all logical) #' * `merged_projects`: a character vector of project IDs whose single-cell #' data is merged; `character(0)` when none -#' * `bulk_projects`: a character vector of project IDs that include bulk -#' RNA-seq data; `character(0)` when none. Bulk inclusion is recorded per -#' project rather than per sample, so it is reported here rather than in -#' `samples`. #' #' @import httr2 #' @export @@ -728,22 +776,16 @@ get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN" purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |> names() |> as.character() - bulk_projects <- detail$data |> - purrr::keep(\(p) isTRUE(p$includes_bulk)) |> - names() |> - as.character() list( id = detail$id, format = detail$format, status = dataset_status_from_detail(detail), - # total_sample_count comes from the API and counts all samples, including - # those in merged projects that are not enumerated in `samples`. + # total_sample_count comes from the API and counts all samples in the dataset. 
n_samples = detail$total_sample_count, n_projects = length(detail$data), samples = samples, - merged_projects = merged_projects, - bulk_projects = bulk_projects + merged_projects = merged_projects ) } From 5e91a291cdbb2c5d8971d2e1e189da3b6d5f39ac Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 17:43:36 -0400 Subject: [PATCH 14/19] update dataset tests --- tests/testthat/test-datasets.R | 189 ++++++++++++++++++++++++++++----- 1 file changed, 163 insertions(+), 26 deletions(-) diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index 8d559ea..af45212 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -554,7 +554,7 @@ test_that("get_dataset_status errors when auth_token is empty", { # get_dataset_info tests -test_that("get_dataset_info returns structured summary with samples data frame", { +test_that("get_dataset_info builds a per-sample table from project sample data", { local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { list( @@ -576,6 +576,32 @@ test_that("get_dataset_info returns structured summary with samples data frame", ), total_sample_count = 4 ) + }, + get_project_samples = \(project_id, simplify = TRUE) { + if (project_id == "SCPCP000001") { + # SCPCS000099 belongs to the project but is not in the dataset request + tibble::tibble( + scpca_sample_id = c("SCPCS000001", "SCPCS000002", "SCPCS000099"), + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = FALSE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell", "cell", "cell") + ) + } else { + tibble::tibble( + scpca_sample_id = c("SCPCS000003", "SCPCS000004"), + scpca_project_id = project_id, + has_single_cell_data = c(TRUE, FALSE), + has_spatial_data = c(FALSE, TRUE), + has_bulk_rna_seq = c(TRUE, FALSE), + has_cite_seq_data = c(TRUE, FALSE), + has_multiplexed_data = c(FALSE, FALSE), + seq_units = list(c("cell", "bulk"), 
"spot") + ) + } } ) @@ -585,25 +611,83 @@ test_that("get_dataset_info returns structured summary with samples data frame", expect_equal(info$format, "SINGLE_CELL_EXPERIMENT") expect_equal(info$status, "pending") expect_equal(info$n_projects, 2) - # n_samples comes from the API total_sample_count (here equal to the 4 enumerated rows) expect_equal(info$n_samples, 4) expect_equal(info$merged_projects, character(0)) + expect_null(info$bulk_projects) expect_s3_class(info$samples, "data.frame") expect_setequal( colnames(info$samples), - c("scpca_sample_id", "scpca_project_id", "modality") - ) - # single-cell and spatial samples are distinct, each with its own modality row - expect_equal( - info$samples$modality[info$samples$scpca_sample_id == "SCPCS000003"], - "single-cell" + c( + "scpca_sample_id", + "scpca_project_id", + "seq_unit", + "has_spatial", + "has_bulk", + "has_cite_seq", + "has_multiplexed" + ) ) - expect_equal( - info$samples$modality[info$samples$scpca_sample_id == "SCPCS000004"], - "spatial" + # one row per included sample; the unrequested SCPCS000099 is filtered out + expect_equal(nrow(info$samples), 4) + expect_false("SCPCS000099" %in% info$samples$scpca_sample_id) + + field <- \(col, id) info$samples[[col]][info$samples$scpca_sample_id == id] + # seq_unit is the single-cell unit, or NA for a spatial-only sample + expect_equal(field("seq_unit", "SCPCS000001"), "cell") + expect_equal(field("seq_unit", "SCPCS000003"), "cell") + expect_true(is.na(field("seq_unit", "SCPCS000004"))) + + # only requested modalities are reported + expect_true(field("has_spatial", "SCPCS000004")) + expect_false(field("has_spatial", "SCPCS000001")) + + expect_true(field("has_cite_seq", "SCPCS000003")) + expect_false(field("has_cite_seq", "SCPCS000001")) + + # has_bulk reflects the request AND availability + expect_true(field("has_bulk", "SCPCS000003")) # requested + available + expect_false(field("has_bulk", "SCPCS000001")) # project did not request bulk + 
expect_false(field("has_bulk", "SCPCS000004")) # requested but sample has none + expect_false(any(info$samples$has_multiplexed)) +}) + +test_that("get_dataset_info combines modalities for a sample included as single-cell and spatial", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001"), + SPATIAL = list("SCPCS000001"), + includes_bulk = FALSE + ) + ), + total_sample_count = 1 + ) + }, + get_project_samples = \(project_id, simplify = TRUE) { + tibble::tibble( + scpca_sample_id = "SCPCS000001", + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = TRUE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list(c("cell", "spot")) + ) + } ) - # bulk inclusion is reported per project, not per sample - expect_equal(info$bulk_projects, "SCPCP000002") + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + + # one row for the sample: single-cell unit plus spatial + expect_equal(nrow(info$samples), 1) + expect_equal(info$samples$seq_unit, "cell") + expect_true(info$samples$has_spatial) }) test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", { @@ -624,14 +708,22 @@ test_that("get_dataset_info returns empty samples data frame with correct schema expect_equal(info$n_samples, 0) expect_equal(info$n_projects, 0) expect_equal(nrow(info$samples), 0) - expect_equal(info$bulk_projects, character(0)) + expect_null(info$bulk_projects) expect_setequal( colnames(info$samples), - c("scpca_sample_id", "scpca_project_id", "modality") + c( + "scpca_sample_id", + "scpca_project_id", + "seq_unit", + "has_spatial", + "has_bulk", + "has_cite_seq", + "has_multiplexed" + ) ) }) -test_that("get_dataset_info surfaces merged projects separately and excludes them from samples", { 
+test_that("get_dataset_info expands merged projects to all their single-cell samples", { local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { list( @@ -650,23 +742,55 @@ test_that("get_dataset_info surfaces merged projects separately and excludes the includes_bulk = FALSE ) ), - # SCPCP000001 contributes 1 enumerated sample; the merged SCPCP000005 - # contributes 3 samples that are not enumerated in `data` total_sample_count = 4 ) + }, + get_project_samples = \(project_id, simplify = TRUE) { + if (project_id == "SCPCP000001") { + tibble::tibble( + scpca_sample_id = "SCPCS000001", + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = FALSE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell") + ) + } else { + # merged project: all single-cell samples are included; the + # non-single-cell SCPCS000053 is not + tibble::tibble( + scpca_sample_id = c("SCPCS000050", "SCPCS000051", "SCPCS000052", "SCPCS000053"), + scpca_project_id = project_id, + has_single_cell_data = c(TRUE, TRUE, TRUE, FALSE), + has_spatial_data = c(FALSE, FALSE, FALSE, TRUE), + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell", "cell", "nucleus", "spot") + ) + } } ) info <- get_dataset_info(DATASET_ID, auth_token = "token") - # merged project's samples are not enumerated in the samples table - expect_equal(nrow(info$samples), 1) - expect_equal(info$samples$scpca_sample_id, "SCPCS000001") - # but n_samples uses the API total_sample_count, which counts them - expect_equal(info$n_samples, 4) - # merged project counted in n_projects and surfaced in merged_projects - expect_equal(info$n_projects, 2) + # merged project still surfaced in merged_projects expect_equal(info$merged_projects, "SCPCP000005") + # its single-cell samples are expanded into the table; SCPCS000053 is excluded + expect_setequal( + info$samples$scpca_sample_id, + 
c("SCPCS000001", "SCPCS000050", "SCPCS000051", "SCPCS000052") + ) + expect_false("SCPCS000053" %in% info$samples$scpca_sample_id) + # the nucleus seq_unit is reported for that sample + expect_equal( + info$samples$seq_unit[info$samples$scpca_sample_id == "SCPCS000052"], + "nucleus" + ) + expect_equal(info$n_projects, 2) + expect_equal(info$n_samples, 4) }) test_that("get_dataset_info derives status from detail without a second API call", { @@ -691,7 +815,7 @@ test_that("get_dataset_info derives status from detail without a second API call expect_equal(info$status, "succeeded") }) -test_that("get_dataset_info prunes projects where both modality lists are empty", { +test_that("get_dataset_info prunes projects where nothing is requested", { local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { list( @@ -712,6 +836,19 @@ test_that("get_dataset_info prunes projects where both modality lists are empty" ), total_sample_count = 1 ) + }, + # only SCPCP000001 should be queried; SCPCP000002 requests nothing + get_project_samples = \(project_id, simplify = TRUE) { + tibble::tibble( + scpca_sample_id = "SCPCS000001", + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = FALSE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell") + ) } ) From 8ce07981a15f028f2e8ce4dd593e3a5bea3aa5c3 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 17:59:11 -0400 Subject: [PATCH 15/19] update docs --- R/datasets.R | 14 ++++++++------ man/get_dataset_info.Rd | 29 +++++++++++++++++++---------- man/make_dataset_data_df.Rd | 22 ++++++++++++++++------ 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 8503c55..766520d 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -748,12 +748,14 @@ make_dataset_data_df <- function(data) { #' * `n_samples`: the total number of samples in the dataset, taken from the #' API's `total_sample_count` 
#' * `n_projects`: the number of projects in the dataset -#' * `samples`: a data frame with one row per included sample and columns -#' `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character; the -#' single-cell sequencing unit "cell" or "nucleus", or `NA` when the sample -#' is not included as single-cell), `has_spatial`, `has_bulk` (whether the -#' dataset request includes bulk for that sample), `has_cite_seq`, and -#' `has_multiplexed` (all logical) +#' * `samples`: a data frame with one row per included sample and the following columns: +#' - `scpca_sample_id` +#' - `scpca_project_id` +#' - `seq_unit` ("cell" or "nucleus", or `NA` if the sample is not included as single-cell) +#' - `has_spatial` +#' - `has_bulk` +#' - `has_cite_seq` +#' - `has_multiplexed` #' * `merged_projects`: a character vector of project IDs whose single-cell #' data is merged; `character(0)` when none #' diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd index c73a65a..d4d9753 100644 --- a/man/get_dataset_info.Rd +++ b/man/get_dataset_info.Rd @@ -21,25 +21,34 @@ a named list with the following elements: \item \code{format}: the dataset file format (e.g. 
"SINGLE_CELL_EXPERIMENT", "ANN_DATA") \item \code{status}: the processing status — one of "pending", "processing", "succeeded", "failed", or "expired" (see \code{\link[=get_dataset_status]{get_dataset_status()}}) -\item \code{n_samples}: the number of rows in \code{samples} (one per sample-modality -combination; merged-single-cell projects are not counted) +\item \code{n_samples}: the total number of samples in the dataset, taken from the +API's \code{total_sample_count} \item \code{n_projects}: the number of projects in the dataset -\item \code{samples}: a data frame with one row per sample-modality combination and -columns \code{scpca_sample_id}, \code{scpca_project_id}, \code{modality} (character: -"single-cell" or "spatial"), and \code{includes_bulk} (logical) +\item \code{samples}: a data frame with one row per included sample and the following columns: +\itemize{ +\item \code{scpca_sample_id} +\item \code{scpca_project_id} +\item \code{seq_unit} ("cell" or "nucleus", or \code{NA} if the sample is not included as single-cell) +\item \code{has_spatial} +\item \code{has_bulk} +\item \code{has_cite_seq} +\item \code{has_multiplexed} +} \item \code{merged_projects}: a character vector of project IDs whose single-cell data is merged; \code{character(0)} when none } } \description{ Fetches a custom dataset and returns a structured summary of its contents, -including its processing status and a per-sample table describing the modality for -each sample. +including its processing status and a per-sample table describing the modality +of each sample. } \details{ -Projects with merged single-cell data (where individual sample IDs are not -enumerated in the dataset record) are excluded from \code{samples} and listed in -\code{merged_projects} instead. 
+For each project, the included samples and their modality details are looked +up from the project's sample records (one request per project), so merged +projects (whose individual sample IDs are not enumerated in the dataset +record) are expanded to all of their single-cell samples. Projects whose +single-cell data is merged are also listed in \code{merged_projects}. } \examples{ \dontrun{ diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd index 583bba4..73d59d9 100644 --- a/man/make_dataset_data_df.Rd +++ b/man/make_dataset_data_df.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/datasets.R \name{make_dataset_data_df} \alias{make_dataset_data_df} -\title{Make a per-sample data frame from the \verb{$data} list} +\title{Build the per-sample data frame for a dataset} \usage{ make_dataset_data_df(data) } @@ -10,12 +10,22 @@ make_dataset_data_df(data) \item{data}{the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}}} } \value{ -a data frame with columns \code{scpca_sample_id}, \code{scpca_project_id}, -\code{modality}, and \code{includes_bulk} +a data frame with one row per included sample and columns +\code{scpca_sample_id}, \code{scpca_project_id}, \code{seq_unit} (character: "cell", +"nucleus", or \code{NA}), \code{has_spatial}, \code{has_bulk}, \code{has_cite_seq}, and +\code{has_multiplexed} (all logical) } \description{ -Transforms the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}} into a -one-row-per-sample data frame. Projects with merged single-cell data -(\code{SINGLE_CELL = "MERGED"}) are excluded. +For each project in the dataset \verb{$data} list, fetches the project's sample +metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples the dataset includes: +for a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL}, +and for a merged project, all of the project's single-cell samples. 
+Each modality is reported only when it is requested for the sample: +\code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the +sample is not included as single-cell), +\code{has_spatial} marks spatial inclusion +\code{has_bulk} reflects the project's \code{includes_bulk} request +intersected with whether the sample actually has bulk data. +\code{has_cite_seq} and \code{has_multiplexed} come from the sample records. } \keyword{internal} From 3c8a7b4c91e41c9c0803e6e588e7d67374bfd76e Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Tue, 9 Jun 2026 21:18:48 -0400 Subject: [PATCH 16/19] re-expand add and remove tests. --- tests/testthat/test-datasets.R | 116 ++++++++++++++++++++++++++++++--- 1 file changed, 107 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index af45212..8adbce9 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -1227,29 +1227,127 @@ test_that("remove_from_dataset_data drops whole projects", { expect_equal(names(result), "SCPCP000001") }) +test_that("remove_from_dataset_data errors when removing a sample from a merged project", { + existing <- list( + SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE) + ) + + expect_error( + remove_from_dataset_data(existing, samples = "SCPCS000001"), + "merged single-cell data" + ) +}) + +test_that("remove_from_dataset_data can drop a merged project wholesale", { + existing <- list( + SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE), + SCPCP000002 = list(SINGLE_CELL = list("SCPCS000003"), SPATIAL = list(), includes_bulk = FALSE) + ) + + result <- remove_from_dataset_data(existing, projects = "SCPCP000001") + expect_equal(names(result), "SCPCP000002") +}) + # add_dataset_samples / remove_dataset_samples tests -test_that("add_dataset_samples PUTs", { +test_that("add_dataset_samples PUTs the merged data", { + 
captured_req <- NULL local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { - list(id = DATASET_ID, data = list()) + list( + id = DATASET_ID, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001"), + SPATIAL = list(), + includes_bulk = FALSE + ) + ) + ) }, - build_dataset_data = \(...) list(), - req_perform = \(req, ...) json_response(list(id = DATASET_ID)) + # additions: one sample for the existing project, plus a brand-new project + build_dataset_data = \(samples = NULL, projects = NULL, include_bulk = FALSE) { + list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000002"), + SPATIAL = list(), + includes_bulk = include_bulk + ), + SCPCP000002 = list( + SINGLE_CELL = list("SCPCS000003"), + SPATIAL = list(), + includes_bulk = include_bulk + ) + ) + }, + req_perform = \(req, ...) { + captured_req <<- req + json_response(req$body$data) + } + ) + + result <- add_dataset_samples( + DATASET_ID, + auth_token = "token", + samples = c("SCPCS000002", "SCPCS000003"), + include_bulk = TRUE ) - result <- add_dataset_samples(DATASET_ID, auth_token = "token", samples = "SCPCS000002") + expect_equal(httr2::req_get_method(captured_req), "PUT") + expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) + + sent_data <- captured_req$body$data$data + expect_setequal(names(sent_data), c("SCPCP000001", "SCPCP000002")) + # existing project gains the new sample as a union of old and added IDs + expect_setequal( + as.character(sent_data$SCPCP000001$SINGLE_CELL), + c("SCPCS000001", "SCPCS000002") + ) + # include_bulk applies to the newly added project but not the existing one + expect_false(sent_data$SCPCP000001$includes_bulk) + expect_true(sent_data$SCPCP000002$includes_bulk) expect_equal(result, DATASET_ID) }) -test_that("remove_dataset_samples PUTs", { +test_that("remove_dataset_samples PUTs the reduced data", { + captured_req <- NULL local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { - list(id = DATASET_ID, data = list()) + 
list( + id = DATASET_ID, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), + SPATIAL = list(), + includes_bulk = FALSE + ), + SCPCP000002 = list( + SINGLE_CELL = list("SCPCS000003"), + SPATIAL = list(), + includes_bulk = FALSE + ) + ) + ) }, - req_perform = \(req, ...) json_response(list(id = DATASET_ID)) + req_perform = \(req, ...) { + captured_req <<- req + json_response(req$body$data) + } ) - result <- remove_dataset_samples(DATASET_ID, auth_token = "token", projects = "SCPCP000002") + result <- remove_dataset_samples( + DATASET_ID, + auth_token = "token", + samples = "SCPCS000002", + projects = "SCPCP000002" + ) + + expect_equal(httr2::req_get_method(captured_req), "PUT") + expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) + + # SCPCP000002 dropped wholesale; SCPCP000001 keeps only the un-removed sample + sent_data <- captured_req$body$data$data + expect_equal(names(sent_data), "SCPCP000001") + expect_equal(as.character(sent_data$SCPCP000001$SINGLE_CELL), "SCPCS000001") expect_equal(result, DATASET_ID) }) From 4ca9ef61f1a6d56bec183fc9d4845b7e6139133b Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Thu, 11 Jun 2026 10:11:07 -0400 Subject: [PATCH 17/19] Apply suggestions from code review Co-authored-by: Stephanie J. 
Spielman --- R/datasets.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 766520d..27c2088 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -622,8 +622,8 @@ remove_dataset_samples <- function( #' Build the per-sample data frame for a dataset #' #' For each project in the dataset `$data` list, fetches the project's sample -#' metadata with [get_project_samples()] and keeps only the samples the dataset includes: -#' for a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`, +#' metadata with [get_project_samples()] and keeps only the samples that the dataset includes: +#' For a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`, #' and for a merged project, all of the project's single-cell samples. #' Each modality is reported only when it is requested for the sample: #' `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the @@ -664,7 +664,7 @@ make_dataset_data_df <- function(data) { project_samples <- get_project_samples(project_id, simplify = FALSE) # Get single cell samples for the project: - # - if merged from the projeect_samples metadata + # - if merged from the project_samples metadata # - if not merged, from the request list. 
if (merged) { single_cell_ids <- project_samples$scpca_sample_id[ From 477c187b9d18ecbc32a2445df7a81428fbde245b Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Thu, 11 Jun 2026 10:24:44 -0400 Subject: [PATCH 18/19] change $samples to $sample_info in get_dataset_info update tests to match (and be more conservative on the slot name) --- R/datasets.R | 18 ++++++++-------- tests/testthat/test-datasets.R | 39 +++++++++++++++++++--------------- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 27c2088..69fe444 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -626,12 +626,12 @@ remove_dataset_samples <- function( #' For a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`, #' and for a merged project, all of the project's single-cell samples. #' Each modality is reported only when it is requested for the sample: -#' `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the +#' - `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the #' sample is not included as single-cell), -#' `has_spatial` marks spatial inclusion -#' `has_bulk` reflects the project's `includes_bulk` request -#' intersected with whether the sample actually has bulk data. -#' `has_cite_seq` and `has_multiplexed` come from the sample records. +#' - `has_spatial` marks spatial inclusion +#' - `has_bulk` reflects the project's `includes_bulk` request +#' intersected with whether the sample actually has bulk data. +#' - `has_cite_seq` and `has_multiplexed` come from the sample records. 
#' #' @param data the project-keyed `$data` list from [get_dataset_detail()] #' @@ -748,7 +748,7 @@ make_dataset_data_df <- function(data) { #' * `n_samples`: the total number of samples in the dataset, taken from the #' API's `total_sample_count` #' * `n_projects`: the number of projects in the dataset -#' * `samples`: a data frame with one row per included sample and the following columns: +#' * `sample_info`: a data frame with one row per included sample and the following columns: #' - `scpca_sample_id` #' - `scpca_project_id` #' - `seq_unit` ("cell" or "nucleus", or `NA` if the sample is not included as single-cell) @@ -767,13 +767,13 @@ make_dataset_data_df <- function(data) { #' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) #' info <- get_dataset_info(ds_id) #' info$status -#' info$samples +#' info$sample_info #' } get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) { auth_token <- resolve_auth_token(auth_token) detail <- get_dataset_detail(dataset, auth_token) - samples <- make_dataset_data_df(detail$data) + samples_df <- make_dataset_data_df(detail$data) merged_projects <- detail$data |> purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |> names() |> @@ -786,7 +786,7 @@ get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN" # total_sample_count comes from the API and counts all samples in the dataset. 
n_samples = detail$total_sample_count, n_projects = length(detail$data), - samples = samples, + sample_info = samples_df, merged_projects = merged_projects ) } diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index 8adbce9..e3c6ef4 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -606,6 +606,7 @@ test_that("get_dataset_info builds a per-sample table from project sample data", ) info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] expect_equal(info$id, DATASET_ID) expect_equal(info$format, "SINGLE_CELL_EXPERIMENT") @@ -614,9 +615,9 @@ test_that("get_dataset_info builds a per-sample table from project sample data", expect_equal(info$n_samples, 4) expect_equal(info$merged_projects, character(0)) expect_null(info$bulk_projects) - expect_s3_class(info$samples, "data.frame") + expect_s3_class(sample_info, "data.frame") expect_setequal( - colnames(info$samples), + colnames(sample_info), c( "scpca_sample_id", "scpca_project_id", @@ -628,10 +629,10 @@ test_that("get_dataset_info builds a per-sample table from project sample data", ) ) # one row per included sample; the unrequested SCPCS000099 is filtered out - expect_equal(nrow(info$samples), 4) - expect_false("SCPCS000099" %in% info$samples$scpca_sample_id) + expect_equal(nrow(sample_info), 4) + expect_false("SCPCS000099" %in% sample_info$scpca_sample_id) - field <- \(col, id) info$samples[[col]][info$samples$scpca_sample_id == id] + field <- \(col, id) sample_info[[col]][sample_info$scpca_sample_id == id] # seq_unit is the single-cell unit, or NA for a spatial-only sample expect_equal(field("seq_unit", "SCPCS000001"), "cell") expect_equal(field("seq_unit", "SCPCS000003"), "cell") @@ -648,7 +649,7 @@ test_that("get_dataset_info builds a per-sample table from project sample data", expect_true(field("has_bulk", "SCPCS000003")) # requested + available expect_false(field("has_bulk", "SCPCS000001")) # project did not 
request bulk expect_false(field("has_bulk", "SCPCS000004")) # requested but sample has none - expect_false(any(info$samples$has_multiplexed)) + expect_false(any(sample_info$has_multiplexed)) }) test_that("get_dataset_info combines modalities for a sample included as single-cell and spatial", { @@ -683,11 +684,12 @@ test_that("get_dataset_info combines modalities for a sample included as single- ) info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] # one row for the sample: single-cell unit plus spatial - expect_equal(nrow(info$samples), 1) - expect_equal(info$samples$seq_unit, "cell") - expect_true(info$samples$has_spatial) + expect_equal(nrow(sample_info), 1) + expect_equal(sample_info$seq_unit, "cell") + expect_true(sample_info$has_spatial) }) test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", { @@ -704,13 +706,14 @@ test_that("get_dataset_info returns empty samples data frame with correct schema ) info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] expect_equal(info$n_samples, 0) expect_equal(info$n_projects, 0) - expect_equal(nrow(info$samples), 0) + expect_equal(nrow(sample_info), 0) expect_null(info$bulk_projects) expect_setequal( - colnames(info$samples), + colnames(sample_info), c( "scpca_sample_id", "scpca_project_id", @@ -775,18 +778,19 @@ test_that("get_dataset_info expands merged projects to all their single-cell sam ) info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] # merged project still surfaced in merged_projects expect_equal(info$merged_projects, "SCPCP000005") # its single-cell samples are expanded into the table; SCPCS000053 is excluded expect_setequal( - info$samples$scpca_sample_id, + sample_info$scpca_sample_id, c("SCPCS000001", "SCPCS000050", "SCPCS000051", "SCPCS000052") ) - expect_false("SCPCS000053" %in% info$samples$scpca_sample_id) + 
expect_false("SCPCS000053" %in% sample_info$scpca_sample_id) # the nucleus seq_unit is reported for that sample expect_equal( - info$samples$seq_unit[info$samples$scpca_sample_id == "SCPCS000052"], + sample_info$seq_unit[sample_info$scpca_sample_id == "SCPCS000052"], "nucleus" ) expect_equal(info$n_projects, 2) @@ -853,10 +857,11 @@ test_that("get_dataset_info prunes projects where nothing is requested", { ) info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] - expect_equal(nrow(info$samples), 1) - expect_equal(info$samples$scpca_project_id, "SCPCP000001") - expect_false("SCPCP000002" %in% info$samples$scpca_project_id) + expect_equal(nrow(sample_info), 1) + expect_equal(sample_info$scpca_project_id, "SCPCP000001") + expect_false("SCPCP000002" %in% sample_info$scpca_project_id) }) test_that("get_dataset_info errors when auth_token is empty", { From 47048423f0ecf5f8f9e3224e3e8e0dfa7107de97 Mon Sep 17 00:00:00 2001 From: Joshua Shapiro Date: Thu, 11 Jun 2026 10:32:18 -0400 Subject: [PATCH 19/19] docs updates --- R/datasets.R | 13 +++++++------ man/get_dataset_info.Rd | 4 ++-- man/make_dataset_data_df.Rd | 21 ++++++++++++--------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 69fe444..e4a7c9b 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -625,13 +625,14 @@ remove_dataset_samples <- function( #' metadata with [get_project_samples()] and keeps only the samples that the dataset includes: #' For a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`, #' and for a merged project, all of the project's single-cell samples. 
-#' Each modality is reported only when it is requested for the sample: +#' Each modality flag is reported only as TRUE for samples that are both included in the dataset +#' and actually have that modality available: #' - `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the -#' sample is not included as single-cell), -#' - `has_spatial` marks spatial inclusion -#' - `has_bulk` reflects the project's `includes_bulk` request -#' intersected with whether the sample actually has bulk data. -#' - `has_cite_seq` and `has_multiplexed` come from the sample records. +#' sample is not included as single-cell) +#' - `has_spatial` marks spatial inclusion, if requested, for the sample or project +#' - `has_bulk` indicates that the sample is present in the bulk data table, if requested for a project. +#' - `has_cite_seq` and `has_multiplexed` come from the sample records +#' and do not depend on the specific request #' #' @param data the project-keyed `$data` list from [get_dataset_detail()] #' diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd index d4d9753..c032595 100644 --- a/man/get_dataset_info.Rd +++ b/man/get_dataset_info.Rd @@ -24,7 +24,7 @@ a named list with the following elements: \item \code{n_samples}: the total number of samples in the dataset, taken from the API's \code{total_sample_count} \item \code{n_projects}: the number of projects in the dataset -\item \code{samples}: a data frame with one row per included sample and the following columns: +\item \code{sample_info}: a data frame with one row per included sample and the following columns: \itemize{ \item \code{scpca_sample_id} \item \code{scpca_project_id} @@ -55,6 +55,6 @@ single-cell data is merged are also listed in \code{merged_projects}. 
ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) info <- get_dataset_info(ds_id) info$status -info$samples +info$sample_info } } diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd index 73d59d9..4c55760 100644 --- a/man/make_dataset_data_df.Rd +++ b/man/make_dataset_data_df.Rd @@ -17,15 +17,18 @@ a data frame with one row per included sample and columns } \description{ For each project in the dataset \verb{$data} list, fetches the project's sample -metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples the dataset includes: -for a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL}, +metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples that the dataset includes: +For a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL}, and for a merged project, all of the project's single-cell samples. -Each modality is reported only when it is requested for the sample: -\code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the -sample is not included as single-cell), -\code{has_spatial} marks spatial inclusion -\code{has_bulk} reflects the project's \code{includes_bulk} request -intersected with whether the sample actually has bulk data. -\code{has_cite_seq} and \code{has_multiplexed} come from the sample records. +Each modality flag is reported only as TRUE for samples that are both included in the dataset +and actually have that modality available: +\itemize{ +\item \code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the +sample is not included as single-cell) +\item \code{has_spatial} marks spatial inclusion, if requested, for the sample or project +\item \code{has_bulk} indicates that the sample is present in the bulk data table, if requested for a project. 
+\item \code{has_cite_seq} and \code{has_multiplexed} come from the sample records +and do not depend on the specific request +} } \keyword{internal}