diff --git a/.Rbuildignore b/.Rbuildignore index d3bd575..a90605e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,6 +2,7 @@ ^LICENSE\.md$ ^ScPCAr\.Rproj$ ^\.Rproj\.user$ +^\.claude$ ^\.github$ ^\.pre-commit-config\.yaml$ ^_pkgdown\.yml$ diff --git a/NAMESPACE b/NAMESPACE index 28593d4..66eb4ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(download_dataset) export(download_project) export(download_sample) export(get_auth) +export(get_dataset_info) export(get_dataset_status) export(get_project_info) export(get_project_libraries) diff --git a/R/datasets.R b/R/datasets.R index db66829..e4a7c9b 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -47,15 +47,16 @@ build_dataset_data <- function(samples = NULL, projects = NULL, include_bulk = F #' Resolve a dataset identifier to its ID string #' -#' Accepts either a dataset UUID string or a list with an `$id` element (such as -#' the return value of [create_dataset()] or [get_dataset_detail()]) and returns -#' the ID string, after checking that it is a valid UUID. +#' Accepts either a dataset UUID string (such as the value returned by +#' [create_dataset()]) or a list with an `$id` element (such as the value returned +#' by [get_dataset_detail()]) and returns the ID string, after checking that it is +#' a valid UUID. #' #' @param dataset a dataset UUID string, or a list with an `$id` element #' #' @keywords internal #' -#' @returns the dataset ID as a length-1 character string +#' @returns the dataset ID as a character string resolve_dataset_id <- function(dataset) { if (is.list(dataset)) { stopifnot("dataset must be an id string or contain an $id element" = !is.null(dataset$id)) @@ -112,7 +113,9 @@ update_dataset <- function(dataset_id, body, auth_token) { #' Create a custom dataset on the ScPCA Portal #' #' Creates a new user dataset without starting processing. -#' The returned list includes the dataset `$id` along with its current contents and status. +#' Returns the new dataset's ID (invisibly), which you can pass to the other +#' dataset functions such as [get_dataset_info()], [add_dataset_samples()], and +#' [start_dataset_processing()]. #' #' @param samples optional character vector of ScPCA sample IDs (e.g. "SCPCS000001") #' @param projects optional character vector of ScPCA project IDs (e.g. "SCPCP000001"); @@ -125,7 +128,7 @@ update_dataset <- function(dataset_id, body, auth_token) { #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the API response as a list (invisibly), including the dataset `$id` +#' @returns the dataset ID as a character string (invisibly) #' #' @import httr2 #' @export @@ -133,11 +136,11 @@ update_dataset <- function(dataset_id, body, auth_token) { #' @examples #' \dontrun{ #' token <- get_auth("user@example.com", agree = TRUE) -#' ds <- create_dataset( +#' ds_id <- create_dataset( #' auth_token = token, #' samples = c("SCPCS000001", "SCPCS000002") #' ) -#' ds$id +#' ds_id #' } create_dataset <- function( samples = NULL, @@ -179,7 +182,7 @@ create_dataset <- function( resp_body_json() message(glue::glue("ScPCA dataset {response$id} created.")) - invisible(response) + invisible(response$id) } @@ -193,8 +196,9 @@ create_dataset <- function( #' it is also used by the dataset modification functions to fetch current #' contents before updating. #' -#' @param dataset the dataset UUID string, or a list with an `$id` element -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param auth_token an authorization token obtained from [get_auth()]; #' must match the token used to create the dataset. #' @@ -218,6 +222,27 @@ get_dataset_detail <- function(dataset, auth_token) { resp_body_json() } +#' Map dataset detail status flags to a status string +#' +#' @param detail the dataset detail list returned by [get_dataset_detail()] +#' +#' @keywords internal +#' +#' @returns a single character string: one of "pending", "processing", +#' "succeeded", "failed", or "expired" +dataset_status_from_detail <- function(detail) { + if (isTRUE(detail$is_failed)) { + "failed" + } else if (isTRUE(detail$is_expired)) { + "expired" + } else if (isTRUE(detail$is_succeeded)) { + "succeeded" + } else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) { + "processing" + } else { + "pending" + } +} #' Get the processing status of a custom dataset #' @@ -233,8 +258,9 @@ get_dataset_detail <- function(dataset, auth_token) { #' expired and must be regenerated #' * `"failed"`: processing failed #' -#' @param dataset the dataset UUID string, or a list with an `$id` element, -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' @@ -246,22 +272,12 @@ get_dataset_detail <- function(dataset, auth_token) { #' #' @examples #' \dontrun{ -#' get_dataset_status(ds) +#' get_dataset_status(ds_id) #' } get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) { auth_token <- resolve_auth_token(auth_token) detail <- get_dataset_detail(dataset, auth_token) - if (isTRUE(detail$is_failed)) { - "failed" - } else if (isTRUE(detail$is_expired)) { - "expired" - } else if (isTRUE(detail$is_succeeded)) { - "succeeded" - } else if (isTRUE(detail$is_processing) || isTRUE(detail$is_started)) { - "processing" - } else { - "pending" - } + dataset_status_from_detail(detail) } @@ -283,13 +299,13 @@ get_dataset_status <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKE #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the updated dataset detail as a list (invisibly) +#' @returns the dataset ID as a character string (invisibly) #' #' @export #' #' @examples #' \dontrun{ -#' replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002")) +#' replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002")) #' } replace_dataset_data <- function( dataset, @@ -308,8 +324,8 @@ replace_dataset_data <- function( data <- build_dataset_data(samples = samples, projects = projects, include_bulk = include_bulk) - response <- update_dataset(dataset_id, list(data = data), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(data = data), auth_token = auth_token) + invisible(dataset_id) } @@ -326,13 +342,13 @@ replace_dataset_data <- function( #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the updated dataset detail as a list (invisibly) +#' @returns the dataset ID as a character string (invisibly) #' #' @export #' #' @examples #' \dontrun{ -#' set_dataset_email(ds, email = "user@example.com") +#' set_dataset_email(ds_id, email = "user@example.com") #' } set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) { auth_token <- resolve_auth_token(auth_token) @@ -343,8 +359,8 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT ) dataset_id <- resolve_dataset_id(dataset) - response <- update_dataset(dataset_id, list(email = email), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(email = email), auth_token = auth_token) + invisible(dataset_id) } @@ -362,24 +378,23 @@ set_dataset_email <- function(dataset, email, auth_token = Sys.getenv("SCPCA_AUT #' * A `"processing"` or `"succeeded"` dataset is already underway or done; #' a message is emitted and no request is sent. #' -#' @param dataset the dataset UUID string, or a list with an `$id` element, -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param email optional email address for the download notification. When #' supplied, it is set as part of the same request that starts processing. #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the updated dataset detail as a list (invisibly) when a request is -#' sent, or `NULL` (invisibly) when the dataset is already processing or -#' completed. +#' @returns the dataset ID as a character string (invisibly) #' #' @import httr2 #' @export #' #' @examples #' \dontrun{ -#' ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) -#' start_dataset_processing(ds, email = "user@example.com") +#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +#' start_dataset_processing(ds_id, email = "user@example.com") #' } start_dataset_processing <- function( dataset, @@ -401,11 +416,11 @@ start_dataset_processing <- function( # continue without message for "pending" or "expired" if (status == "processing") { message(glue::glue("ScPCA dataset {dataset_id} is already processing.")) - return(invisible(NULL)) + return(invisible(dataset_id)) } if (status == "succeeded") { message(glue::glue("ScPCA dataset {dataset_id} has already completed processing.")) - return(invisible(NULL)) + return(invisible(dataset_id)) } if (status == "failed") { warning( @@ -419,9 +434,9 @@ start_dataset_processing <- function( body$email <- email } - response <- update_dataset(dataset_id, body, auth_token = auth_token) + update_dataset(dataset_id, body, auth_token = auth_token) message(glue::glue("ScPCA dataset {dataset_id} processing started.")) - invisible(response) + invisible(dataset_id) } @@ -540,7 +555,7 @@ remove_from_dataset_data <- function(existing, samples = NULL, projects = NULL) #' @param auth_token an authorization token from [get_auth()]. Defaults to the #' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. #' -#' @returns the updated dataset detail as a list (invisibly) +#' @returns the dataset ID as a character string (invisibly) #' #' @rdname modify_dataset_samples #' @export @@ -576,8 +591,8 @@ add_dataset_samples <- function( ) new_data <- merge_dataset_data(current$data, additions, include_bulk = include_bulk) - response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) + invisible(dataset_id) } @@ -599,8 +614,182 @@ remove_dataset_samples <- function( current <- get_dataset_detail(dataset_id, auth_token = auth_token) new_data <- remove_from_dataset_data(current$data, samples = samples, projects = projects) - response <- update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) - invisible(response) + update_dataset(dataset_id, list(data = new_data), auth_token = auth_token) + invisible(dataset_id) +} + + +#' Build the per-sample data frame for a dataset +#' +#' For each project in the dataset `$data` list, fetches the project's sample +#' metadata with [get_project_samples()] and keeps only the samples that the dataset includes: +#' For a "regular" project the IDs listed under `SINGLE_CELL`/`SPATIAL`, +#' and for a merged project, all of the project's single-cell samples. +#' Each modality flag is reported only as TRUE for samples that are both included in the dataset +#' and actually have that modality available: +#' - `seq_unit` gives the single-cell sequencing unit ("cell" or "nucleus", or `NA` when the +#' sample is not included as single-cell) +#' - `has_spatial` marks spatial inclusion, if requested, for the sample or project +#' - `has_bulk` indicates that the sample is present in the bulk data table, if requested for a project. +#' - `has_cite_seq` and `has_multiplexed` come from the sample records +#' and do not depend on the specific request +#' +#' @param data the project-keyed `$data` list from [get_dataset_detail()] +#' +#' @keywords internal +#' @importFrom dplyr .data +#' +#' @returns a data frame with one row per included sample and columns +#' `scpca_sample_id`, `scpca_project_id`, `seq_unit` (character: "cell", +#' "nucleus", or `NA`), `has_spatial`, `has_bulk`, `has_cite_seq`, and +#' `has_multiplexed` (all logical) +make_dataset_data_df <- function(data) { + empty <- tibble::tibble( + scpca_sample_id = character(), + scpca_project_id = character(), + seq_unit = character(), + has_spatial = logical(), + has_bulk = logical(), + has_cite_seq = logical(), + has_multiplexed = logical() + ) + if (length(data) == 0) { + return(empty) + } + + result <- data |> + purrr::imap(\(project, project_id) { + merged <- identical(project$SINGLE_CELL, "MERGED") + + # The project's sample metadata has the modality details we will need. + project_samples <- get_project_samples(project_id, simplify = FALSE) + + # Get single cell samples for the project: + # - if merged from the project_samples metadata + # - if not merged, from the request list. + if (merged) { + single_cell_ids <- project_samples$scpca_sample_id[ + project_samples$has_single_cell_data + ] + } else { + single_cell_ids <- as.character(project$SINGLE_CELL) + } + + spatial_ids <- as.character(project$SPATIAL) + included_ids <- union(single_cell_ids, spatial_ids) + requested_bulk <- isTRUE(project$includes_bulk) + + project_samples |> + # keep only the samples the dataset requests for this project + dplyr::filter(.data$scpca_sample_id %in% included_ids) |> + dplyr::mutate( + scpca_project_id = project_id, + # the single-cell sequencing unit (cell or nucleus), or NA when + # single-cell is not requested for the sample + seq_unit = purrr::map2_chr( + .data$seq_units, + .data$scpca_sample_id, + \(units, sample_id) { + if (!sample_id %in% single_cell_ids) { + return(NA_character_) + } + # get only the nucleus or cell (not spot or bulk) + # if both are present (unlikely), combine with a comma + intersect(c("cell", "nucleus"), as.character(units)) |> + paste(collapse = ",") + } + ), + # only modalities requested for the sample are reported; has_bulk also + # requires the sample to actually have bulk data + has_spatial = .data$scpca_sample_id %in% spatial_ids, + has_bulk = requested_bulk & .data$has_bulk_rna_seq, + has_cite_seq = .data$has_cite_seq_data, + has_multiplexed = .data$has_multiplexed_data + ) |> + dplyr::select( + "scpca_sample_id", + "scpca_project_id", + "seq_unit", + "has_spatial", + "has_bulk", + "has_cite_seq", + "has_multiplexed" + ) + }) |> + purrr::list_rbind() |> + dplyr::arrange(.data$scpca_sample_id) + + if (nrow(result) == 0) empty else result +} + + +#' Get a summary of a custom ScPCA dataset +#' +#' Fetches a custom dataset and returns a structured summary of its contents, +#' including its processing status and a per-sample table describing the modality +#' of each sample. +#' +#' For each project, the included samples and their modality details are looked +#' up from the project's sample records (one request per project), so merged +#' projects (whose individual sample IDs are not enumerated in the dataset +#' record) are expanded to all of their single-cell samples. Projects whose +#' single-cell data is merged are also listed in `merged_projects`. +#' +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by this function). +#' @param auth_token an authorization token from [get_auth()]. Defaults to the +#' `SCPCA_AUTH_TOKEN` environment variable, which [get_auth()] sets automatically. +#' +#' @returns a named list with the following elements: +#' * `id`: the dataset UUID string +#' * `format`: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA") +#' * `status`: the processing status — one of "pending", "processing", +#' "succeeded", "failed", or "expired" (see [get_dataset_status()]) +#' * `n_samples`: the total number of samples in the dataset, taken from the +#' API's `total_sample_count` +#' * `n_projects`: the number of projects in the dataset +#' * `sample_info`: a data frame with one row per included sample and the following columns: +#' - `scpca_sample_id` +#' - `scpca_project_id` +#' - `seq_unit` ("cell" or "nucleus", or `NA` if the sample is not included as single-cell) +#' - `has_spatial` +#' - `has_bulk` +#' - `has_cite_seq` +#' - `has_multiplexed` +#' * `merged_projects`: a character vector of project IDs whose single-cell +#' data is merged; `character(0)` when none +#' +#' @import httr2 +#' @export +#' +#' @examples +#' \dontrun{ +#' ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +#' info <- get_dataset_info(ds_id) +#' info$status +#' info$sample_info +#' } +get_dataset_info <- function(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) { + auth_token <- resolve_auth_token(auth_token) + detail <- get_dataset_detail(dataset, auth_token) + + samples_df <- make_dataset_data_df(detail$data) + merged_projects <- detail$data |> + purrr::keep(\(p) identical(p$SINGLE_CELL, "MERGED")) |> + names() |> + as.character() + + list( + id = detail$id, + format = detail$format, + status = dataset_status_from_detail(detail), + # total_sample_count comes from the API and counts all samples in the dataset. + n_samples = detail$total_sample_count, + n_projects = length(detail$data), + sample_info = samples_df, + merged_projects = merged_projects + ) } diff --git a/R/downloads.R b/R/downloads.R index 1a865b6..e7171a7 100644 --- a/R/downloads.R +++ b/R/downloads.R @@ -433,8 +433,9 @@ parse_download_file <- function(scpca_url) { #' from the dataset's download filename (which includes the dataset ID, format, #' and date). #' -#' @param dataset the dataset UUID string, or a list with an `$id` element, -#' such as the return value of [create_dataset()]. +#' @param dataset the dataset UUID string (such as the value returned by +#' [create_dataset()]), or a list with an `$id` element (such as the value +#' returned by [get_dataset_detail()]). #' @param destination The path to the directory where the unzipped file directory #' should be saved. Default is "scpca_data". #' @param overwrite Whether to overwrite files in existing directories if they diff --git a/_pkgdown.yml b/_pkgdown.yml index 73870af..986e2a4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -32,6 +32,7 @@ reference: contents: - create_dataset - get_dataset_status + - get_dataset_info - download_dataset - add_dataset_samples - replace_dataset_data diff --git a/man/await_dataset_processing.Rd b/man/await_dataset_processing.Rd index 1109953..599a465 100644 --- a/man/await_dataset_processing.Rd +++ b/man/await_dataset_processing.Rd @@ -13,8 +13,9 @@ await_dataset_processing( ) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{poll_interval}{Number of minutes to wait between status checks when \code{await_processing = TRUE}. Default is 0.5 (30 seconds).} diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd index 606c502..55f2344 100644 --- a/man/create_dataset.Rd +++ b/man/create_dataset.Rd @@ -31,19 +31,21 @@ spatial samples are always returned in Space Ranger format.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the API response as a list (invisibly), including the dataset \verb{$id} +the dataset ID as a character string (invisibly) } \description{ Creates a new user dataset without starting processing. -The returned list includes the dataset \verb{$id} along with its current contents and status. +Returns the new dataset's ID (invisibly), which you can pass to the other +dataset functions such as \code{\link[=get_dataset_info]{get_dataset_info()}}, \code{\link[=add_dataset_samples]{add_dataset_samples()}}, and +\code{\link[=start_dataset_processing]{start_dataset_processing()}}. } \examples{ \dontrun{ token <- get_auth("user@example.com", agree = TRUE) -ds <- create_dataset( +ds_id <- create_dataset( auth_token = token, samples = c("SCPCS000001", "SCPCS000002") ) -ds$id +ds_id } } diff --git a/man/dataset_status_from_detail.Rd b/man/dataset_status_from_detail.Rd new file mode 100644 index 0000000..1c71ea0 --- /dev/null +++ b/man/dataset_status_from_detail.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{dataset_status_from_detail} +\alias{dataset_status_from_detail} +\title{Map dataset detail status flags to a status string} +\usage{ +dataset_status_from_detail(detail) +} +\arguments{ +\item{detail}{the dataset detail list returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}} +} +\value{ +a single character string: one of "pending", "processing", +"succeeded", "failed", or "expired" +} +\description{ +Map dataset detail status flags to a status string +} +\keyword{internal} diff --git a/man/download_dataset.Rd b/man/download_dataset.Rd index 39bb6cd..a996509 100644 --- a/man/download_dataset.Rd +++ b/man/download_dataset.Rd @@ -18,8 +18,9 @@ download_dataset( ) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{destination}{The path to the directory where the unzipped file directory should be saved. Default is "scpca_data".} diff --git a/man/get_dataset_detail.Rd b/man/get_dataset_detail.Rd index 2005b35..4f56d9d 100644 --- a/man/get_dataset_detail.Rd +++ b/man/get_dataset_detail.Rd @@ -7,8 +7,9 @@ get_dataset_detail(dataset, auth_token) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{auth_token}{an authorization token obtained from \code{\link[=get_auth]{get_auth()}}; must match the token used to create the dataset.} diff --git a/man/get_dataset_info.Rd b/man/get_dataset_info.Rd new file mode 100644 index 0000000..c032595 --- /dev/null +++ b/man/get_dataset_info.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{get_dataset_info} +\alias{get_dataset_info} +\title{Get a summary of a custom ScPCA dataset} +\usage{ +get_dataset_info(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) +} +\arguments{ +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by this function).} + +\item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the +\code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} +} +\value{ +a named list with the following elements: +\itemize{ +\item \code{id}: the dataset UUID string +\item \code{format}: the dataset file format (e.g. "SINGLE_CELL_EXPERIMENT", "ANN_DATA") +\item \code{status}: the processing status — one of "pending", "processing", +"succeeded", "failed", or "expired" (see \code{\link[=get_dataset_status]{get_dataset_status()}}) +\item \code{n_samples}: the total number of samples in the dataset, taken from the +API's \code{total_sample_count} +\item \code{n_projects}: the number of projects in the dataset +\item \code{sample_info}: a data frame with one row per included sample and the following columns: +\itemize{ +\item \code{scpca_sample_id} +\item \code{scpca_project_id} +\item \code{seq_unit} ("cell" or "nucleus", or \code{NA} if the sample is not included as single-cell) +\item \code{has_spatial} +\item \code{has_bulk} +\item \code{has_cite_seq} +\item \code{has_multiplexed} +} +\item \code{merged_projects}: a character vector of project IDs whose single-cell +data is merged; \code{character(0)} when none +} +} +\description{ +Fetches a custom dataset and returns a structured summary of its contents, +including its processing status and a per-sample table describing the modality +of each sample. +} +\details{ +For each project, the included samples and their modality details are looked +up from the project's sample records (one request per project), so merged +projects (whose individual sample IDs are not enumerated in the dataset +record) are expanded to all of their single-cell samples. Projects whose +single-cell data is merged are also listed in \code{merged_projects}. +} +\examples{ +\dontrun{ +ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +info <- get_dataset_info(ds_id) +info$status +info$sample_info +} +} diff --git a/man/get_dataset_status.Rd b/man/get_dataset_status.Rd index 4a56ef8..3f60353 100644 --- a/man/get_dataset_status.Rd +++ b/man/get_dataset_status.Rd @@ -7,8 +7,9 @@ get_dataset_status(dataset, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{auth_token}{an authorization token from \code{\link[=get_auth]{get_auth()}}. Defaults to the \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} @@ -19,21 +20,21 @@ a single character string: one of "pending", "processing", } \description{ Returns a single string describing where a dataset is in the processing -lifecycle, by fetching the dataset detail and translating its status fields -(\code{is_started}, \code{is_succeeded}, \code{is_failed}). A dataset that has been started -but has neither succeeded nor failed is reported as "processing". +lifecycle. } \details{ Possible values are: -\describe{ -\item{"pending"}{the dataset has not been started} -\item{"processing"}{the dataset has been started but is not yet finished} -\item{"succeeded"}{processing finished and the dataset is ready to download} -\item{"failed"}{processing failed} +\itemize{ +\item \code{"pending"}: the dataset has not been started +\item \code{"processing"}: the dataset has been started but is not yet finished +\item \code{"succeeded"}: processing finished and the dataset is ready to download +\item \code{"expired"}: processing completed but the generated download has since +expired and must be regenerated +\item \code{"failed"}: processing failed } } \examples{ \dontrun{ -get_dataset_status(ds) +get_dataset_status(ds_id) } } diff --git a/man/make_dataset_data_df.Rd b/man/make_dataset_data_df.Rd new file mode 100644 index 0000000..4c55760 --- /dev/null +++ b/man/make_dataset_data_df.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{make_dataset_data_df} +\alias{make_dataset_data_df} +\title{Build the per-sample data frame for a dataset} +\usage{ +make_dataset_data_df(data) +} +\arguments{ +\item{data}{the project-keyed \verb{$data} list from \code{\link[=get_dataset_detail]{get_dataset_detail()}}} +} +\value{ +a data frame with one row per included sample and columns +\code{scpca_sample_id}, \code{scpca_project_id}, \code{seq_unit} (character: "cell", +"nucleus", or \code{NA}), \code{has_spatial}, \code{has_bulk}, \code{has_cite_seq}, and +\code{has_multiplexed} (all logical) +} +\description{ +For each project in the dataset \verb{$data} list, fetches the project's sample +metadata with \code{\link[=get_project_samples]{get_project_samples()}} and keeps only the samples that the dataset includes: +For a "regular" project the IDs listed under \code{SINGLE_CELL}/\code{SPATIAL}, +and for a merged project, all of the project's single-cell samples. +Each modality flag is reported only as TRUE for samples that are both included in the dataset +and actually have that modality available: +\itemize{ +\item \code{seq_unit} gives the single-cell sequencing unit ("cell" or "nucleus", or \code{NA} when the +sample is not included as single-cell) +\item \code{has_spatial} marks spatial inclusion, if requested, for the sample or project +\item \code{has_bulk} indicates that the sample is present in the bulk data table, if requested for a project. +\item \code{has_cite_seq} and \code{has_multiplexed} come from the sample records +and do not depend on the specific request +} +} +\keyword{internal} diff --git a/man/modify_dataset_samples.Rd b/man/modify_dataset_samples.Rd index 61604ec..6cfc56d 100644 --- a/man/modify_dataset_samples.Rd +++ b/man/modify_dataset_samples.Rd @@ -36,7 +36,7 @@ projects keep their current value. Default is FALSE.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ \code{add_dataset_samples()} adds the given samples and/or all samples from the diff --git a/man/replace_dataset_data.Rd b/man/replace_dataset_data.Rd index 15afdf5..4b88c6e 100644 --- a/man/replace_dataset_data.Rd +++ b/man/replace_dataset_data.Rd @@ -26,7 +26,7 @@ all samples from each project are included.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ Replaces the samples and/or projects in an existing dataset with a new @@ -40,6 +40,6 @@ A dataset that has already started processing cannot be updated. } \examples{ \dontrun{ -replace_dataset_data(ds, samples = c("SCPCS000001", "SCPCS000002")) +replace_dataset_data(ds_id, samples = c("SCPCS000001", "SCPCS000002")) } } diff --git a/man/resolve_dataset_id.Rd b/man/resolve_dataset_id.Rd index ee7e54c..2588243 100644 --- a/man/resolve_dataset_id.Rd +++ b/man/resolve_dataset_id.Rd @@ -10,11 +10,12 @@ resolve_dataset_id(dataset) \item{dataset}{a dataset UUID string, or a list with an \verb{$id} element} } \value{ -the dataset ID as a length-1 character string +the dataset ID as a character string } \description{ -Accepts either a dataset UUID string or a list with an \verb{$id} element (such as -the return value of \code{\link[=create_dataset]{create_dataset()}} or \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns -the ID string, after checking that it is a valid UUID. +Accepts either a dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}) or a list with an \verb{$id} element (such as the value returned +by \code{\link[=get_dataset_detail]{get_dataset_detail()}}) and returns the ID string, after checking that it is +a valid UUID. } \keyword{internal} diff --git a/man/set_dataset_email.Rd b/man/set_dataset_email.Rd index cc28f59..a245d01 100644 --- a/man/set_dataset_email.Rd +++ b/man/set_dataset_email.Rd @@ -15,7 +15,7 @@ set_dataset_email(dataset, email, auth_token = Sys.getenv("SCPCA_AUTH_TOKEN")) \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ Updates the email address the ScPCA Portal will use to notify you when the @@ -27,6 +27,6 @@ A dataset that has already been started cannot be modified. } \examples{ \dontrun{ -set_dataset_email(ds, email = "user@example.com") +set_dataset_email(ds_id, email = "user@example.com") } } diff --git a/man/start_dataset_processing.Rd b/man/start_dataset_processing.Rd index b61909e..fd7a61b 100644 --- a/man/start_dataset_processing.Rd +++ b/man/start_dataset_processing.Rd @@ -11,8 +11,9 @@ start_dataset_processing( ) } \arguments{ -\item{dataset}{the dataset UUID string, or a list with an \verb{$id} element, -such as the return value of \code{\link[=create_dataset]{create_dataset()}}.} +\item{dataset}{the dataset UUID string (such as the value returned by +\code{\link[=create_dataset]{create_dataset()}}), or a list with an \verb{$id} element (such as the value +returned by \code{\link[=get_dataset_detail]{get_dataset_detail()}}).} \item{email}{optional email address for the download notification. When supplied, it is set as part of the same request that starts processing.} @@ -21,7 +22,7 @@ supplied, it is set as part of the same request that starts processing.} \code{SCPCA_AUTH_TOKEN} environment variable, which \code{\link[=get_auth]{get_auth()}} sets automatically.} } \value{ -the updated dataset detail as a list (invisibly) +the dataset ID as a character string (invisibly) } \description{ Starts processing of an existing custom dataset so that its files can be @@ -29,12 +30,18 @@ built for download, by sending a PUT request that sets \code{start = TRUE}. Optionally sets the notification email as part of the same request. } \details{ -Once processing has started a dataset is locked and can no longer be -modified; attempting to modify or re-start it will raise an error. +Before sending the request the current dataset status is checked via +\code{\link[=get_dataset_status]{get_dataset_status()}}: +\itemize{ +\item A \code{"pending"} or \code{"expired"} dataset is started normally. +\item A \code{"failed"} dataset is retried with a warning. +\item A \code{"processing"} or \code{"succeeded"} dataset is already underway or done; +a message is emitted and no request is sent. +} } \examples{ \dontrun{ -ds <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) -start_dataset_processing(ds, email = "user@example.com") +ds_id <- create_dataset(samples = c("SCPCS000001", "SCPCS000002")) +start_dataset_processing(ds_id, email = "user@example.com") } } diff --git a/tests/testthat/test-datasets.R b/tests/testthat/test-datasets.R index f0d9857..e3c6ef4 100644 --- a/tests/testthat/test-datasets.R +++ b/tests/testthat/test-datasets.R @@ -267,6 +267,7 @@ test_that("create_dataset errors when spatial format is requested", { }) test_that("create_dataset POSTs with start = FALSE", { + captured_req <- NULL local_mocked_bindings( build_dataset_data = \(...) { list( @@ -278,8 +279,8 @@ test_that("create_dataset POSTs with start = FALSE", { ) }, req_perform = \(req, ...) { - body <- req$body$data - json_response(c(body, list(id = "new-dataset-uuid"))) + captured_req <<- req + json_response(list(id = "new-dataset-uuid")) } ) @@ -290,10 +291,12 @@ test_that("create_dataset POSTs with start = FALSE", { }, "new-dataset-uuid" ) - expect_false(result$start) + expect_equal(httr2::req_get_method(captured_req), "POST") + expect_false(captured_req$body$data$start) + expect_equal(result, "new-dataset-uuid") }) -test_that("create_dataset returns response invisibly and messages with dataset id", { +test_that("create_dataset returns the dataset id invisibly and messages with dataset id", { local_mocked_bindings( build_dataset_data = \(...) { list( @@ -319,24 +322,24 @@ test_that("create_dataset returns response invisibly and messages with dataset i }, "new-dataset-uuid" ) - expect_equal(result$id, "new-dataset-uuid") + expect_equal(result, "new-dataset-uuid") }) test_that("create_dataset reads auth_token from the SCPCA_AUTH_TOKEN environment variable", { withr::local_envvar(SCPCA_AUTH_TOKEN = "env-token") + captured_req <- NULL local_mocked_bindings( build_dataset_data = \(...) list(), req_perform = \(req, ...) { - json_response(list( - id = "new-dataset-uuid", - api_key = httr2::req_get_headers(req, "reveal")$`api-key` - )) + captured_req <<- req + json_response(list(id = "new-dataset-uuid")) } ) # called without auth_token; the token should come from the environment result <- suppressMessages(create_dataset(samples = "SCPCS000001", format = "sce")) - expect_equal(result$api_key, "env-token") + expect_equal(httr2::req_get_headers(captured_req, "reveal")$`api-key`, "env-token") + expect_equal(result, "new-dataset-uuid") }) # get_dataset_detail tests @@ -549,6 +552,326 @@ test_that("get_dataset_status errors when auth_token is empty", { }) +# get_dataset_info tests + +test_that("get_dataset_info builds a per-sample table from project sample data", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + is_succeeded = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), + SPATIAL = list(), + includes_bulk = FALSE + ), + SCPCP000002 = list( + SINGLE_CELL = list("SCPCS000003"), + SPATIAL = list("SCPCS000004"), + includes_bulk = TRUE + ) + ), + total_sample_count = 4 + ) + }, + get_project_samples = \(project_id, simplify = TRUE) { + if (project_id == "SCPCP000001") { + # SCPCS000099 belongs to the project but is not in the dataset request + tibble::tibble( + scpca_sample_id = c("SCPCS000001", "SCPCS000002", "SCPCS000099"), + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = FALSE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell", "cell", "cell") + ) + } else { + tibble::tibble( + scpca_sample_id = c("SCPCS000003", "SCPCS000004"), + scpca_project_id = project_id, + has_single_cell_data = c(TRUE, FALSE), + has_spatial_data = c(FALSE, TRUE), + has_bulk_rna_seq = c(TRUE, FALSE), + has_cite_seq_data = c(TRUE, FALSE), + has_multiplexed_data = c(FALSE, FALSE), + seq_units = list(c("cell", "bulk"), "spot") + ) + } + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] + + expect_equal(info$id, DATASET_ID) + expect_equal(info$format, "SINGLE_CELL_EXPERIMENT") + expect_equal(info$status, "pending") + expect_equal(info$n_projects, 2) + expect_equal(info$n_samples, 4) + expect_equal(info$merged_projects, character(0)) + expect_null(info$bulk_projects) + expect_s3_class(sample_info, "data.frame") + expect_setequal( + colnames(sample_info), + c( + "scpca_sample_id", + "scpca_project_id", + "seq_unit", + "has_spatial", + "has_bulk", + "has_cite_seq", + "has_multiplexed" + ) + ) + # one row per included sample; the unrequested SCPCS000099 is filtered out + expect_equal(nrow(sample_info), 4) + expect_false("SCPCS000099" %in% sample_info$scpca_sample_id) + + field <- \(col, id) sample_info[[col]][sample_info$scpca_sample_id == id] + # seq_unit is the single-cell unit, or NA for a spatial-only sample + expect_equal(field("seq_unit", "SCPCS000001"), "cell") + expect_equal(field("seq_unit", "SCPCS000003"), "cell") + expect_true(is.na(field("seq_unit", "SCPCS000004"))) + + # only requested modalities are reported + expect_true(field("has_spatial", "SCPCS000004")) + expect_false(field("has_spatial", "SCPCS000001")) + + expect_true(field("has_cite_seq", "SCPCS000003")) + expect_false(field("has_cite_seq", "SCPCS000001")) + + # has_bulk reflects the request AND availability + expect_true(field("has_bulk", "SCPCS000003")) # requested + available + expect_false(field("has_bulk", "SCPCS000001")) # project did not request bulk + expect_false(field("has_bulk", "SCPCS000004")) # requested but sample has none + expect_false(any(sample_info$has_multiplexed)) +}) + +test_that("get_dataset_info combines modalities for a sample included as single-cell and spatial", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001"), + SPATIAL = list("SCPCS000001"), + includes_bulk = FALSE + ) + ), + total_sample_count = 1 + ) + }, + get_project_samples = \(project_id, simplify = TRUE) { + tibble::tibble( + scpca_sample_id = "SCPCS000001", + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = TRUE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list(c("cell", "spot")) + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] + + # one row for the sample: single-cell unit plus spatial + expect_equal(nrow(sample_info), 1) + expect_equal(sample_info$seq_unit, "cell") + expect_true(sample_info$has_spatial) +}) + +test_that("get_dataset_info returns empty samples data frame with correct schema for empty dataset", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "ANN_DATA", + is_started = FALSE, + data = list(), + total_sample_count = 0 + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] + + expect_equal(info$n_samples, 0) + expect_equal(info$n_projects, 0) + expect_equal(nrow(sample_info), 0) + expect_null(info$bulk_projects) + expect_setequal( + colnames(sample_info), + c( + "scpca_sample_id", + "scpca_project_id", + "seq_unit", + "has_spatial", + "has_bulk", + "has_cite_seq", + "has_multiplexed" + ) + ) +}) + +test_that("get_dataset_info expands merged projects to all their single-cell samples", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001"), + SPATIAL = list(), + includes_bulk = FALSE + ), + SCPCP000005 = list( + SINGLE_CELL = "MERGED", + SPATIAL = list(), + includes_bulk = FALSE + ) + ), + total_sample_count = 4 + ) + }, + get_project_samples = \(project_id, simplify = TRUE) { + if (project_id == "SCPCP000001") { + tibble::tibble( + scpca_sample_id = "SCPCS000001", + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = FALSE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell") + ) + } else { + # merged project: all single-cell samples are included; the + # non-single-cell SCPCS000053 is not + tibble::tibble( + scpca_sample_id = c("SCPCS000050", "SCPCS000051", "SCPCS000052", "SCPCS000053"), + scpca_project_id = project_id, + has_single_cell_data = c(TRUE, TRUE, TRUE, FALSE), + has_spatial_data = c(FALSE, FALSE, FALSE, TRUE), + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell", "cell", "nucleus", "spot") + ) + } + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] + + # merged project still surfaced in merged_projects + expect_equal(info$merged_projects, "SCPCP000005") + # its single-cell samples are expanded into the table; SCPCS000053 is excluded + expect_setequal( + sample_info$scpca_sample_id, + c("SCPCS000001", "SCPCS000050", "SCPCS000051", "SCPCS000052") + ) + expect_false("SCPCS000053" %in% sample_info$scpca_sample_id) + # the nucleus seq_unit is reported for that sample + expect_equal( + sample_info$seq_unit[sample_info$scpca_sample_id == "SCPCS000052"], + "nucleus" + ) + expect_equal(info$n_projects, 2) + expect_equal(info$n_samples, 4) +}) + +test_that("get_dataset_info derives status from detail without a second API call", { + call_count <- 0 + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + call_count <<- call_count + 1 + list( + id = DATASET_ID, + format = "ANN_DATA", + is_started = TRUE, + is_succeeded = TRUE, + data = list(), + total_sample_count = 0 + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + + expect_equal(call_count, 1) + expect_equal(info$status, "succeeded") +}) + +test_that("get_dataset_info prunes projects where nothing is requested", { + local_mocked_bindings( + get_dataset_detail = \(dataset, auth_token) { + list( + id = DATASET_ID, + format = "SINGLE_CELL_EXPERIMENT", + is_started = FALSE, + data = list( + SCPCP000001 = list( + SINGLE_CELL = list("SCPCS000001"), + SPATIAL = list(), + includes_bulk = FALSE + ), + SCPCP000002 = list( + SINGLE_CELL = list(), + SPATIAL = list(), + includes_bulk = FALSE + ) + ), + total_sample_count = 1 + ) + }, + # only SCPCP000001 should be queried; SCPCP000002 requests nothing + get_project_samples = \(project_id, simplify = TRUE) { + tibble::tibble( + scpca_sample_id = "SCPCS000001", + scpca_project_id = project_id, + has_single_cell_data = TRUE, + has_spatial_data = FALSE, + has_bulk_rna_seq = FALSE, + has_cite_seq_data = FALSE, + has_multiplexed_data = FALSE, + seq_units = list("cell") + ) + } + ) + + info <- get_dataset_info(DATASET_ID, auth_token = "token") + sample_info <- info[["sample_info"]] + + expect_equal(nrow(sample_info), 1) + expect_equal(sample_info$scpca_project_id, "SCPCP000001") + expect_false("SCPCP000002" %in% sample_info$scpca_project_id) +}) + +test_that("get_dataset_info errors when auth_token is empty", { + expect_error( + get_dataset_info(DATASET_ID, auth_token = ""), + "Authorization token must be provided" + ) +}) + + test_that("get_ccdl_dataset_detail returns dataset fields including download_url", { with_mock_dir("ccdl_dataset_detail", { result <- get_ccdl_dataset_detail("abc123", auth_token = "test-token") @@ -619,10 +942,11 @@ test_that("replace_dataset_data PUTs a rebuilt data field without a format", { samples = "SCPCS000001" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_null(result$format) - expect_true("SCPCP000001" %in% names(result$data)) + expect_null(captured_req$body$data$format) + expect_true("SCPCP000001" %in% names(captured_req$body$data$data)) + expect_equal(result, DATASET_ID) }) # set_dataset_email tests @@ -641,9 +965,10 @@ test_that("set_dataset_email PUTs a new email", { auth_token = "token", email = "user@example.com" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_equal(result$email, "user@example.com") + expect_equal(captured_req$body$data$email, "user@example.com") + expect_equal(result, DATASET_ID) }) test_that("set_dataset_email errors when email is not a single string", { @@ -694,10 +1019,11 @@ test_that("start_dataset_processing PUTs start = TRUE for a pending dataset", { }, "processing started" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) - expect_true(result$start) - expect_null(result$email) + expect_true(captured_req$body$data$start) + expect_null(captured_req$body$data$email) + expect_equal(result, DATASET_ID) }) test_that("start_dataset_processing includes email in the same request when provided", { @@ -717,9 +1043,10 @@ test_that("start_dataset_processing includes email in the same request when prov auth_token = "token" ) ) - expect_equal(captured_req$method, "PUT") - expect_true(result$start) - expect_equal(result$email, "user@example.com") + expect_equal(httr2::req_get_method(captured_req), "PUT") + expect_true(captured_req$body$data$start) + expect_equal(captured_req$body$data$email, "user@example.com") + expect_equal(result, DATASET_ID) }) test_that("start_dataset_processing errors when email is not a single string", { @@ -754,7 +1081,7 @@ test_that("start_dataset_processing emits a message and sends no request when al result <- start_dataset_processing(DATASET_ID, auth_token = "token"), "is already processing" ) - expect_null(result) + expect_equal(result, DATASET_ID) expect_false(put_called) }) @@ -772,7 +1099,7 @@ test_that("start_dataset_processing emits a message and sends no request when al result <- start_dataset_processing(DATASET_ID, auth_token = "token"), "has already completed processing" ) - expect_null(result) + expect_equal(result, DATASET_ID) expect_false(put_called) }) @@ -792,7 +1119,7 @@ test_that("start_dataset_processing warns and retries when previously failed", { ), "previously failed to process" ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_true(captured_req$body$data$start) }) @@ -809,7 +1136,7 @@ test_that("start_dataset_processing restarts an expired dataset", { suppressMessages( start_dataset_processing(DATASET_ID, auth_token = "token") ) - expect_equal(captured_req$method, "PUT") + expect_equal(httr2::req_get_method(captured_req), "PUT") expect_true(captured_req$body$data$start) }) @@ -905,9 +1232,30 @@ test_that("remove_from_dataset_data drops whole projects", { expect_equal(names(result), "SCPCP000001") }) +test_that("remove_from_dataset_data errors when removing a sample from a merged project", { + existing <- list( + SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE) + ) + + expect_error( + remove_from_dataset_data(existing, samples = "SCPCS000001"), + "merged single-cell data" + ) +}) + +test_that("remove_from_dataset_data can drop a merged project wholesale", { + existing <- list( + SCPCP000001 = list(SINGLE_CELL = "MERGED", SPATIAL = list(), includes_bulk = FALSE), + SCPCP000002 = list(SINGLE_CELL = list("SCPCS000003"), SPATIAL = list(), includes_bulk = FALSE) + ) + + result <- remove_from_dataset_data(existing, projects = "SCPCP000001") + expect_equal(names(result), "SCPCP000002") +}) + # add_dataset_samples / remove_dataset_samples tests -test_that("add_dataset_samples merges new samples into existing data and PUTs", { +test_that("add_dataset_samples PUTs the merged data", { captured_req <- NULL local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { @@ -922,12 +1270,18 @@ test_that("add_dataset_samples merges new samples into existing data and PUTs", ) ) }, + # additions: one sample for the existing project, plus a brand-new project build_dataset_data = \(samples = NULL, projects = NULL, include_bulk = FALSE) { list( SCPCP000001 = list( SINGLE_CELL = list("SCPCS000002"), SPATIAL = list(), includes_bulk = include_bulk + ), + SCPCP000002 = list( + SINGLE_CELL = list("SCPCS000003"), + SPATIAL = list(), + includes_bulk = include_bulk ) ) }, @@ -940,16 +1294,27 @@ test_that("add_dataset_samples merges new samples into existing data and PUTs", result <- add_dataset_samples( DATASET_ID, auth_token = "token", - samples = "SCPCS000002" + samples = c("SCPCS000002", "SCPCS000003"), + include_bulk = TRUE ) - expect_equal(captured_req$method, "PUT") + + expect_equal(httr2::req_get_method(captured_req), "PUT") + expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) + + sent_data <- captured_req$body$data$data + expect_setequal(names(sent_data), c("SCPCP000001", "SCPCP000002")) + # existing project gains the new sample as a union of old and added IDs expect_setequal( - as.character(result$data$SCPCP000001$SINGLE_CELL), + as.character(sent_data$SCPCP000001$SINGLE_CELL), c("SCPCS000001", "SCPCS000002") ) + # include_bulk applies to the newly added project but not the existing one + expect_false(sent_data$SCPCP000001$includes_bulk) + expect_true(sent_data$SCPCP000002$includes_bulk) + expect_equal(result, DATASET_ID) }) -test_that("remove_dataset_samples removes a project and PUTs", { +test_that("remove_dataset_samples PUTs the reduced data", { captured_req <- NULL local_mocked_bindings( get_dataset_detail = \(dataset, auth_token) { @@ -957,7 +1322,7 @@ test_that("remove_dataset_samples removes a project and PUTs", { id = DATASET_ID, data = list( SCPCP000001 = list( - SINGLE_CELL = list("SCPCS000001"), + SINGLE_CELL = list("SCPCS000001", "SCPCS000002"), SPATIAL = list(), includes_bulk = FALSE ), @@ -978,8 +1343,16 @@ test_that("remove_dataset_samples removes a project and PUTs", { result <- remove_dataset_samples( DATASET_ID, auth_token = "token", + samples = "SCPCS000002", projects = "SCPCP000002" ) - expect_equal(captured_req$method, "PUT") - expect_equal(names(result$data), "SCPCP000001") + + expect_equal(httr2::req_get_method(captured_req), "PUT") + expect_match(captured_req$url, paste0("datasets/", DATASET_ID)) + + # SCPCP000002 dropped wholesale; SCPCP000001 keeps only the un-removed sample + sent_data <- captured_req$body$data$data + expect_equal(names(sent_data), "SCPCP000001") + expect_equal(as.character(sent_data$SCPCP000001$SINGLE_CELL), "SCPCS000001") + expect_equal(result, DATASET_ID) }) diff --git a/tests/testthat/test-projects.R b/tests/testthat/test-projects.R index a4725d6..a1bb578 100644 --- a/tests/testthat/test-projects.R +++ b/tests/testthat/test-projects.R @@ -21,29 +21,29 @@ test_that("scpca_projects returns simplified data frame by default", { expect_s3_class(projects_df$created_at, "POSIXct") expect_s3_class(projects_df$updated_at, "POSIXct") }) +}) - test_that("scpca_projects returns full data frame when simplify = FALSE", { - with_mock_dir("scpca_projects", { - projects_df <- scpca_projects(simplify = FALSE) +test_that("scpca_projects returns full data frame when simplify = FALSE", { + with_mock_dir("scpca_projects", { + projects_df <- scpca_projects(simplify = FALSE) - # Check that it returns a data frame - expect_s3_class(projects_df, "data.frame") + # Check that it returns a data frame + expect_s3_class(projects_df, "data.frame") - # Check that we have rows and columns - expect_gt(nrow(projects_df), 0) - expect_gt(ncol(projects_df), 0) + # Check that we have rows and columns + expect_gt(nrow(projects_df), 0) + expect_gt(ncol(projects_df), 0) - # Check that list columns are present (not simplified) - list_columns <- sapply(projects_df, is.list) - expect_true(any(list_columns)) + # Check that list columns are present (not simplified) + list_columns <- sapply(projects_df, is.list) + expect_true(any(list_columns)) - # Check for expected key columns - expect_contains(colnames(projects_df), "scpca_project_id") + # Check for expected key columns + expect_contains(colnames(projects_df), "scpca_project_id") - # Check that date columns are properly converted - expect_s3_class(projects_df$created_at, "POSIXct") - expect_s3_class(projects_df$updated_at, "POSIXct") - }) + # Check that date columns are properly converted + expect_s3_class(projects_df$created_at, "POSIXct") + expect_s3_class(projects_df$updated_at, "POSIXct") }) })