diff --git a/.Rbuildignore b/.Rbuildignore index c2c230f..655c812 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -4,4 +4,10 @@ ^README\.Rmd$ ^\.github$ ^data-raw$ +^dcs04_data$ +^dcs04_data/.*$ ^codecov\.yml$ +^\.Rhistory$ +^\.Rdata$ +^\.httr-oauth$ +^\.DS_Store$ diff --git a/.gitignore b/.gitignore index 8b78a3a..1fb1ed6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ .Rdata .httr-oauth .DS_Store +dcs04_data inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 1f9f199..9d4781e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: qsvaR Title: Generate Quality Surrogate Variable Analysis for Degradation Correction -Version: 1.9.0 -Date: 2024-05-03 +Version: 1.9.1 +Date: 2024-09-16 Authors@R: c( person("Joshua", "Stolz", email = "jstolz80@gmail.com", @@ -25,7 +25,7 @@ biocViews: Software, WorkflowStep, Normalization, BiologicalQuestion, DifferentialExpression, Sequencing, Coverage Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.0 +RoxygenNote: 7.3.2 Suggests: BiocFileCache, BiocStyle, diff --git a/NAMESPACE b/NAMESPACE index f492de8..efd1f82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,14 @@ # Generated by roxygen2: do not edit by hand export(DEqual) -export(check_tx_names) export(getDegTx) export(getPCs) export(get_qsvs) export(k_qsvs) +export(normalize_tx_names) export(qSVA) export(select_transcripts) +export(which_tx_names) import(SummarizedExperiment) import(ggplot2) import(rlang) diff --git a/R/DEqual.R b/R/DEqual.R index dd535d2..5027cc3 100644 --- a/R/DEqual.R +++ b/R/DEqual.R @@ -8,11 +8,22 @@ #' . This function compares your #' t-statistics of interest computed on transcripts against the #' t-statistics from degradation time adjusting for the six brain regions from -#' degradation experiment data used for determining `covComb_tx_deg`. +#' degradation experiment data used for determining `rse_tx`. #' -#' @param DE a `data.frame()` with one column containing the t-statistics from -#' Differential Expression, typically generated with `limma::topTable()`. -#' The `rownames(DE)` should be transcript GENCODE IDs. +#' @param DE a `data.frame()` with a column "t" containing the t-statistics +#' from Differential Expression, typically generated with `limma::topTable()`. +#' `rownames(DE)` must have transcript Ensembl/Gencode IDs. +#' @param deg_tstats an optional `data.frame()` with a column "t" containing +#' t-statistics resulted from a degradation experiment. Default is the +#' internal `qsvaR::degradation_tstats` from the package authors. +#' @param show.legend logical (default TRUE) to show legend in the plot +#' @param show.cor specify where to show the correlation value. Can be one of +#' "caption", "corner-top", "corner-bottom", or "none". +#' @param font.size numeric value to set the base font size of the plot +#' @param cor.size numeric (default font.size/2) to set the font size for the +#' correlation text +#' @param cor.label character (default "cor: ") to set the text preceding the +#' correlation value #' #' @return a `ggplot` object of the DE t-statistic vs #' the DE statistic from degradation @@ -36,47 +47,71 @@ #' #' ## Create the DEqual plot #' DEqual(random_de) -DEqual <- function(DE) { +DEqual <- function(DE, deg_tstats = qsvaR::degradation_tstats, show.legend = TRUE, + show.cor = c('caption','corner-top','corner-bottom','none'), + font.size = 12, cor.size = font.size/2, cor.label = 'cor: ') { ## For R CMD check DE_t <- degradation_t <- NULL - + show.cor=rlang::arg_match(show.cor) ## Check input - # stopifnot("t" %in% colnames(DE)) - # stopifnot(!is.null(rownames(DE))) - - # Check if input is a dataframe - if (!is.data.frame(DE)) { - stop("The input to DEqual is not a dataframe.", call. = FALSE) + if (!is.data.frame(DE)) { + stop("The input to DEqual is not a dataframe.", call. = FALSE) } - # Check if 't' is in the column names of DE if (!("t" %in% colnames(DE))) { stop("'t' is not a column in 'DE'.", call. = FALSE) } - # Check if DE has non-null row names if (is.null(rownames(DE))) { stop("Row names of 'DE' are NULL.", call. = FALSE) - } - + } + if (!missing(deg_tstats)) { + if (!is.data.frame(deg_tstats)) { + stop("The 'deg_tstats' parameter is not a dataframe.", call. = FALSE) + } + if (!("t" %in% colnames(deg_tstats))) { + stop("'t' is not a column in 'deg_tstats'.", call. = FALSE) + } + if (is.null(rownames(deg_tstats))) { + stop("Row names of 'deg_tstats' are NULL.", call. = FALSE) + } + if (!all(grepl("^ENST\\d+", rownames(deg_tstats)))) { + stop("The row names of 'deg_tstats' must be ENSEMBL or Gencode IDs (ENST...)", call. = FALSE) + } + } + ## Locate common transcripts - deg_tstats = qsvaR::degradation_tstats - rownames(deg_tstats) = check_tx_names(rownames(DE),rownames(qsvaR::degradation_tstats),'rownames(DE)','qsvaR::degradation_tstats') - common = intersect(rownames(deg_tstats), rownames(DE)) - + whichTx <- which_tx_names(rownames(DE),rownames(deg_tstats)) + common = qsvaR::normalize_tx_names(rownames(DE)[whichTx]) stopifnot(length(common) > 0) - + rownames(deg_tstats) <- qsvaR::normalize_tx_names(rownames(deg_tstats)) ## Create dataframe with common transcripts common_data <- data.frame( - degradation_t = deg_tstats$t[match(common, rownames(deg_tstats))], - DE_t = DE$t[match(common, rownames(DE))] + degradation_t = deg_tstats[common, "t"], + DE_t = DE[whichTx, "t"] ) + cor_val <- signif(cor(common_data[, 1], common_data[, 2]), 2) p <- ggplot(common_data, aes(x = DE_t, y = degradation_t)) + xlab("DE t-statistic") + ylab("Degradation t-statistic") + - geom_bin2d(bins = 70) + + geom_bin2d(bins = 70, show.legend = show.legend) + scale_fill_continuous(type = "viridis") + theme_bw() + - labs(caption = paste0("correlation: ", signif(cor(common_data[, 1], common_data[, 2]), 2))) + theme(text = element_text(size = font.size)) + # labs(caption = paste0("correlation: ", cor_val) + if (show.cor != 'none') { + switch(show.cor, + 'caption' = { + p <- p + labs(caption = paste0(cor.label, cor_val)) + }, + 'corner-top' = { + p <- p + annotate("text", x = max(common_data$DE_t), y = max(common_data$degradation_t), + label = paste0(cor.label, cor_val), size = cor.size, hjust = 1, vjust = 1) + }, + 'corner-bottom' = { + p <- p + annotate("text", x = max(common_data$DE_t), y = min(common_data$degradation_t), + label = paste0(cor.label, cor_val), size = cor.size, hjust = 1, vjust = 0) + }) + } return(p) } diff --git a/R/covComb_tx_deg-data.R b/R/covComb_tx_deg-data.R deleted file mode 100644 index 509b498..0000000 --- a/R/covComb_tx_deg-data.R +++ /dev/null @@ -1,14 +0,0 @@ -#' RSE object of RNA-seq data that serves as output for degradation analysis -#' -#' This data was generated from an experiment using degraded RNA-seq samples -#' post-mortem brain tissue. The transcripts included are the result of the qsva -#' expanded framework study and will be used to remove the effect of degradation -#' in bulk RNA-seq data. -#' -#' -#' @name covComb_tx_deg -#' @docType data -#' @format A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] -#' @keywords datasets -#' @seealso [getPCs] [k_qsvs] [getDegTx] -NULL diff --git a/R/degradation_tstats-data.R b/R/degradation_tstats-data.R index cb794de..514f888 100644 --- a/R/degradation_tstats-data.R +++ b/R/degradation_tstats-data.R @@ -1,10 +1,10 @@ #' Degradation time t-statistics #' -#' These t-statistics are derived from the same data that was used for -#' [covComb_tx_deg]. They are the results from main model where we determined -#' the relationship with degradation time adjusting for the brain region (so -#' parallel degradation effects across brain regions). They are used for -#' plotting in `DEqual()`. +#' These t-statistics are derived from the degradation timepoints data +#' built into qsvaR. They are the results from multiple models where +#' we determined the association of transcripts with degradation time +#' adjusting for brain region (so parallel degradation effects across +#' brain regions). They are used for plotting in `DEqual()`. #' #' @name degradation_tstats #' @docType data diff --git a/R/getDegTx.R b/R/getDegTx.R index 9a65b17..f9e4f7e 100644 --- a/R/getDegTx.R +++ b/R/getDegTx.R @@ -19,8 +19,9 @@ #' These proportions were then added to our `model.matrix()` and the union of the top 1000 transcripts in the interaction model, #' the main effect model, and the cell proportions model were used to generate this model of qSVs. #' -#' @param assayname character string specifying the name of the assay desired in rse_tx #' @param sig_transcripts A list of transcripts determined to have degradation signal in the qsva expanded paper. +#' @param assayname character string specifying the name of the assay desired in rse_tx +#' @param verbose specify if the function should report how many model transcripts were matched #' #' @return A #' [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] @@ -31,28 +32,38 @@ #' @import rlang #' #' @examples -#' getDegTx(covComb_tx_deg) -#' stopifnot(mean(rowMeans(assays(covComb_tx_deg)$tpm)) > 1) -getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = select_transcripts(type), assayname = "tpm") { - - type = arg_match(type) - +#' degTx <- getDegTx(rse_tx, "standard") +getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), + sig_transcripts = NULL, assayname = "tpm", verbose = TRUE) { + + #type = arg_match(type) + if (is.null(sig_transcripts)) { + type = arg_match(type) + sig_transcripts <- select_transcripts(type) + } else { + type = "custom" + } # Validate rse_tx is a RangedSummarizedExperiment object if (!is(rse_tx, "RangedSummarizedExperiment")) { stop("'rse_tx' must be a RangedSummarizedExperiment object.", call. = FALSE) } - + # Check if assayname is in assayNames if (!assayname %in% assayNames(rse_tx)) { stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE) } - - # Check for validity and matching of tx names - sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts') - - # Subset rse_tx to include sig_transcripts - rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE] - + + # Check for validity and matching of tx names and return the tx subset indexes in rse_tx + wtx <- which_tx_names(rownames(rse_tx), sig_transcripts) + if (length(wtx) == 0) { + stop("No transcripts found in the '",type, "' degradation model transcripts" ) + } + + if (verbose) { + message(" '",type,"' degradation model transcripts found: ", length(wtx)) + } + rse_tx <- rse_tx[wtx, , drop = FALSE] + # Check if the row means is greater than 1 if (mean(rowMeans(assays(rse_tx)[[assayname]])) < 1) { warning("The transcripts selected are lowly expressed in your dataset. This can impact downstream analysis.") diff --git a/R/getPCs.R b/R/getPCs.R index 83412d9..08d66a3 100644 --- a/R/getPCs.R +++ b/R/getPCs.R @@ -11,7 +11,7 @@ #' @importFrom stats prcomp #' @import SummarizedExperiment #' @examples -#' getPCs(covComb_tx_deg, "tpm") +#' getPCs(rse_tx, "tpm") getPCs <- function(rse_tx, assayname = "tpm") { # Validate rse_tx is a RangedSummarizedExperiment object diff --git a/R/get_qsvs.R b/R/get_qsvs.R index 22d1d62..6c54109 100644 --- a/R/get_qsvs.R +++ b/R/get_qsvs.R @@ -12,7 +12,7 @@ #' @export #' #' @examples -#' qsv <- getPCs(covComb_tx_deg, "tpm") +#' qsv <- getPCs(rse_tx, "tpm") #' get_qsvs(qsv, 2) get_qsvs <- function(qsvPCs, k) { diff --git a/R/k_qsvs.R b/R/k_qsvs.R index 79b0e0d..3903f5b 100644 --- a/R/k_qsvs.R +++ b/R/k_qsvs.R @@ -14,10 +14,10 @@ #' @import SummarizedExperiment #' @examples #' ## First we need to define a statistical model. We'll use the example -#' ## covComb_tx_deg data. Note that the model you'll use in your own data +#' ## rse_tx data. Note that the model you'll use in your own data #' ## might look different from this model. #' mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, -#' data = colData(covComb_tx_deg) +#' data = colData(rse_tx) #' ) #' #' ## To ensure that the results are reproducible, you will need to set a @@ -25,7 +25,7 @@ #' ## sva::num.sv() which needs a random seed to ensure reproducibility of the #' ## results. #' set.seed(20230621) -#' k_qsvs(covComb_tx_deg, mod, "tpm") +#' k_qsvs(rse_tx, mod, "tpm") k_qsvs <- function(rse_tx, mod, assayname) { # Validate rse_tx is a RangedSummarizedExperiment object diff --git a/R/qSVA.R b/R/qSVA.R index ad12fe1..a2d8027 100644 --- a/R/qSVA.R +++ b/R/qSVA.R @@ -3,14 +3,14 @@ #' @param rse_tx A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] object containing #' the transcript data desired to be studied. #' @param type a character string specifying which model you would -#' like to use when selecting a degradation matrix. -#' @param sig_transcripts A list of transcripts that are associated with -#' degradation signal. Use `select_transcripts()` to select sets of transcripts -#' identified by the qSVA expanded paper. Specifying a `character()` input of -#' ENSEMBL transcript IDs (or whatever values you have at `rownames(rse_tx)`) -#' obtained outside of `select_transcripts()` overrides -#' the user friendly `type` argument. That is, this argument provides more fine -#' tuning options for advanced users. +#' like to use from the sets of signature transcripts identified +#' by the qsvaR package. This can be omitted if a custom set of +#' transcripts is provided to sig_transcripts. +#' @param sig_transcripts A list of transcript IDs that are associated +#' with degradation signal. Specifying a `character()` input with ENSEMBL +#' transcript IDs (whose values should match entries in `rownames(rse_tx)`). +#' This argument provides a custom list of transcripts for adjusting +#' for degradation; this should be used instead of the `type` argument. #' @param mod Model Matrix with necessary variables the you would #' model for in differential expression #' @param assayname character string specifying the name of @@ -21,10 +21,10 @@ #' #' @examples #' ## First we need to define a statistical model. We'll use the example -#' ## covComb_tx_deg data. Note that the model you'll use in your own data +#' ## rse_tx data. Note that the model you'll use in your own data #' ## might look different from this model. #' mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, -#' data = colData(covComb_tx_deg) +#' data = colData(rse_tx) #' ) #' #' ## To ensure that the results are reproducible, you will need to set a @@ -32,32 +32,29 @@ #' ## sva::num.sv() which needs a random seed to ensure reproducibility of the #' ## results. #' set.seed(20230621) -#' qSVA(rse_tx = covComb_tx_deg, type = "cell_component", mod = mod, assayname = "tpm") +#' qSVA(rse_tx = rse_tx, type = "cell_component", mod = mod, assayname = "tpm") #' qSVA <- - function(rse_tx, - type = c("cell_component", "standard", "top1500"), - sig_transcripts = select_transcripts(type), - mod, - assayname) { - ## We don't need to pass type to getDegTx() since it's not used internally - ## once the sig_transcripts have been defined. - - type = arg_match(type) - + function(rse_tx, type = c("cell_component", "standard", "top1500"), + sig_transcripts = NULL, mod, assayname) { + + if (is.null(sig_transcripts)) { + type = arg_match(type) # must be one of those in the list if sig_transcripts is NULL + } + # Validate rse_tx is a RangedSummarizedExperiment object if (!is(rse_tx, "RangedSummarizedExperiment")) { stop("'rse_tx' must be a RangedSummarizedExperiment object.", call. = FALSE) } - + # Check if assayname is in assayNames if (!assayname %in% assayNames(rse_tx)) { stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE) } - + # Get the qSVs DegTx <- - getDegTx(rse_tx, sig_transcripts = sig_transcripts, assayname = assayname) + getDegTx(rse_tx, type = type, sig_transcripts = sig_transcripts, assayname = assayname) PCs <- getPCs(DegTx, assayname) k <- k_qsvs(DegTx, mod = mod, assayname = assayname) qSV <- get_qsvs(PCs, k) diff --git a/R/rse_tx-data.R b/R/rse_tx-data.R new file mode 100644 index 0000000..d4ab408 --- /dev/null +++ b/R/rse_tx-data.R @@ -0,0 +1,13 @@ +#' Example of RSE object with RNA-seq transcript quantification data +#' +#' This data is a [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] +#' with transcript quantification data stored in an "tpm" assay. It is +#' used to demonstrate the use of qsvaR in bulk RNA-seq data. +#' +#' +#' @name rse_tx +#' @docType data +#' @format A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] +#' @keywords datasets +#' @seealso [getPCs] [k_qsvs] [getDegTx] [qSVA] +NULL diff --git a/R/utils.R b/R/utils.R index 57cc94b..f8f202d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,59 +1,55 @@ -#' Check validity of transcript vectors + + +#' Remove version number from Gencode/Ensembl transcript names #' -#' This function is used to check if the tx1 and tx2 are GENCODE or ENSEMBL and print an error message if it's not and return a character vector of transcripts in tx2 that are in tx1. +#' This function removes the Gencode/ENSEMBL version from the transcript ID, while protecting _PAR_Y suffixes if present #' -#' @param tx1 A `character()` vector of GENCODE or ENSEMBL transcripts. -#' @param tx2 A `character()` vector of GENCODE or ENSEMBL transcripts. +#' @param txnames A `character()` vector of GENCODE or ENSEMBL transcript IDs #' -#' @param arg_name1 A `character(1)` vector of description of tx1 -#' @param arg_name2 A `character(1)` vector of description of tx2 #' #' @return A -#' `character()` vector of transcripts in `tx2` that are in `tx1`. +#' `character()` vector of transcript names without versioning #' #' @export #' #' @examples -#' sig_transcripts = select_transcripts("cell_component") -#' check_tx_names(rownames(covComb_tx_deg), sig_transcripts, 'rownames(covComb_tx_deg)', 'sig_transcripts') +#' ensIDs <- normalize_tx_names(rownames(rse_tx)) +normalize_tx_names <- function(txnames) { + sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE) +} -check_tx_names = function(tx1, tx2, arg_name1, arg_name2) { - # Functions for checking whether a vector of transcripts all match GENCODE - # or ENSEMBL naming conventions - is_gencode = function(x) all(grepl("^ENST.*?\\.", x)) - is_ensembl = function(x) all(grepl("^ENST", x) & !grepl("\\.", x)) - - # Check that both vectors either follow GENCODE or ENSEMBL - if (!is_gencode(tx1) && !is_ensembl(tx1)) { - stop( - sprintf( - "'%s' must use either all GENCODE or all ENSEMBL transcript IDs", - arg_name1 - ) - ) - } - if (!is_gencode(tx2) && !is_ensembl(tx2)) { - stop( - sprintf( - "'%s' must use either all GENCODE or all ENSEMBL transcript IDs", - arg_name2 - ) - ) - } - - # Change 'tx2' to match 'tx1', noting that the case where 'tx1' is GENCODE - # but 'tx2' is ENSEMBL is not allowed (and an error will be thrown further - # down) - if (is_gencode(tx2) && is_ensembl(tx1)) { - tx2 = sub('\\..*', '', tx2) + +#' Check validity of transcript vectors and return a vector matching indexes in tx1 +#' +#' This function is used to check if tx1 and tx2 are GENCODE or ENSEMBL transcript IDs +#' and return an integer vector of tx1 transcript indexes that are in tx2. +#' +#' @param txnames A `character()` vector of GENCODE or ENSEMBL transcript IDs. +#' @param sig_tx A `character()` vector of GENCODE or ENSEMBL signature transcript IDs. +#' +#' +#' @return A +#' `integer()` vector of `txnames` transcript indexes in `sig_tx`. +#' +#' @export +#' +#' @examples +#' sig_tx <- select_transcripts("cell_component") +#' whichTx <- which_tx_names(rownames(rse_tx), sig_tx) + +which_tx_names = function(txnames, sig_tx) { + ## Between releases 25 and 43, PAR genes and transcripts had the "_PAR_Y" suffix appended to their identifiers. + ## Since release 44, these have their own IDs + if (!all(grepl("^ENST\\d+", txnames))) { + stop("The transcript names must be ENSEMBL or Gencode IDs (ENST...)" ) } - - # At least some transcripts must overlap between 'tx1' and 'tx2' - if (!any(tx2 %in% tx1)) { - stop(sprintf("None of '%s' are in '%s'", arg_name2, arg_name1)) + if (!all(grepl("^ENST\\d+", sig_tx))) { + stop("The signature transcript names must be ENSEMBL or Gencode IDs (ENST...)" ) } - - # Since only 'tx2' was modified, return the changed copy - return(tx2) -} \ No newline at end of file + ## normalize the transcript names + r_tx <- normalize_tx_names(txnames) + s_tx <- normalize_tx_names(sig_tx) + which(r_tx %in% s_tx) +} + diff --git a/data-raw/covComb_tx_deg.R b/data-raw/covComb_tx_deg.R deleted file mode 100644 index f81517f..0000000 --- a/data-raw/covComb_tx_deg.R +++ /dev/null @@ -1,6 +0,0 @@ -## code to prepare `covComb_tx_deg` dataset goes here - -## change to jhpce path -load("data/covComb_tx_deg.rda") - -usethis::use_data(covComb_tx_deg, overwrite = TRUE) diff --git a/data-raw/rse_tx.R b/data-raw/rse_tx.R new file mode 100644 index 0000000..fb58494 --- /dev/null +++ b/data-raw/rse_tx.R @@ -0,0 +1,5 @@ +## code to prepare `rse_tx` example goes here + +load("data/rse_tx.rda") +# the above loads rse_tx +usethis::use_data(rse_tx, overwrite = TRUE) diff --git a/data/covComb_tx_deg.rda b/data/covComb_tx_deg.rda deleted file mode 100644 index 08e72ca..0000000 Binary files a/data/covComb_tx_deg.rda and /dev/null differ diff --git a/data/rse_tx.rda b/data/rse_tx.rda new file mode 100644 index 0000000..cc3ff6d Binary files /dev/null and b/data/rse_tx.rda differ diff --git a/man/DEqual.Rd b/man/DEqual.Rd index c4b3b3e..a47fbd1 100644 --- a/man/DEqual.Rd +++ b/man/DEqual.Rd @@ -4,12 +4,37 @@ \alias{DEqual} \title{Differential expression quality (DEqual) plot} \usage{ -DEqual(DE) +DEqual( + DE, + deg_tstats = qsvaR::degradation_tstats, + show.legend = TRUE, + show.cor = c("caption", "corner-top", "corner-bottom", "none"), + font.size = 12, + cor.size = font.size/2, + cor.label = "cor: " +) } \arguments{ -\item{DE}{a \code{data.frame()} with one column containing the t-statistics from -Differential Expression, typically generated with \code{limma::topTable()}. -The \code{rownames(DE)} should be transcript GENCODE IDs.} +\item{DE}{a \code{data.frame()} with a column "t" containing the t-statistics +from Differential Expression, typically generated with \code{limma::topTable()}. +\code{rownames(DE)} must have transcript Ensembl/Gencode IDs.} + +\item{deg_tstats}{an optional\code{data.frame()} with a column "t" containing +t-statistics resulted from a degradation experiment. Default is the +internal \code{qsvaR::degradation_tstats} from the package authors.} + +\item{show.legend}{logical (default TRUE) to show legend in the plot} + +\item{show.cor}{specify where to show the correlation value. Can be one of +"caption", "corner-top", "corner-bottom", or "none".} + +\item{font.size}{numeric value to set the base font size of the plot} + +\item{cor.size}{numeric (default font.size/2) to set the font size for the +correlation text} + +\item{cor.label}{character (default "cor: ") to set the text preceding the +correlation value} } \value{ a \code{ggplot} object of the DE t-statistic vs @@ -24,7 +49,7 @@ included in Collado-Torres et al, Neuron, 2019 \url{https://doi.org/10.1016/j.neuron.2019.05.013}. This function compares your t-statistics of interest computed on transcripts against the t-statistics from degradation time adjusting for the six brain regions from -degradation experiment data used for determining \code{covComb_tx_deg}. +degradation experiment data used for determining \code{rse_tx}. } \examples{ diff --git a/man/check_tx_names.Rd b/man/check_tx_names.Rd deleted file mode 100644 index bc3c515..0000000 --- a/man/check_tx_names.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{check_tx_names} -\alias{check_tx_names} -\title{Check validity of transcript vectors} -\usage{ -check_tx_names(tx1, tx2, arg_name1, arg_name2) -} -\arguments{ -\item{tx1}{A \code{character()} vector of GENCODE or ENSEMBL transcripts.} - -\item{tx2}{A \code{character()} vector of GENCODE or ENSEMBL transcripts.} - -\item{arg_name1}{A \code{character(1)} vector of description of tx1} - -\item{arg_name2}{A \code{character(1)} vector of description of tx2} -} -\value{ -A -\code{character()} vector of transcripts in \code{tx2} that are in \code{tx1}. -} -\description{ -This function is used to check if the tx1 and tx2 are GENCODE or ENSEMBL and print an error message if it's not and return a character vector of transcripts in tx2 that are in tx1. -} -\examples{ -sig_transcripts = select_transcripts("cell_component") -check_tx_names(rownames(covComb_tx_deg), sig_transcripts, 'rownames(covComb_tx_deg)', 'sig_transcripts') -} diff --git a/man/covComb_tx_deg.Rd b/man/covComb_tx_deg.Rd deleted file mode 100644 index 623f143..0000000 --- a/man/covComb_tx_deg.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/covComb_tx_deg-data.R -\docType{data} -\name{covComb_tx_deg} -\alias{covComb_tx_deg} -\title{RSE object of RNA-seq data that serves as output for degradation analysis} -\format{ -A \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} -} -\description{ -This data was generated from an experiment using degraded RNA-seq samples -post-mortem brain tissue. The transcripts included are the result of the qsva -expanded framework study and will be used to remove the effect of degradation -in bulk RNA-seq data. -} -\seealso{ -\link{getPCs} \link{k_qsvs} \link{getDegTx} -} -\keyword{datasets} diff --git a/man/degradation_tstats.Rd b/man/degradation_tstats.Rd index 31e1413..545c578 100644 --- a/man/degradation_tstats.Rd +++ b/man/degradation_tstats.Rd @@ -9,11 +9,11 @@ A \code{data.frame()} with the \code{t} statistics for degradation time. The \code{rownames()} are the GENCODE transcript IDs. } \description{ -These t-statistics are derived from the same data that was used for -\link{covComb_tx_deg}. They are the results from main model where we determined -the relationship with degradation time adjusting for the brain region (so -parallel degradation effects across brain regions). They are used for -plotting in \code{DEqual()}. +These t-statistics are derived from the degradation timepoints data +built into qsvaR. They are the results from multiple models where +we determined the association of transcripts with degradation time +adjusting for brain region (so parallel degradation effects across +brain regions). They are used for plotting in \code{DEqual()}. } \seealso{ \link{DEqual} diff --git a/man/getDegTx.Rd b/man/getDegTx.Rd index 2765b17..baefbbb 100644 --- a/man/getDegTx.Rd +++ b/man/getDegTx.Rd @@ -7,8 +7,9 @@ getDegTx( rse_tx, type = c("cell_component", "standard", "top1500"), - sig_transcripts = select_transcripts(type), - assayname = "tpm" + sig_transcripts = NULL, + assayname = "tpm", + verbose = TRUE ) } \arguments{ @@ -30,6 +31,8 @@ the main effect model, and the cell proportions model were used to generate this \item{sig_transcripts}{A list of transcripts determined to have degradation signal in the qsva expanded paper.} \item{assayname}{character string specifying the name of the assay desired in rse_tx} + +\item{verbose}{specify if the function should report how many model transcripts were matched} } \value{ A @@ -43,6 +46,5 @@ postmortem brain tissues. This object can later be used to obtain the principle necessary to remove the effect of degradation in differential expression. } \examples{ -getDegTx(covComb_tx_deg) -stopifnot(mean(rowMeans(assays(covComb_tx_deg)$tpm)) > 1) +degTx <- getDegTx(rse_tx, "standard") } diff --git a/man/getPCs.Rd b/man/getPCs.Rd index 83f3ae7..9a50d78 100644 --- a/man/getPCs.Rd +++ b/man/getPCs.Rd @@ -18,5 +18,5 @@ prcomp object generated by taking the pcs of degraded transcripts This function returns the pcs from the obtained RangedSummarizedExperiment object of selected transcripts } \examples{ -getPCs(covComb_tx_deg, "tpm") +getPCs(rse_tx, "tpm") } diff --git a/man/get_qsvs.Rd b/man/get_qsvs.Rd index 2a01244..68a51d2 100644 --- a/man/get_qsvs.Rd +++ b/man/get_qsvs.Rd @@ -20,6 +20,6 @@ Using the pcs and the k number of components be included, we generate the qsva matrix. } \examples{ -qsv <- getPCs(covComb_tx_deg, "tpm") +qsv <- getPCs(rse_tx, "tpm") get_qsvs(qsv, 2) } diff --git a/man/k_qsvs.Rd b/man/k_qsvs.Rd index 0fd9b6a..90cb1a4 100644 --- a/man/k_qsvs.Rd +++ b/man/k_qsvs.Rd @@ -23,10 +23,10 @@ Apply num.sv algorithm to determine the number of pcs to be included } \examples{ ## First we need to define a statistical model. We'll use the example -## covComb_tx_deg data. Note that the model you'll use in your own data +## rse_tx data. Note that the model you'll use in your own data ## might look different from this model. mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, - data = colData(covComb_tx_deg) + data = colData(rse_tx) ) ## To ensure that the results are reproducible, you will need to set a @@ -34,5 +34,5 @@ mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, ## sva::num.sv() which needs a random seed to ensure reproducibility of the ## results. set.seed(20230621) -k_qsvs(covComb_tx_deg, mod, "tpm") +k_qsvs(rse_tx, mod, "tpm") } diff --git a/man/normalize_tx_names.Rd b/man/normalize_tx_names.Rd new file mode 100644 index 0000000..5e4174b --- /dev/null +++ b/man/normalize_tx_names.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{normalize_tx_names} +\alias{normalize_tx_names} +\title{Remove version number from Gencode/Ensembl transcript names} +\usage{ +normalize_tx_names(txnames) +} +\arguments{ +\item{txnames}{A \code{character()} vector of GENCODE or ENSEMBL transcript IDs} +} +\value{ +A +\code{character()} vector of transcript names without versioning +} +\description{ +This function removes the Gencode/ENSEMBL version from the transcript ID, while protecting _PAR_Y suffixes if present +} +\examples{ +ensIDs <- normalize_tx_names(rownames(rse_tx)) +} diff --git a/man/qSVA.Rd b/man/qSVA.Rd index 48b5fb5..30bae9b 100644 --- a/man/qSVA.Rd +++ b/man/qSVA.Rd @@ -7,7 +7,7 @@ qSVA( rse_tx, type = c("cell_component", "standard", "top1500"), - sig_transcripts = select_transcripts(type), + sig_transcripts = NULL, mod, assayname ) @@ -17,15 +17,15 @@ qSVA( the transcript data desired to be studied.} \item{type}{a character string specifying which model you would -like to use when selecting a degradation matrix.} +like to use from the sets of signature transcripts identified +by the qsvaR package. This can be omitted if a custom set of +transcripts is provided to sig_transcripts.} -\item{sig_transcripts}{A list of transcripts that are associated with -degradation signal. Use \code{select_transcripts()} to select sets of transcripts -identified by the qSVA expanded paper. Specifying a \code{character()} input of -ENSEMBL transcript IDs (or whatever values you have at \code{rownames(rse_tx)}) -obtained outside of \code{select_transcripts()} overrides -the user friendly \code{type} argument. That is, this argument provides more fine -tuning options for advanced users.} +\item{sig_transcripts}{A list of transcript IDs that are associated +with degradation signal. Specifying a \code{character()} input with ENSEMBL +transcript IDs (whose values should match entries in \code{rownames(rse_tx)}). +This argument provides a custom list of transcripts for adjusting +for degradation; this should be used instead of the \code{type} argument.} \item{mod}{Model Matrix with necessary variables the you would model for in differential expression} @@ -41,10 +41,10 @@ A wrapper function used to perform qSVA in one step. } \examples{ ## First we need to define a statistical model. We'll use the example -## covComb_tx_deg data. Note that the model you'll use in your own data +## rse_tx data. Note that the model you'll use in your own data ## might look different from this model. mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, - data = colData(covComb_tx_deg) + data = colData(rse_tx) ) ## To ensure that the results are reproducible, you will need to set a @@ -52,6 +52,6 @@ mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, ## sva::num.sv() which needs a random seed to ensure reproducibility of the ## results. set.seed(20230621) -qSVA(rse_tx = covComb_tx_deg, type = "cell_component", mod = mod, assayname = "tpm") +qSVA(rse_tx = rse_tx, type = "cell_component", mod = mod, assayname = "tpm") } diff --git a/man/rse_tx.Rd b/man/rse_tx.Rd new file mode 100644 index 0000000..4edc43e --- /dev/null +++ b/man/rse_tx.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rse_tx-data.R +\docType{data} +\name{rse_tx} +\alias{rse_tx} +\title{Example of RSE object with RNA-seq transcript quantification data} +\format{ +A \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} +} +\description{ +This data is a \link[SummarizedExperiment:RangedSummarizedExperiment-class]{RangedSummarizedExperiment-class} +with transcript quantification data stored in an "tpm" assay. It is +used to demonstrate the use of qsvaR in bulk RNA-seq data. +} +\seealso{ +\link{getPCs} \link{k_qsvs} \link{getDegTx} \link{qSVA} +} +\keyword{datasets} diff --git a/man/which_tx_names.Rd b/man/which_tx_names.Rd new file mode 100644 index 0000000..e0f7f37 --- /dev/null +++ b/man/which_tx_names.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{which_tx_names} +\alias{which_tx_names} +\title{Check validity of transcript vectors and return a vector matching indexes in tx1} +\usage{ +which_tx_names(txnames, sig_tx) +} +\arguments{ +\item{txnames}{A \code{character()} vector of GENCODE or ENSEMBL transcript IDs.} + +\item{sig_tx}{A \code{character()} vector of GENCODE or ENSEMBL signature transcript IDs.} +} +\value{ +A +\code{integer()} vector of \code{txnames} transcript indexes in \code{sig_tx}. +} +\description{ +This function is used to check if tx1 and tx2 are GENCODE or ENSEMBL transcript IDs +and return an integer vector of tx1 transcript indexes that are in tx2. +} +\examples{ +sig_tx <- select_transcripts("cell_component") +whichTx <- which_tx_names(rownames(rse_tx), sig_tx) +} diff --git a/tests/testthat/test-DEqual.R b/tests/testthat/test-DEqual.R index 3cb4f04..9e2ea56 100644 --- a/tests/testthat/test-DEqual.R +++ b/tests/testthat/test-DEqual.R @@ -12,7 +12,7 @@ random_de <- data.frame( # Test if DEqual throws an error when input is not a dataframe test_that("DEqual throws an error for non-dataframe input", { # Test if DEqual throws an error when input is not a dataframe - expect_error(DEqual(covComb_tx_deg), "The input to DEqual is not a dataframe.") + expect_error(DEqual(rse_tx), "The input to DEqual is not a dataframe.") }) # Test when 't' is not in the columns diff --git a/tests/testthat/test-getDegTx.R b/tests/testthat/test-getDegTx.R index 4f2bc7a..fbb8f5e 100644 --- a/tests/testthat/test-getDegTx.R +++ b/tests/testthat/test-getDegTx.R @@ -1,49 +1,42 @@ # Filter out lowly expressed transcripts and test if the number of rows in getDegTx output matches expected transcript count -rse_tx_low <- covComb_tx_deg[rowMeans(assays(covComb_tx_deg)$tpm) < 1, ] +rse_tx_low <- rse_tx[rowMeans(assays(rse_tx)$tpm) < 1, ] test_that("length for number of rows is the same a length sig_transcripts", { - expect_equal(length(rownames(getDegTx(covComb_tx_deg))), length(select_transcripts("cell_component"))) + expect_equal(length(rownames(getDegTx(rse_tx))), length(select_transcripts("cell_component"))) }) # Test if number of columns in getDegTx output matches number of columns in original dataset test_that("length for number of columns is the same a length sig_transcripts", { - expect_equal(length(colnames(getDegTx(covComb_tx_deg))), length(colnames(covComb_tx_deg))) + expect_equal(length(colnames(getDegTx(rse_tx))), length(colnames(rse_tx))) }) # Test if getDegTx returns an object of the same class as its input test_that("output is an RSE", { - expect_equal(class(getDegTx(covComb_tx_deg)), class(covComb_tx_deg)) + expect_equal(class(getDegTx(rse_tx)), class(rse_tx)) }) # Test for a warning when getDegTx is used on a dataset with lowly expressed transcripts test_that("test warning output for lowly expressed transcripts", { - expect_warning(getDegTx(rse_tx_low), "The transcripts selected are lowly expressed in your dataset. This can impact downstream analysis.") + expect_warning(getDegTx(rse_tx_low, verbose=FALSE), + "The transcripts selected are lowly expressed in your dataset. This can impact downstream analysis.") }) # Test for rownames starting with "ENST" -test_that("getDegTx correctly processes covComb_tx_deg", { - # If covComb_tx_deg is correctly structured and all rownames start with "ENST", expect no error - expect_silent(getDegTx(covComb_tx_deg)) +test_that("getDegTx correctly processes rse_tx", { + # If rse_tx is correctly structured and all rownames start with "ENST", expect no error + expect_silent(getDegTx(rse_tx, verbose=FALSE)) }) -# Test where at least one sig_transcript is in covComb_tx_deg rownames -test_that("At least one sig_transcript is in covComb_tx_deg rownames", { - sig_transcripts <- select_transcripts("cell_component") - expect_silent({ - # Check if any of the sig_transcripts are in covComb_tx_deg rownames - getDegTx(covComb_tx_deg, sig_transcripts = sig_transcripts) - }) -}) # Test whether getDegTx gives the same results with original and altered row names test_that("getDegTx works with original and altered row names", { set.seed(123) - # Apply getDegTx to covComb_tx_deg - original_results <- getDegTx(covComb_tx_deg,sig_transcripts =select_transcripts("cell_component")) - - # Alter the row names of covComb_tx_deg and apply getDegTx - altered_covComb_tx_deg <- covComb_tx_deg - rownames(altered_covComb_tx_deg) <- gsub("\\..*", "", rownames(covComb_tx_deg)) - altered_results <- getDegTx(altered_covComb_tx_deg,sig_transcripts =select_transcripts("cell_component")) + # Apply getDegTx to rse_tx + original_results <- getDegTx(rse_tx, sig_transcripts =select_transcripts("cell_component")) + + # Alter the row names of rse_tx and apply getDegTx + altered_rse_tx <- rse_tx + rownames(altered_rse_tx) <- gsub("\\..\\d+", "", rownames(rse_tx)) + altered_results <- getDegTx(altered_rse_tx, sig_transcripts =select_transcripts("cell_component")) rownames(altered_results) <- rownames(original_results) # Test if two objects identical expect_identical(original_results, altered_results) @@ -51,7 +44,7 @@ test_that("getDegTx works with original and altered row names", { # Test for assayname not in assayNames test_that("getDegTx throws an error when assayname is not in assayNames", { - expect_error(getDegTx(covComb_tx_deg, assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") + expect_error(getDegTx(rse_tx, assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") }) # Test for input is an rse object diff --git a/tests/testthat/test-getPCs.R b/tests/testthat/test-getPCs.R index 13d359a..35e75e8 100644 --- a/tests/testthat/test-getPCs.R +++ b/tests/testthat/test-getPCs.R @@ -1,10 +1,10 @@ test_that("output is a prcomp", { - expect_equal(class(getPCs(covComb_tx_deg, "tpm")), "prcomp") + expect_equal(class(getPCs(rse_tx, "tpm")), "prcomp") }) # Test for assayname not in assayNames test_that("getPCs throws an error when assayname is not in assayNames", { - expect_error(getPCs(covComb_tx_deg, assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") + expect_error(getPCs(rse_tx, assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") }) # Test for input is an rse object diff --git a/tests/testthat/test-get_qsvs.R b/tests/testthat/test-get_qsvs.R index d668299..4667ce4 100644 --- a/tests/testthat/test-get_qsvs.R +++ b/tests/testthat/test-get_qsvs.R @@ -1,13 +1,13 @@ -mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, data = colData(covComb_tx_deg)) -k <- k_qsvs(covComb_tx_deg, mod, "tpm") -qsv <- getPCs(covComb_tx_deg, "tpm") +mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, data = colData(rse_tx)) +k <- k_qsvs(rse_tx, mod, "tpm") +qsv <- getPCs(rse_tx, "tpm") test_that("number of qsvs is k", { expect_equal(length(colnames(get_qsvs(qsv, k))), k) }) test_that("length of qsv rownames is the same as number of samples", { - expect_equal(length(rownames(get_qsvs(qsv, k))), length(colnames(covComb_tx_deg))) + expect_equal(length(rownames(get_qsvs(qsv, k))), length(colnames(rse_tx))) }) test_that("output is a matrix", { @@ -28,5 +28,5 @@ test_that("k is higher than the number of columns throws an error", { }) test_that("input has to be a prcomp", { - expect_error(get_qsvs(covComb_tx_deg, 3), "'qsvPCs' must be a prcomp object.") + expect_error(get_qsvs(rse_tx, 3), "'qsvPCs' must be a prcomp object.") }) \ No newline at end of file diff --git a/tests/testthat/test-k_qsvs.R b/tests/testthat/test-k_qsvs.R index e092a38..3d525ab 100644 --- a/tests/testthat/test-k_qsvs.R +++ b/tests/testthat/test-k_qsvs.R @@ -1,7 +1,7 @@ -mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, data = colData(covComb_tx_deg)) +mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, data = colData(rse_tx)) # rse_tx_low has low expression -rse_tx_low <- covComb_tx_deg[rowMeans(assays(covComb_tx_deg)$tpm) < 1, ] +rse_tx_low <- rse_tx[rowMeans(assays(rse_tx)$tpm) < 1, ] # mod2 is not full rank mod2 <- mod @@ -10,7 +10,7 @@ colnames(mod2)[8] <- "mitotest" ## Set a random seed set.seed(20230621) -k_res <- k_qsvs(covComb_tx_deg, mod, "tpm") +k_res <- k_qsvs(rse_tx, mod, "tpm") test_that("length pf output is 1", { expect_equal(length(k_res), 1) @@ -32,30 +32,30 @@ test_that("getDegTx throws an error when input is not a RangedSummarizedExperime # Test for assayname not in assayNames test_that("k_qsvs throws an error when assayname is not in assayNames", { - expect_error(k_qsvs(covComb_tx_deg, assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") + expect_error(k_qsvs(rse_tx, assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") }) # Test when number of rows in 'mod' does not match number of columns in 'rse_tx' test_that("Number of rows in 'mod' does not match number of columns in 'rse_tx'", { mod_not_matching <- mod mod_not_matching <- mod_not_matching[-1, ] - expect_error(k_qsvs(covComb_tx_deg, mod_not_matching, "tpm"), "The number of rows in 'mod' does not match the number of input 'rse_tx' columns.") + expect_error(k_qsvs(rse_tx, mod_not_matching, "tpm"), "The number of rows in 'mod' does not match the number of input 'rse_tx' columns.") }) test_that("non-full rank data throws error", { set.seed(20230621) - expect_error(qSVA(rse_tx = covComb_tx_deg, type = "cell_component", mod = mod2, assayname = "tpm"), "matrix is not full rank") + expect_error(qSVA(rse_tx = rse_tx, type = "cell_component", mod = mod2, assayname = "tpm"), "matrix is not full rank") }) test_that("test that full rank matrix works correctly", { set.seed(20230621) # expect_warning(k_qsvs(rse_tx_low, mod, "tpm"), "Likely due to transcripts being not expressed in most samples") - expect_silent(k_qsvs(covComb_tx_deg, mod, "tpm")) + expect_silent(k_qsvs(rse_tx, mod, "tpm")) }) test_that("test that mod is a matrix", { set.seed(20230621) - expect_error(k_qsvs(covComb_tx_deg, mod = "mod", assayname = "tpm"), "'mod' must be a matrix.") + expect_error(k_qsvs(rse_tx, mod = "mod", assayname = "tpm"), "'mod' must be a matrix.") }) diff --git a/tests/testthat/test-qSVA.R b/tests/testthat/test-qSVA.R index 5a77ec8..31f549d 100644 --- a/tests/testthat/test-qSVA.R +++ b/tests/testthat/test-qSVA.R @@ -1,10 +1,10 @@ -mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, data = colData(covComb_tx_deg)) +mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN, data = colData(rse_tx)) ## Run qSVA with 2 types set.seed(20230621) -qsva_cc <- qSVA(rse_tx = covComb_tx_deg, type = "cell_component", mod = mod, assayname = "tpm") +qsva_cc <- qSVA(rse_tx = rse_tx, type = "cell_component", mod = mod, assayname = "tpm") set.seed(20230621) -qsva_standard <- qSVA(rse_tx = covComb_tx_deg, type = "standard", mod = mod, assayname = "tpm") +qsva_standard <- qSVA(rse_tx = rse_tx, type = "standard", mod = mod, assayname = "tpm") test_that("number of qsvs is k", { expect_equal(length(colnames(qsva_cc)), 10) @@ -12,7 +12,7 @@ test_that("number of qsvs is k", { }) test_that("length of qsv rownames is the same as number of samples", { - expect_equal(length(rownames(qsva_cc)), length(colnames(covComb_tx_deg))) + expect_equal(length(rownames(qsva_cc)), length(colnames(rse_tx))) }) test_that("output is a matrix", { @@ -25,7 +25,7 @@ test_that("output is an array", { # Test for assayname not in assayNames test_that("qSVA throws an error when assayname is not in assayNames", { - expect_error(qSVA(covComb_tx_deg, type = "standard", mod = mod,assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") + expect_error(qSVA(rse_tx, type = "standard", mod = mod,assayname = "not_in_assayNames"), "'not_in_assayNames' is not in assayNames\\(rse_tx\\).") }) # Test for input is an rse object diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 83cdeb7..2089436 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,26 +1,12 @@ # ENSEMBL names that are not GENCODE or ENSEMBL -# Test for check_tx_names transcripts throw errors if annotation is not GENCODE or ENSEMBL +# Test for which_tx_names transcripts throw errors if annotation is not GENCODE or ENSEMBL test_that("check_tx_names throw error if annotation is not GENCODE or ENSEMBL", { - # For testing the error condition, altered manually rownames of covComb_tx_deg - altered_covComb_tx_deg <- covComb_tx_deg - rownames(altered_covComb_tx_deg) <- paste0("gene", 1:length(rownames(altered_covComb_tx_deg))) + # For testing the error condition, altered manually rownames of rse_tx + altered_rse_tx <- rse_tx + rownames(altered_rse_tx) <- paste0("gene", 1:length(rownames(altered_rse_tx))) sig_transcripts <- select_transcripts("cell_component") - expect_error(check_tx_names(rownames(altered_covComb_tx_deg), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts'), "rownames\\(rse_tx\\)' must use either all GENCODE or all ENSEMBL transcript IDs") - expect_error(check_tx_names(sig_transcripts, rownames(altered_covComb_tx_deg), 'sig_transcripts', 'rownames(rse_tx)'), "rownames\\(rse_tx\\)' must use either all GENCODE or all ENSEMBL transcript IDs") -}) - -# Test for check_tx_names transcripts throw errors if annotation is not mixed GENCODE and ENSEMBL -test_that("Mixed row names annotations throw an error", { - # For testing the error condition, altered manually rownames of covComb_tx_deg - altered_covComb_tx_deg <- covComb_tx_deg - rownames(altered_covComb_tx_deg)[1] <- "ENST00000442987" # Change the first rowname - rownames(altered_covComb_tx_deg)[2] <- "ENST00000623083" # Change the second rowname - sig_transcripts <- select_transcripts("cell_component") - expect_error({ - # Check if mixed row names annotations (ENST and ENST.*?\.) throw an error - check_tx_names(rownames(altered_covComb_tx_deg), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts') - - }, "rownames\\(rse_tx\\)' must use either all GENCODE or all ENSEMBL transcript IDs") + expect_error(which_tx_names(rownames(altered_rse_tx), sig_transcripts)) + expect_error(which_tx_names(sig_transcripts, rownames(altered_rse_tx))) }) diff --git a/vignettes/Intro_qsvaR.Rmd b/vignettes/Intro_qsvaR.Rmd index a2fdc65..e5da174 100644 --- a/vignettes/Intro_qsvaR.Rmd +++ b/vignettes/Intro_qsvaR.Rmd @@ -144,6 +144,9 @@ rse_file <- BiocFileCache::bfcrpath( x = bfc ) load(rse_file, verbose = TRUE) +## keep only adult samples and apply minimum expression cutoff +rse_tx <- rse_tx[, rse_tx$Age > 17 ] +rse_tx <- rse_tx[rowMeans(assays(rse_tx)$tpm) > 0.3, ] ``` ## Get Degradation Matrix @@ -219,11 +222,8 @@ dim(qsvs_wrapper) Next we can use a standard limma package approach to do differential expression on the data. The key here is that we add our qsvs to the `model.matrix()`. Here we input our `RangedSummarizedExperiment` object and our `model.matrix()` with qSVs. Note here that the `RangedSummarizedExperiment` object is the original object loaded with the full list of transcripts, not the the one we subsetted for qSVs. This is because while PCs can be generated from a subset of genes, differential expression is best done on the full dataset. The expected output is a `sigTx` object that shows the results of differential expression. ```{r "perform DE"} -### should be done by cbinding mod to pcs -## subset to an expression cutoff -rse_tx <- rse_tx[rowData(rse_tx)$passExprsCut, ] -# create a model.matrix with demographic infor and qsvs +# create a model.matrix with demographic info and qsvs mod_qSVA <- cbind(mod, qsvs) # log tranform transcript expression