Merge pull request #45 from LieberInstitute/gdev

proposed v1.9.1 updates
LieberInstitute · Oct 9, 2024 · 7a3ec1b · 7a3ec1b
2 parents c8c8ab0 + fd68d34
commit 7a3ec1b
Show file tree

Hide file tree

Showing 38 changed files with 359 additions and 291 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,4 +4,10 @@
 ^README\.Rmd$
 ^\.github$
 ^data-raw$
+^dcs04_data$
+^dcs04_data/.*$
 ^codecov\.yml$
+^\.Rhistory$
+^\.Rdata$
+^\.httr-oauth$
+^\.DS_Store$
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,5 @@
 .Rdata
 .httr-oauth
 .DS_Store
+dcs04_data
 inst/doc
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: qsvaR
 Title: Generate Quality Surrogate Variable Analysis for Degradation Correction
-Version: 1.9.0
-Date: 2024-05-03
+Version: 1.9.1
+Date: 2024-09-16
 Authors@R: 
     c(
     person("Joshua", "Stolz", email = "jstolz80@gmail.com",
@@ -25,7 +25,7 @@ biocViews: Software, WorkflowStep, Normalization, BiologicalQuestion,
     DifferentialExpression, Sequencing, Coverage
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.0
+RoxygenNote: 7.3.2
 Suggests: 
     BiocFileCache,
     BiocStyle,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,13 +1,14 @@
 # Generated by roxygen2: do not edit by hand
 
 export(DEqual)
-export(check_tx_names)
 export(getDegTx)
 export(getPCs)
 export(get_qsvs)
 export(k_qsvs)
+export(normalize_tx_names)
 export(qSVA)
 export(select_transcripts)
+export(which_tx_names)
 import(SummarizedExperiment)
 import(ggplot2)
 import(rlang)

diff --git a/R/DEqual.R b/R/DEqual.R
@@ -8,11 +8,22 @@
 #' <https://doi.org/10.1016/j.neuron.2019.05.013>. This function compares your
 #' t-statistics of interest computed on transcripts against the
 #' t-statistics from degradation time adjusting for the six brain regions from
-#' degradation experiment data used for determining `covComb_tx_deg`.
+#' degradation experiment data used for determining `rse_tx`.
 #'
-#' @param DE a `data.frame()` with one column containing the t-statistics from
-#' Differential Expression, typically generated with `limma::topTable()`.
-#' The `rownames(DE)` should be transcript GENCODE IDs.
+#' @param DE a `data.frame()` with a column "t" containing the t-statistics
+#' from Differential Expression, typically generated with `limma::topTable()`.
+#' `rownames(DE)` must have transcript Ensembl/Gencode IDs.
+#' @param deg_tstats an optional `data.frame()` with a column "t" containing
+#' t-statistics resulted from a degradation experiment. Default is the
+#' internal `qsvaR::degradation_tstats` from the package authors.
+#' @param show.legend logical (default TRUE) to show legend in the plot
+#' @param show.cor specify where to show the correlation value. Can be one of
+#' "caption", "corner-top", "corner-bottom", or "none".
+#' @param font.size numeric value to set the base font size of the plot
+#' @param cor.size numeric (default font.size/2) to set the font size for the
+#' correlation text
+#' @param cor.label character (default "cor: ") to set the text preceding the
+#'  correlation value
 #'
 #' @return a `ggplot` object of the DE t-statistic vs
 #' the DE statistic from degradation
@@ -36,47 +47,71 @@
 #'
 #' ## Create the DEqual plot
 #' DEqual(random_de)
-DEqual <- function(DE) {
+DEqual <- function(DE, deg_tstats = qsvaR::degradation_tstats, show.legend = TRUE,
+                        show.cor = c('caption','corner-top','corner-bottom','none'),
+                        font.size = 12,  cor.size = font.size/2, cor.label = 'cor: ') {
     ## For R CMD check
     DE_t <- degradation_t <- NULL
-
+    show.cor=rlang::arg_match(show.cor)
     ## Check input
-    # stopifnot("t" %in% colnames(DE))
-    # stopifnot(!is.null(rownames(DE)))
-
-    # Check if input is a dataframe
-    if (!is.data.frame(DE)) { 
-      stop("The input to DEqual is not a dataframe.", call. = FALSE) 
+    if (!is.data.frame(DE)) {
+      stop("The input to DEqual is not a dataframe.", call. = FALSE)
       }
-
     # Check if 't' is in the column names of DE
     if (!("t" %in% colnames(DE))) {
         stop("'t' is not a column in 'DE'.", call. = FALSE)
       }
-
     # Check if DE has non-null row names
     if (is.null(rownames(DE))) {
         stop("Row names of 'DE' are NULL.", call. = FALSE)
-      }
-
+    }
+    if (!missing(deg_tstats)) {
+        if (!is.data.frame(deg_tstats)) {
+            stop("The 'deg_tstats' parameter is not a dataframe.", call. = FALSE)
+        }
+        if (!("t" %in% colnames(deg_tstats))) {
+            stop("'t' is not a column in 'deg_tstats'.", call. = FALSE)
+        }
+        if (is.null(rownames(deg_tstats))) {
+            stop("Row names of 'deg_tstats' are NULL.", call. = FALSE)
+        }
+        if (!all(grepl("^ENST\\d+", rownames(deg_tstats)))) {
+            stop("The row names of 'deg_tstats' must be ENSEMBL or Gencode IDs (ENST...)", call. = FALSE)
+        }
+    }
+
     ## Locate common transcripts
-    deg_tstats = qsvaR::degradation_tstats
-    rownames(deg_tstats) = check_tx_names(rownames(DE),rownames(qsvaR::degradation_tstats),'rownames(DE)','qsvaR::degradation_tstats')
-    common = intersect(rownames(deg_tstats), rownames(DE))
-
+    whichTx <- which_tx_names(rownames(DE),rownames(deg_tstats))
+    common = qsvaR::normalize_tx_names(rownames(DE)[whichTx])
     stopifnot(length(common) > 0)
-
+    rownames(deg_tstats) <- qsvaR::normalize_tx_names(rownames(deg_tstats))
     ## Create dataframe with common transcripts
     common_data <- data.frame(
-        degradation_t = deg_tstats$t[match(common, rownames(deg_tstats))],
-        DE_t = DE$t[match(common, rownames(DE))]
+        degradation_t = deg_tstats[common, "t"],
+        DE_t = DE[whichTx, "t"]
     )
+    cor_val <- signif(cor(common_data[, 1], common_data[, 2]), 2)
     p <- ggplot(common_data, aes(x = DE_t, y = degradation_t)) +
         xlab("DE t-statistic") +
         ylab("Degradation t-statistic") +
-        geom_bin2d(bins = 70) +
+        geom_bin2d(bins = 70, show.legend = show.legend) +
         scale_fill_continuous(type = "viridis") +
         theme_bw() +
-        labs(caption = paste0("correlation: ", signif(cor(common_data[, 1], common_data[, 2]), 2)))
+        theme(text = element_text(size = font.size))
+        # labs(caption = paste0("correlation: ", cor_val)
+    if (show.cor != 'none') {
+      switch(show.cor,
+        'caption' = {
+          p <- p + labs(caption = paste0(cor.label, cor_val))
+        },
+        'corner-top' = {
+          p <- p + annotate("text", x = max(common_data$DE_t), y = max(common_data$degradation_t),
+                            label = paste0(cor.label, cor_val), size = cor.size, hjust = 1, vjust = 1)
+        },
+        'corner-bottom' = {
+          p <- p + annotate("text", x = max(common_data$DE_t), y = min(common_data$degradation_t),
+                            label = paste0(cor.label, cor_val), size = cor.size, hjust = 1, vjust = 0)
+        })
+    }
     return(p)
 }
diff --git a/R/covComb_tx_deg-data.R b/R/covComb_tx_deg-data.R
diff --git a/R/degradation_tstats-data.R b/R/degradation_tstats-data.R
@@ -1,10 +1,10 @@
 #' Degradation time t-statistics
 #'
-#' These t-statistics are derived from the same data that was used for
-#' [covComb_tx_deg]. They are the results from main model where we determined
-#' the relationship with degradation time adjusting for the brain region (so
-#' parallel degradation effects across brain regions). They are used for
-#' plotting in `DEqual()`.
+#' These t-statistics are derived from the degradation timepoints data
+#' built into qsvaR. They are the results from multiple models where
+#' we determined the association of transcripts with degradation time
+#' adjusting for brain region (so parallel degradation effects across
+#' brain regions). They are used for plotting in `DEqual()`.
 #'
 #' @name degradation_tstats
 #' @docType data

diff --git a/R/getDegTx.R b/R/getDegTx.R
@@ -19,8 +19,9 @@
 #' These proportions were then added to our `model.matrix()` and the union of the top 1000 transcripts in the interaction model,
 #' the main effect model, and the cell proportions model were used to generate this model of qSVs.
 #'
-#' @param assayname character string specifying the name of the assay desired in rse_tx
 #' @param sig_transcripts A list of transcripts determined to have degradation signal in the qsva expanded paper.
+#' @param assayname character string specifying the name of the assay desired in rse_tx
+#' @param verbose specify if the function should report how many model transcripts were matched
 #'
 #' @return A
 #'  [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class]
@@ -31,28 +32,38 @@
 #' @import rlang
 #'
 #' @examples
-#' getDegTx(covComb_tx_deg)
-#' stopifnot(mean(rowMeans(assays(covComb_tx_deg)$tpm)) > 1)
-getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = select_transcripts(type), assayname = "tpm") {
-
-  type = arg_match(type)
-
+#' degTx <- getDegTx(rse_tx, "standard")
+getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"),
+                     sig_transcripts = NULL, assayname = "tpm", verbose = TRUE) {
+
+  #type = arg_match(type)
+  if (is.null(sig_transcripts)) {
+    type = arg_match(type)
+    sig_transcripts <- select_transcripts(type)
+  } else {
+    type = "custom"
+  }
   # Validate rse_tx is a RangedSummarizedExperiment object
   if (!is(rse_tx, "RangedSummarizedExperiment")) {
     stop("'rse_tx' must be a RangedSummarizedExperiment object.", call. = FALSE)
   }
-  
+
   # Check if assayname is in assayNames
   if (!assayname %in% assayNames(rse_tx)) {
     stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE)
   }
-
-  # Check for validity and matching of tx names
-  sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts')
-
-  # Subset rse_tx to include sig_transcripts
-  rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE]
-
+
+  # Check for validity and matching of tx names and return the tx subset indexes in rse_tx
+  wtx <- which_tx_names(rownames(rse_tx), sig_transcripts)
+  if (length(wtx) == 0) {
+    stop("No transcripts found in the '",type, "' degradation model transcripts" )
+  }
+
+  if (verbose) {
+      message("   '",type,"' degradation model transcripts found: ", length(wtx))
+  }
+  rse_tx <- rse_tx[wtx, , drop = FALSE]
+
   # Check if the row means is greater than 1
   if (mean(rowMeans(assays(rse_tx)[[assayname]])) < 1) {
     warning("The transcripts selected are lowly expressed in your dataset. This can impact downstream analysis.")

diff --git a/R/getPCs.R b/R/getPCs.R
@@ -11,7 +11,7 @@
 #' @importFrom stats prcomp
 #' @import SummarizedExperiment
 #' @examples
-#' getPCs(covComb_tx_deg, "tpm")
+#' getPCs(rse_tx, "tpm")
 getPCs <- function(rse_tx, assayname = "tpm") {
 
   # Validate rse_tx is a RangedSummarizedExperiment object

diff --git a/R/get_qsvs.R b/R/get_qsvs.R
@@ -12,7 +12,7 @@
 #' @export
 #'
 #' @examples
-#' qsv <- getPCs(covComb_tx_deg, "tpm")
+#' qsv <- getPCs(rse_tx, "tpm")
 #' get_qsvs(qsv, 2)
 get_qsvs <- function(qsvPCs, k) {
 

diff --git a/R/k_qsvs.R b/R/k_qsvs.R
@@ -14,18 +14,18 @@
 #' @import SummarizedExperiment
 #' @examples
 #' ## First we need to define a statistical model. We'll use the example
-#' ## covComb_tx_deg data. Note that the model you'll use in your own data
+#' ## rse_tx data. Note that the model you'll use in your own data
 #' ## might look different from this model.
 #' mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN,
-#'     data = colData(covComb_tx_deg)
+#'     data = colData(rse_tx)
 #' )
 #'
 #' ## To ensure that the results are reproducible, you will need to set a
 #' ## random seed with the set.seed() function. Internally, we are using
 #' ## sva::num.sv() which needs a random seed to ensure reproducibility of the
 #' ## results.
 #' set.seed(20230621)
-#' k_qsvs(covComb_tx_deg, mod, "tpm")
+#' k_qsvs(rse_tx, mod, "tpm")
 k_qsvs <- function(rse_tx, mod, assayname) {
 
   # Validate rse_tx is a RangedSummarizedExperiment object

diff --git a/R/qSVA.R b/R/qSVA.R
@@ -3,14 +3,14 @@
 #' @param rse_tx A [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class] object containing
 #' the transcript data desired to be studied.
 #' @param type a character string specifying which model you would
-#'  like to use when selecting a degradation matrix.
-#' @param sig_transcripts A list of transcripts that are associated with
-#' degradation signal. Use `select_transcripts()` to select sets of transcripts
-#' identified by the qSVA expanded paper. Specifying a `character()` input of
-#' ENSEMBL transcript IDs (or whatever values you have at `rownames(rse_tx)`)
-#' obtained outside of `select_transcripts()` overrides
-#' the user friendly `type` argument. That is, this argument provides more fine
-#' tuning options for advanced users.
+#'  like to use from the sets of signature transcripts identified
+#'  by the qsvaR package. This can be omitted if a custom set of
+#'  transcripts is provided to sig_transcripts.
+#' @param sig_transcripts A list of transcript IDs that are associated
+#'  with degradation signal. Specifying a `character()` input with ENSEMBL
+#' transcript IDs (whose values should match entries in `rownames(rse_tx)`).
+#' This argument provides a custom list of transcripts for adjusting
+#' for degradation; this should be used instead of the `type` argument.
 #' @param mod Model Matrix with necessary variables the you would
 #'  model for in differential expression
 #' @param assayname character string specifying the name of
@@ -21,43 +21,40 @@
 #'
 #' @examples
 #' ## First we need to define a statistical model. We'll use the example
-#' ## covComb_tx_deg data. Note that the model you'll use in your own data
+#' ## rse_tx data. Note that the model you'll use in your own data
 #' ## might look different from this model.
 #' mod <- model.matrix(~ mitoRate + Region + rRNA_rate + totalAssignedGene + RIN,
-#'     data = colData(covComb_tx_deg)
+#'     data = colData(rse_tx)
 #' )
 #'
 #' ## To ensure that the results are reproducible, you will need to set a
 #' ## random seed with the set.seed() function. Internally, we are using
 #' ## sva::num.sv() which needs a random seed to ensure reproducibility of the
 #' ## results.
 #' set.seed(20230621)
-#' qSVA(rse_tx = covComb_tx_deg, type = "cell_component", mod = mod, assayname = "tpm")
+#' qSVA(rse_tx = rse_tx, type = "cell_component", mod = mod, assayname = "tpm")
 #'
 qSVA <-
-    function(rse_tx,
-    type = c("cell_component", "standard", "top1500"),
-    sig_transcripts = select_transcripts(type),
-    mod,
-    assayname) {
-        ## We don't need to pass type to getDegTx() since it's not used internally
-        ## once the sig_transcripts have been defined.
-
-      type = arg_match(type)
-
+    function(rse_tx,  type = c("cell_component", "standard", "top1500"),
+                      sig_transcripts = NULL,    mod,   assayname) {
+
+      if (is.null(sig_transcripts)) {
+        type = arg_match(type) # must be one of those in the list if sig_transcripts is NULL
+      }
+
       # Validate rse_tx is a RangedSummarizedExperiment object
       if (!is(rse_tx, "RangedSummarizedExperiment")) {
         stop("'rse_tx' must be a RangedSummarizedExperiment object.", call. = FALSE)
       }
-      
+
       # Check if assayname is in assayNames
       if (!assayname %in% assayNames(rse_tx)) {
         stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE)
       }
-      
+
       # Get the qSVs
       DegTx <-
-            getDegTx(rse_tx, sig_transcripts = sig_transcripts, assayname = assayname)
+            getDegTx(rse_tx, type = type, sig_transcripts = sig_transcripts, assayname = assayname)
         PCs <- getPCs(DegTx, assayname)
         k <- k_qsvs(DegTx, mod = mod, assayname = assayname)
         qSV <- get_qsvs(PCs, k)