diff --git a/DESCRIPTION b/DESCRIPTION index d4c9df46..3d86a5d9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,13 +15,15 @@ Authors@R: c(person(given = "Laurent", family = "Gatto", email = "mail@sebastiangibb.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7406-4443")), - person(given = "Sigurdur", family = "Smarason", role = "ctb") + person(given = "Sigurdur", family = "Smarason", role = "ctb"), + person(given = "Thomas", family = "Naake", role = "ctb") ) Author: Laurent Gatto, Johannes Rainer and Sebastian Gibb. Maintainer: Laurent Gatto Depends: R (>= 3.6.0) Imports: + igraph, methods, S4Vectors Suggests: diff --git a/NAMESPACE b/NAMESPACE index b6ed691d..6283e17e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,12 +10,15 @@ export(coefMA) export(coefSG) export(coefWMA) export(common) +export(dotproduct) +export(graphPeaks) export(join) export(localMaxima) export(noise) export(ppm) export(rbindFill) export(refineCentroids) +export(shiftMatrix) export(smooth) export(valleys) export(vapply1c) @@ -24,6 +27,8 @@ export(vapply1l) importClassesFrom(S4Vectors,Rle) importFrom(S4Vectors,Rle) importFrom(S4Vectors,nrun) +importFrom(igraph,components) +importFrom(igraph,graph_from_adjacency_matrix) importFrom(methods,as) importFrom(methods,is) importFrom(stats,filter) diff --git a/NEWS.md b/NEWS.md index 5b1b34f5..9eab4988 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ - Add `asInteger` and `rbindFill`. - Add `asRle`, `asRleDataFrame` and `asVectorDataFrame`. +- Add `dotproduct` ## MsCoreUtils 0.0.1 diff --git a/R/dotproduct.R b/R/dotproduct.R new file mode 100644 index 00000000..d6e5a8b4 --- /dev/null +++ b/R/dotproduct.R @@ -0,0 +1,102 @@ +#' @title Calculate the normalized dot product +#' +#' @description +#' Calculate the normalized dot product (NDP).`dotproduct` returns a numeric +#' value ranging between 0 and 1, where 0 indicates no similarity between the +#' two MS/MS features, while 1 indicates that the MS/MS features are identical. +#' +#' @param +#' x `matrix` with two column where one contains m/z values (column `"mz"`) and +#' the second corresponding intensity values (column `"intensity"`) +#' +#' @param +#' y `matrix` with two column where one contains m/z values (column `"mz"`) and +#' the second corresponding intensity values (column `"intensity"`) +#' +#' @param m `numeric(1)`, exponent for peak intensity-based weights +#' +#' @param n `numeric(1)`, exponent for m/z-based weights +#' +#' @details +#' Each row in `x` corresponds to the respective row in `y`, i.e. the peaks +#' (entries `"mz"`) per spectrum have to match. +#' +#' `m` and `n` are weights given on the peak intensity and the m/z values +#' respectively. As default (`m = 0.5`), the square root of the intensity +#' values are taken to calculate weights. With increasing values for `m`, high +#' intensity values become more important for the similarity calculation, +#' i.e. the differences between intensities will be aggravated. +#' With increasing values for `n`, high m/z values will be taken more into +#' account for similarity calculation. Especially when working with small +#' molecules, a value `n > 0` can be set, to give a weight on the m/z values to +#' accommodate that shared fragments with higher m/z are less likely and will +#' mean that molecules might be more similar. If `n != 0`, a warning will be +#' raised if the corresponding m/z values are not identical, since small +#' differences in m/z values will distort the similarity values with increasing +#' `n`. If `m=0` or `n=0`, intensity values or m/z values, respectively, are not +#' taken into account. +#' +#' The normalized dot product is calculated according to: +#' \deqn{NDP = \frac{\sum(W_{S1, i} \cdot W_{S2, i}) ^ 2}{ \sum(W_{S1, i} ^ 2) * \sum(W_{S2, i} ^ 2) }}{\sum(W_{S1, i} \cdot W_{S2, i}) ^ 2 \sum(W_{S1, i} ^ 2) * \sum(W_{S2, i} ^ 2)}, +#' with \eqn{W = [ peak intensity] ^{m} \cdot [m/z]^n}. +#' For further information on normalized dot product see for example +#' Li et al. (2015). +#' Prior to calculating \deqn{W_{S1}} or \deqn{W_{S2}}, all intensity values +#' are divided by the maximum intensity value and multiplied by 100. +#' +#' @references +#' Li et al. (2015): Navigating natural variation in herbivory-induced +#' secondary metabolism in coyote tobacco populations using MS/MS structural +#' analysis. PNAS, E4147--E4155, DOI: 10.1073/pnas.1503106112. +#' +#' @return +#' `numeric(1)`, `dotproduct` returns a numeric similarity coefficient between +#' 0 and 1. +#' +#' @author Thomas Naake, \email{thomasnaake@@googlemail.com} +#' +#' @examples +#' x <- matrix(c(c(100.002, 100.001, NA, 300.01, 300.02, NA), +#' c(2, 1.5, 0, 1.2, 0.9, 0)), ncol = 2,) +#' y <- matrix(c(c(100.0, NA, 200.0, 300.002, 300.025, 300.0255), +#' c(2, 0, 3, 1, 4, 0.4)), ncol = 2) +#' colnames(x) <- colnames(y) <- c("mz", "intensity") +#' dotproduct(x, y, m = 0.5, n = 0) +#' +#' @export +dotproduct <- function(x, y, m = 0.5, n = 0) { + + ## check valid input + if (!is.matrix(x)) stop("'x' is not a matrix") + if (!is.matrix(y)) stop("'y' is not a matrix") + + if (nrow(x) != nrow(y)) stop("nrow(x) and nrow(y) are not identical") + if (!is.numeric(m) || length(m) != 1) + stop("`m` has to be a numeric of length 1.") + if (!is.numeric(n) || length(n) != 1) + stop("`n` has to be a numeric of length 1.") + + ## retrieve m/z and intensity from x and y + mz1 <- x[, "mz"] + mz2 <- y[, "mz"] + inten1 <- x[, "intensity"] + inten2 <- y[, "intensity"] + + ## check mz values: if mz1 and mz2 are not identical and the values are + ## weighted by n, this might to unexpected results in the similarity + ## calculation + if (n && any(mz1 != mz2, na.rm = TRUE)) + warning("m/z values in `x` and `y` are not identical. ", + "For n != 0 this might yield unexpected results.") + + inten1 <- inten1 / max(inten1, na.rm = TRUE) * 100 + inten2 <- inten2 / max(inten2, na.rm = TRUE) * 100 + + ws1 <- inten1 ^ m * mz1 ^ n + ws2 <- inten2 ^ m * mz2 ^ n + + ## calculate normalized dot product + dp <- sum(ws1 * ws2, na.rm = TRUE) + dp ^ 2 / (sum(ws1 ^ 2, na.rm = TRUE) * sum(ws2 ^ 2, na.rm = TRUE)) +} + diff --git a/R/graphPeaks.R b/R/graphPeaks.R new file mode 100644 index 00000000..581f9e9d --- /dev/null +++ b/R/graphPeaks.R @@ -0,0 +1,337 @@ +#' @importFrom igraph graph_from_adjacency_matrix components + +#' @name graphPeaks +#' +#' @title +#' Match two spectra using bipartite networks and combinatorics +#' +#' @description +#' Function to match spectrum objects using bipartite networks and combinations +## to match peaks. `graphPeaks` takes two objects, `x` and +#' `y` as input that contain spectral information. The matching +#' is a multi-step procedure: +#' 1) filtering based on `ppm`, +#' 2) retain order of matches between features of `x` and +#' `y` (remove crossing edges that violate the order of matching +#' m/z), +#' 3) calculate all combinations of the remaining possibilities. +#' +#' @param +#' x `matrix`, the first column (`"mz"`) contains m/z value and the +#' second column (`"intensity"`) contains the corresponding intensity values +#' +#' @param +#' y `matrix`, the first column (`"mz"`) contains m/z value and the +#' second column (`"intensity"`) contains the corresponding intensity values +#' +#' @param +#' ppm `numeric`, tolerance parameter in ppm to match corresponding peaks +#' +#' @param +#' FUN function to calculate similarity between spectra +#' +#' @param +#' ... additional parameters passed to `FUN` +#' +#' @details +#' Objective function is highest similarites between the two +#' spectral objects, i.e. `FUN` is calculated over all combinations and +#' the similarity of the combination that yields the highest similarity is +#' returned. +#' +#' @return +#' list with elements `x` and `y` each being a matrix with columns `"mz"` +#' and `"intensity"`. Each row (peak) in `x` matches the row (peak) in `y` +#' +#' @author +#' Thomas Naake \email{thomasnaake@@googlemail.com} +#' +#' @examples +#' x <- matrix(c(c(100.001, 100.002, 300.01, 300.02), +#' c(1, 1, 1, 1)), ncol = 2, nrow = 4, byrow = FALSE) +#' colnames(x) <- c("mz", "intensity") +#' +#' y <- matrix(c(c(100.0, 200.0, 300.002, 300.025, 300.0255), +#' c(1, 1, 1, 1, 1)), ncol = 2, nrow = 5, byrow = FALSE) +#' colnames(y) <- c("mz", "intensity") +#' graphPeaks(x = x, y = y, ppm = 20, FUN = dotproduct) +#' @export +graphPeaks <- function(x, y, ppm = 20, FUN = dotproduct, ...) { + + if (!is.matrix(x)) stop("x is not a matrix") + if (!is.matrix(y)) stop("y is not a matrix") + if (mode(x) != "numeric") stop("mode(x) is not 'numeric'") + if (mode(y) != "numeric") stop("mode(y) is not 'numeric'") + if (!all(colnames(x) %in% c("mz", "intensity"))) stop("colnames(x) are not 'mz' and 'intensity'") + if (!all(colnames(y) %in% c("mz", "intensity")))stop("colnames(y) are not 'mz' and 'intensity'") + if (!is.numeric(ppm)) stop("ppm is not numeric") + if (ppm < 0) stop("ppm has to be positive") + + + ## re-set colnames for x and y and order + rownames(x) <- paste("sp1_", 1:nrow(x),sep = "") + rownames(y) <- paste("sp2_", 1:nrow(y),sep = "") + if (nrow(x) > 1) x <- x[order(x[, 1]), ] + if (nrow(y) > 1) y <- y[order(y[, 1]), ] + + ## 1) create adjacency matrix and remove edges within x and + ## within y + w <- matrix(1, ncol = nrow(x) + nrow(y), + nrow = nrow(x) + nrow(y), + dimnames = list(c(rownames(x), rownames(y)), + c(rownames(x), rownames(y)) + )) + + ## 2) remove edges that are not in a certain range + ## calculate upper and lower m/z based on ppm parameter + ppm_1_1 <- x[, 1] / abs(ppm / 10 ^ 6 - 1) + ppm_1_2 <- x[, 1] / abs(ppm / 10 ^ 6 + 1) + ppm_2_1 <- y[, 1] / abs(ppm / 10 ^ 6 - 1) + ppm_2_2 <- y[, 1] / abs(ppm / 10 ^ 6 + 1) + + ## set names + names(ppm_1_1) <- rownames(x) + names(ppm_1_2) <- rownames(x) + names(ppm_2_1) <- rownames(y) + names(ppm_2_2) <- rownames(y) + + mat1 <- apply(as.matrix(ppm_1_2), 1, function(a) a <= c(ppm_1_1, ppm_2_1)) + mat2 <- apply(as.matrix(ppm_1_1), 1, function(a) a >= c(ppm_1_2, ppm_2_2)) + + link_ppm <- mat1 * mat2 + w[rownames(link_ppm), colnames(link_ppm)] <- link_ppm + w[colnames(link_ppm), rownames(link_ppm)] <- t(link_ppm) + w[rownames(x), rownames(x)] <- 0 + w[rownames(y), rownames(y)] <- 0 + + ## obtain network components from w + net <- graph_from_adjacency_matrix(w, weighted = NULL, mode = "undirected") + comp <- components(net) + + ## 3) get all possible combinations within one component + ## res will contain all combinations within one component + res <- vector("list", length(comp$csize)) + + ## write to res combinations where csize == 2 + inds_1 <- which(comp$csize == 1) + res[inds_1] <- lapply(inds_1, function(a) { + ms <- names(which(comp$membership == a)) + if (grepl(x = ms, pattern = "sp1")) { + ms <- c(ms, NA) + } else{ + ms <- c(NA, ms) + } + matrix(ms, ncol = 2) + }) + ## write to res combinations where csize == 2 + inds_2 <- which(comp$csize == 2) + res[inds_2] <- lapply(inds_2, function(a) matrix(names(which(comp$membership == a)), ncol = 2)) + + ## get combinations where csize > 2 + inds <- which(comp$csize > 2) + + for (i in inds) { + + ## separate component and create two matrices from x and + ## y that only contain component menbers + names_ind_i <- names(which(comp$membership == i)) + x_ind <- x[names_ind_i[names_ind_i %in% rownames(x)], ] + x_ind <- matrix(x_ind, ncol = 2, + dimnames = list(names_ind_i[names_ind_i %in% rownames(x)], c("", ""))) + y_ind <- y[names_ind_i[names_ind_i %in% rownames(y)], ] + y_ind <- matrix(y_ind, ncol = 2, + dimnames = list(names_ind_i[names_ind_i %in% rownames(y)], c("", ""))) + + ## allocate to c1 and c2 the names (colnames of x and y + ## that are in the specific component) + if (nrow(x_ind) < nrow(y_ind)) { + c1 <- rownames(x_ind); c2 <- rownames(y_ind) + c1 <- c(c1, rep("NA", length(c2)-length(c1))) + c1_c2 <- lapply(c1, function(a) expand.grid(a, c2)) + } else { + c1 <- rownames(y_ind); c2 <- rownames(x_ind) + c1 <- c(c1, rep("NA", length(c2)-length(c1))) + c1_c2 <- lapply(c2, function(a) expand.grid(a, c1)) + } + + c1_c2_paste <- lapply(c1_c2, function(a) apply(a, 1, + function(b) paste(b, collapse = " & "))) + + ## calculate all possible combinations + res_i <- as.matrix(expand.grid(c1_c2_paste, stringsAsFactors = FALSE)) + ## write rows to list + res_i <- split(res_i, row(res_i)) + ## strsplit " & ", unlist and write to matrix + res_i <- lapply(res_i, strsplit, split = " & ") + res_i <- lapply(res_i, unlist) + res_i <- matrix(unlist(res_i), nrow = length(res_i), byrow = TRUE) + + ## remove rows that contain duplicated values + res_i <- res_i[!apply(apply(res_i, 1, duplicated), 2, any), ] + + ## filtering for crossing matching: retain order of m/z + seqs <- seq(2, ncol(res_i), by = 2) + if (ncol(res_i) > 2) { + ## do if there is only a multiple mapping: + ## check order of sp2s, they have to ascend, remove those that + ## do not ascend + crosses <- lapply( + as.data.frame(t(res_i[, seqs]), stringsAsFactors = FALSE), + function(a) as.numeric(substr(a, 5, nchar(a)))) + ## check where all are TRUE, remove before NAs + crosses <- lapply(crosses, function(a) { + a <- a[!is.na(a)] + all(a == sort(a)) + }) + res_i <- matrix(res_i[unlist(crosses), ], ncol = ncol(res_i)) + + ## create shifted matrices, do not use last element since it contains + ## only NAs + shift_right <- lapply(seqs[-length(seqs)], + function(a) shiftMatrix(res_i, seqs, a / 2)) + shift_left <- lapply(seqs[-length(seqs)], + function(a) shiftMatrix(res_i, seqs, -a / 2)) + + ## write to matrix + mat_shift_left <- lapply(shift_left, function(a) { + res_i[, seqs] <- a + return(res_i) + }) + mat_shift_right <- lapply(shift_right, function(a) { + res_i[, seqs] <- a + return(res_i) + }) + + ## rbind the lists + mat_shift_left <- do.call(rbind, mat_shift_left) + mat_shift_right <- do.call(rbind, mat_shift_right) + + ## the ones that are shifted out, link to NA and bind to NA + ## for mat_shift_left (take from shift_right and inverse) + add_left <- shift_right[length(shift_right):1] + add_left <- lapply(add_left, function(x) x[, -1]) + + mat_add_left <- mat_add_right <- matrix("NA", + ncol = length(shift_left) * 2, nrow = nrow(mat_shift_left)) + mat_add_left[, seqs[-length(seqs)]] <- unlist(add_left) + + ## for mat_shift_right (take from shift_left and inverse) + add_right <- shift_left[length(shift_left):1] + add_right <- lapply(add_right, + function(x) matrix(x[, ncol(x):1], ncol = ncol(x))) + add_right <- lapply(add_right, function(x) x[, -1]) + mat_add_right[, seqs[-length(seqs)]] <- unlist(add_right) + + ## cbind with mat_add_left and mat_add_right + mat_shift_left <- cbind(mat_shift_left, mat_add_left) + mat_shift_right <- cbind(mat_shift_right, mat_add_right) + + ## add to res_i + res_i <- cbind(res_i, matrix("NA", nrow = nrow(res_i), ncol = ncol(mat_add_left))) + + ## assign to res + res[[i]] <- rbind(res_i, mat_shift_left, mat_shift_right) + } else { + res[[i]] <- res_i + } + } + + ## create combinations between rows + res_paste <- lapply(res, function(a) apply(a, 1, paste, collapse = " & ")) + + ## 4) calculate all possible combinations between the components + res_exp <- as.matrix(expand.grid(res_paste, stringsAsFactors = FALSE)) + ## write rows to list + res_exp <- split(res_exp, row(res_exp)) + ## strsplit " & ", unlist and write to matrix + res_exp <- lapply(res_exp, strsplit, split = " & ") + res_exp <- lapply(res_exp, unlist) + res_exp <- matrix(unlist(res_exp), nrow = length(res_exp), byrow = TRUE) + + ## 5) go through every row and calculate score: row with highest score is + ## the best match + sim <- apply(res_exp, 1, function(a) { + sp1_ind <- a[seq(1, ncol(res_exp), 2)] + sp2_ind <- a[seq(2, ncol(res_exp), 2)] + + ## remove elements when two "NA" are at the same position + ## (they were created artificially in the above step when pushing + ## sp1/sp2 out and cbinding the pushed out from the matrix) + ind_remove <- sp1_ind == "NA" & sp2_ind == "NA" + sp1_ind <- sp1_ind[!ind_remove] + sp2_ind <- sp2_ind[!ind_remove] + + ## create vectors that store mz and intensity of combination + mz1 <- rep(NA, length(sp1_ind)) + int1 <- numeric(length(sp1_ind)) + mz2 <- rep(NA, length(sp2_ind)) + int2 <- numeric(length(sp2_ind)) + + mz1[sp1_ind != "NA"] <- x[sp1_ind[sp1_ind != "NA"], "mz"] + mz2[sp2_ind != "NA"] <- y[sp2_ind[sp2_ind != "NA"], "mz"] + int1[sp1_ind != "NA"] <- x[sp1_ind[sp1_ind != "NA"], "intensity"] + int2[sp2_ind != "NA"] <- y[sp2_ind[sp2_ind != "NA"], "intensity"] + + ## create matrices of matched spectra + sp1 <- matrix(c(mz1, int1), ncol = 2) + sp2 <- matrix(c(mz2, int2), ncol = 2) + colnames(sp1) <- colnames(sp2) <- c("mz", "intensity") + + ## get similarity value from function FUN + value <- FUN(sp1, sp2, ...) + + list(value = value, x = sp1, y = sp2) + + }) + sim_value <- unlist(lapply(sim, "[[", "value")) + sim_max <- sim[[which.max(sim_value)]] + + # ## sort according to ascending order + x_max <- as.matrix(sim_max[["x"]]) + y_max <- as.matrix(sim_max[["y"]]) + + sort_x_y <- lapply(seq_len(nrow(x_max)), function(a) + paste(sort(c(x_max[a, "mz"], y_max[a, "mz"])), collapse = "_") + ) + sort_x_y <- order(unlist(sort_x_y)) + x_max <- x_max[sort_x_y, ] + y_max <- y_max[sort_x_y, ] + + list(x = x_max, y = y_max) + +} + +#' @name shiftMatrix +#' @title Shift columns of a matrix by n and set added columns to def +#' @description `shiftMatrix` shifts columns of a matrix by `n` and +#' sets the added columns to `def`. +#' @usage shiftMatrix(mat, x, n, def = NA) +#' @param mat `matrix` +#' @param x `numeric`, col indices to shift +#' @param n `numeric(1)`, gives the number by how many positions the columns +#' `x` of `mat` are shifted +#' @param def `character(1)`/`numeric(1)`, replacement value for added columns +#' @details helper function for `graphPeaks` +#' @return matrix with all combinations of shifted rows, only returns the +#' columns `x` of `mat` +#' @author Thomas Naake \email{thomasnaake@@googlemail.com} +#' @examples +#' mat <- matrix(letters[1:18], ncol = 6, nrow = 3) +#' x <- c(2, 4, 6) +#' shiftMatrix(mat = mat, x = x, n = 1, def = NA) +#' @export +shiftMatrix <- function(mat, x, n, def = NA){ + if (n == 0) { res <- mat } + if (n < 0) { + n <- abs(n) + res <- mat[, x[ seq_len(length(x) - n) + n ]] + res <- cbind(matrix(res, nrow = nrow(mat), byrow = FALSE), + matrix(def, nrow = nrow(mat), ncol = n)) + } else { + res <- mat[, x[ seq_len(length(x) - n) ]] + res <- cbind(matrix(def, nrow = nrow(mat), ncol = n), + matrix(res, nrow(mat), byrow = FALSE)) + } + return(res) +} + diff --git a/README.md b/README.md index e44d0098..0957b381 100644 --- a/README.md +++ b/README.md @@ -18,4 +18,5 @@ Currently we externalising core functions from `MSnbase`, `MALDIquant`, etc. # Contributions - Sigurdur Smarason (@SiggiSmara): weighted moving average (https://github.com/sgibb/MALDIquant/pull/54) +- Thomas Naake (@tnaake): dotproduct calculation (https://github.com/rformassspectrometry/MsCoreUtils/pull/17), graphPeaks function diff --git a/man/dotproduct.Rd b/man/dotproduct.Rd new file mode 100644 index 00000000..f94e6a96 --- /dev/null +++ b/man/dotproduct.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dotproduct.R +\name{dotproduct} +\alias{dotproduct} +\title{Calculate the normalized dot product} +\usage{ +dotproduct(x, y, m = 0.5, n = 0) +} +\arguments{ +\item{x}{\code{matrix} with two column where one contains m/z values (column \code{"mz"}) and +the second corresponding intensity values (column \code{"intensity"})} + +\item{y}{\code{matrix} with two column where one contains m/z values (column \code{"mz"}) and +the second corresponding intensity values (column \code{"intensity"})} + +\item{m}{\code{numeric(1)}, exponent for peak intensity-based weights} + +\item{n}{\code{numeric(1)}, exponent for m/z-based weights} +} +\value{ +\code{numeric(1)}, \code{dotproduct} returns a numeric similarity coefficient between +0 and 1. +} +\description{ +Calculate the normalized dot product (NDP).\code{dotproduct} returns a numeric +value ranging between 0 and 1, where 0 indicates no similarity between the +two MS/MS features, while 1 indicates that the MS/MS features are identical. +} +\details{ +Each row in \code{x} corresponds to the respective row in \code{y}, i.e. the peaks +(entries \code{"mz"}) per spectrum have to match. + +\code{m} and \code{n} are weights given on the peak intensity and the m/z values +respectively. As default (\code{m = 0.5}), the square root of the intensity +values are taken to calculate weights. With increasing values for \code{m}, high +intensity values become more important for the similarity calculation, +i.e. the differences between intensities will be aggravated. +With increasing values for \code{n}, high m/z values will be taken more into +account for similarity calculation. Especially when working with small +molecules, a value \code{n > 0} can be set, to give a weight on the m/z values to +accommodate that shared fragments with higher m/z are less likely and will +mean that molecules might be more similar. If \code{n != 0}, a warning will be +raised if the corresponding m/z values are not identical, since small +differences in m/z values will distort the similarity values with increasing +\code{n}. If \code{m=0} or \code{n=0}, intensity values or m/z values, respectively, are not +taken into account. + +The normalized dot product is calculated according to: +\deqn{NDP = \frac{\sum(W_{S1, i} \cdot W_{S2, i}) ^ 2}{ \sum(W_{S1, i} ^ 2) * \sum(W_{S2, i} ^ 2) }}{\sum(W_{S1, i} \cdot W_{S2, i}) ^ 2 \sum(W_{S1, i} ^ 2) * \sum(W_{S2, i} ^ 2)}, +with \eqn{W = [ peak intensity] ^{m} \cdot [m/z]^n}. +For further information on normalized dot product see for example +Li et al. (2015). +Prior to calculating \deqn{W_{S1}} or \deqn{W_{S2}}, all intensity values +are divided by the maximum intensity value and multiplied by 100. +} +\examples{ +x <- matrix(c(c(100.002, 100.001, NA, 300.01, 300.02, NA), + c(2, 1.5, 0, 1.2, 0.9, 0)), ncol = 2,) +y <- matrix(c(c(100.0, NA, 200.0, 300.002, 300.025, 300.0255), + c(2, 0, 3, 1, 4, 0.4)), ncol = 2) +colnames(x) <- colnames(y) <- c("mz", "intensity") +dotproduct(x, y, m = 0.5, n = 0) + +} +\references{ +Li et al. (2015): Navigating natural variation in herbivory-induced +secondary metabolism in coyote tobacco populations using MS/MS structural +analysis. PNAS, E4147--E4155, DOI: 10.1073/pnas.1503106112. +} +\author{ +Thomas Naake, \email{thomasnaake@googlemail.com} +} diff --git a/man/graphPeaks.Rd b/man/graphPeaks.Rd new file mode 100644 index 00000000..3e5e254b --- /dev/null +++ b/man/graphPeaks.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/graphPeaks.R +\name{graphPeaks} +\alias{graphPeaks} +\title{Match two spectra using bipartite networks and combinatorics} +\usage{ +graphPeaks(x, y, ppm = 20, FUN = dotproduct, ...) +} +\arguments{ +\item{x}{\code{matrix}, the first column (\code{"mz"}) contains m/z value and the +second column (\code{"intensity"}) contains the corresponding intensity values} + +\item{y}{\code{matrix}, the first column (\code{"mz"}) contains m/z value and the +second column (\code{"intensity"}) contains the corresponding intensity values} + +\item{ppm}{\code{numeric}, tolerance parameter in ppm to match corresponding peaks} + +\item{FUN}{function to calculate similarity between spectra} + +\item{...}{additional parameters passed to \code{FUN}} +} +\value{ +list with elements \code{x} and \code{y} each being a matrix with columns \code{"mz"} +and \code{"intensity"}. Each row (peak) in \code{x} matches the row (peak) in \code{y} +} +\description{ +Function to match spectrum objects using bipartite networks and combinations +\code{y} as input that contain spectral information. The matching +is a multi-step procedure: +\enumerate{ +\item filtering based on \code{ppm}, +\item retain order of matches between features of \code{x} and +\code{y} (remove crossing edges that violate the order of matching +m/z), +\item calculate all combinations of the remaining possibilities. +} +} +\details{ +Objective function is highest similarites between the two +spectral objects, i.e. \code{FUN} is calculated over all combinations and +the similarity of the combination that yields the highest similarity is +returned. +} +\examples{ +x <- matrix(c(c(100.001, 100.002, 300.01, 300.02), + c(1, 1, 1, 1)), ncol = 2, nrow = 4, byrow = FALSE) +colnames(x) <- c("mz", "intensity") + +y <- matrix(c(c(100.0, 200.0, 300.002, 300.025, 300.0255), + c(1, 1, 1, 1, 1)), ncol = 2, nrow = 5, byrow = FALSE) +colnames(y) <- c("mz", "intensity") +graphPeaks(x = x, y = y, ppm = 20, FUN = dotproduct) +} +\author{ +Thomas Naake \email{thomasnaake@googlemail.com} +} diff --git a/man/shiftMatrix.Rd b/man/shiftMatrix.Rd new file mode 100644 index 00000000..374f3a70 --- /dev/null +++ b/man/shiftMatrix.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/graphPeaks.R +\name{shiftMatrix} +\alias{shiftMatrix} +\title{Shift columns of a matrix by n and set added columns to def} +\usage{ +shiftMatrix(mat, x, n, def = NA) +} +\arguments{ +\item{mat}{\code{matrix}} + +\item{x}{\code{numeric}, col indices to shift} + +\item{n}{\code{numeric(1)}, gives the number by how many positions the columns +\code{x} of \code{mat} are shifted} + +\item{def}{\code{character(1)}/\code{numeric(1)}, replacement value for added columns} +} +\value{ +matrix with all combinations of shifted rows, only returns the +columns \code{x} of \code{mat} +} +\description{ +\code{shiftMatrix} shifts columns of a matrix by \code{n} and +sets the added columns to \code{def}. +} +\details{ +helper function for \code{graphPeaks} +} +\examples{ +mat <- matrix(letters[1:18], ncol = 6, nrow = 3) +x <- c(2, 4, 6) +shiftMatrix(mat = mat, x = x, n = 1, def = NA) +} +\author{ +Thomas Naake \email{thomasnaake@googlemail.com} +} diff --git a/tests/testthat/test_dotproduct.R b/tests/testthat/test_dotproduct.R new file mode 100644 index 00000000..d6761710 --- /dev/null +++ b/tests/testthat/test_dotproduct.R @@ -0,0 +1,75 @@ +test_that("dotproduct", { + + ## create two dummy spectra (aligned) + x <- matrix(c(c(100.002, 100.001, NA, 300.01, 300.02, NA), + c(2, 1.5, 0, 1.2, 0.9, 0)), ncol = 2) + y <- matrix(c(c(100.0, NA, 200.0, 300.002, 300.025, 300.0255), + intensity = c(2, 0, 3, 1, 4, 0.4)), ncol = 2) + colnames(x) <- colnames(y) <- c("mz", "intensity") + + ## calculate similarity values + expect_warning(dotproduct(x, y, m = 0, n = 2), "not identical") + expect_warning(dotproduct(x, y, n = 2), "not identical") + expect_equal(dotproduct(x, y, m = 0.5, n = 0), 0.4280249, tolerance = 1e-06) + expect_equal(dotproduct(x, y, m = 0, n = 0), 1) + expect_equal(dotproduct(x, y, n = 0), 0.4280249, tolerance = 1e-06) + + + ## if the identical spectra are passed, dotproduct has to return 1 + expect_equal(dotproduct(x, x, n = 2), 1) + expect_equal(dotproduct(y, y, n = 2), 1) + + ## check exceptions + expect_error(dotproduct(x[-1, ], y), "nrow[(]x[)] and nrow[(]y[)]") + expect_error(dotproduct(x, y[-1, ]), "nrow[(]x[)] and nrow[(]y[)]") + expect_error(dotproduct(x = x), "is missing, with no default") + expect_error(dotproduct(y = y), "is missing, with no default") + expect_error(dotproduct(x = x, y = "a"), "not a matrix") + expect_error(dotproduct(x = "a", y = y), "not a matrix") + expect_error(dotproduct(x, y, m = "a", n = 0), + "has to be a numeric of length 1") + expect_error(dotproduct(x, y, m = 0.5, n = "a"), + "has to be a numeric of length 1") + expect_error(dotproduct(x, y, m = 1:2, n = 0), + "has to be a numeric of length 1") + expect_error(dotproduct(x, y, m = 0.5, n = 1:2), + "has to be a numeric of length 1") + + ## check for columns 'mz' and 'intensity' + x <- matrix(c(1:3, 1:3), ncol = 2) + y <- matrix(c(1:3, 1:3), ncol = 2) + colnames(x) <- c("foo", "intensity") + colnames(y) <- c("mz", "intensity") + expect_error(dotproduct(x, y), "subscript out of bounds") + colnames(x) <- c("mz", "foo") + colnames(y) <- c("mz", "intensity") + expect_error(dotproduct(x, y), "subscript out of bounds") + colnames(x) <- c("mz", "intensity") + colnames(y) <- c("foo", "intensity") + expect_error(dotproduct(x, y), "subscript out of bounds") + colnames(x) <- c("mz", "intensity") + colnames(y) <- c("mz", "foo") + expect_error(dotproduct(x, y), "subscript out of bounds") + + ## test for matrix + x <- list(mz = 1:3, intensity = 1:3) + y <- matrix(c(1:3, 1:3), ncol = 2) + colnames(y) <- c("mz", "intensity") + expect_error(dotproduct(x, y), "is not a matrix") + + x <- matrix(c(1:3, 1:3), ncol = 2) + colnames(x) <- c("mz", "intensity") + y <- list(mz = 1:3, intensity = 1:3) + expect_error(dotproduct(x, y), "is not a matrix") + + ## test for mode 'numeric' + x <- matrix(c(c("a", "b", "c"), c(1:3)), ncol = 2) + y <- matrix(c(1:3, 1:3), ncol = 2) + colnames(x) <- colnames(y) <- c("mz", "intensity") + expect_error(dotproduct(x, y), "non-numeric argument to binary operator") + x <- matrix(c(1:3, 1:3), ncol = 2) + y <- matrix(c(c("a", "b", "c"), c(1:3)), ncol = 2) + colnames(x) <- colnames(y) <- c("mz", "intensity") + expect_error(dotproduct(x, y), "non-numeric argument to binary operator") + +}) \ No newline at end of file diff --git a/tests/testthat/test_graphPeaks.R b/tests/testthat/test_graphPeaks.R new file mode 100644 index 00000000..c482f50f --- /dev/null +++ b/tests/testthat/test_graphPeaks.R @@ -0,0 +1,85 @@ +## function shiftMatrix +library("testthat") + +test_that("shiftMatrix", { + + ## create test matrices + mat_l <- matrix(letters[1:18], ncol = 6, nrow = 3) + ## n: negative, p: positive + mat_n1 <- matrix(c("j", "p", NA, "k", "q", NA, "l", "r", NA), + ncol = 3, nrow = 3, byrow = TRUE) + mat_n2 <- matrix(c("p", NA, NA, "q", NA, NA, "r", NA, NA), + ncol = 3, nrow = 3, byrow = TRUE) + mat_p1 <- matrix(c(NA, "d", "j", NA, "e", "k", NA, "f", "l"), + ncol = 3, nrow = 3, byrow = TRUE) + mat_p2 <- matrix(c(NA, NA, "d", NA, NA, "e", NA, NA, "f"), + ncol = 3, nrow = 3, byrow = TRUE) + + expect_equal(shiftMatrix(mat_l, x = c(2, 4, 6), n = -1), mat_n1) + expect_equal(shiftMatrix(mat_l, x = c(2, 4, 6), n = -2), mat_n2) + expect_equal(shiftMatrix(mat_l, x = c(2, 4, 6), n = 1), mat_p1) + expect_equal(shiftMatrix(mat_l, x = c(2, 4, 6), n = 2), mat_p2) + expect_equal(shiftMatrix(mat_l, x = c(2, 4, 6), n = 0), mat_l[, c(2, 4, 6)]) + expect_error(shiftMatrix(x = c(2, 4, 6), n = 1, def = NA), + "is missing, with no default") + expect_error(shiftMatrix(mat = mat_l, n = 1, def = NA), + "is missing, with no default") + expect_error(shiftMatrix(mat = mat_l, x = c(2, 4, 6), def = NA), + "is missing, with no default") + expect_error(shiftMatrix(mat = mat_l, x = c(2, 4, 6, 8, 10), n = 1, def = NA), + "subscript out of bounds") +}) + + +## function graphPeaks +library("testthat") + +## create example x and y and perform tests +x <- matrix(c(c(100.001, 100.002, 300.01, 300.02), + c(1, 1, 1, 1)), ncol = 2, nrow = 4, byrow = FALSE) +colnames(x) <- c("mz", "intensity") + +y <- matrix(c(c(100.0, 200.0, 300.002, 300.025, 300.0255), + c(1, 1, 1, 1, 1)), ncol = 2, nrow = 5, byrow = FALSE) +colnames(y) <- c("mz", "intensity") + +## create matrices that contain the result +x_match <- matrix(c(c(100.002, 100.001, NA, 300.01, 300.02, NA), + c(1, 1, 0, 1, 1, 0)), ncol = 2, nrow = 6) +colnames(x_match) <- c("mz", "intensity") + +y_match <- matrix(c(c(100.0, NA, 200.0, 300.002, 300.025, 300.0255), + c(1, 0, 1, 1, 1, 1)), ncol = 2, nrow = 6) +colnames(y_match) <- c("mz", "intensity") + +test_that("graphPeaks", { + ## tests + expect_equal(graphPeaks(x = x, y = x), list(x = x, y = x), + tolerance = 1e-05) + expect_equal(graphPeaks(x = y, y = y), list(x = y, y = y), + tolerance = 1e-05) + expect_equal(graphPeaks(x = x, y = y, m = 0.5, n = 0)$x, x_match, tolerance = 1e-05) + expect_equal(graphPeaks(x = x, y = y, m = 0.5, n = 0)$y, y_match, tolerance = 1e-05) + expect_true(is.list(graphPeaks(x = x, y = y))) + expect_error(graphPeaks(x = x[1, ], y = y), "is not a matrix") + expect_error(graphPeaks(x = x, y = y[[1,]]), "subscript out of bounds") + expect_error(graphPeaks(x = x), "is missing, with no default") + expect_error(graphPeaks(y = y), "is missing, with no default") + expect_error(graphPeaks(x = x, y = y, FUN = max), "attempt to select less") + + ## ppm + expect_error(graphPeaks(x = x, y = y, ppm = "a"), "is not numeric") + expect_error(graphPeaks(x = x, y = y, ppm = -1), "has to be positive") + + ## test for mode + x_chr <- matrix(c("a", "b", "c", "d"), ncol = 2) + expect_error(graphPeaks(x = x_chr, y = y), "is not \'numeric\'") + expect_error(graphPeaks(x = x, y = x_chr), "is not \'numeric\'") + + ## test for colnames + x_colnames <- matrix(c(c(100.001, 100.002, 300.01, 300.02), + c(1, 1, 1, 1)), ncol = 2, nrow = 4, byrow = FALSE) + expect_error(graphPeaks(x = x_colnames, y = y), "subscript out of bounds") + expect_error(graphPeaks(x = y, y = x_colnames), "subscript out of bounds") + +})