diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index c6ac331..db5a06c 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,7 +1,32 @@ +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/ITS/projects/NERC_TRANSISTION/events/Clean Air Networks Conference/naei estimate.Rmd="1BB48545" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/_isolateContribution&breakPointAnalysis_KR_20230824.R="F18B98A3" C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/_projects/_paper_01_IntroToRespeciate/MS Access Versions/speciate_5.2_0/test.R="FCE2E494" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/_projects/marylebone02/_marylebone_metals_03.Rmd="63CDB89D" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/_projects/match01/_match_notes_01.Rmd="A446C96C" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/.Rbuildignore="BAFF788D" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/.Rproj.user/shared/notebooks/paths="DB89FCB7" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/.github/workflows/R-CMD-check.yml="33EF2D32" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/.gitignore="2CB231D9" C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/DESCRIPTION="8BA937B7" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/NAMESPACE="DBDDE80C" C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/NEWS.md="F6ED8BF4" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/respeciate-package.R="A43C9569" C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/respeciate.generics.R="54ECE8F1" -C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.001.R="0357CDCE" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.R="58F2A9BE" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.average.R="0A1E36E4" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.cluster.R="49C0F861" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.cor.R="3F1DA8E5" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.info.R="D0E7AC03" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.match.R="4326C1C3" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.pad.R="4962C940" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.pls.R="EACFE9A4" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.rescale.R="D668C5B2" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sp.reshape.R="CC655C60" C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/speciate.R="6C6E673A" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/spq.R="3AD9ACC8" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/spx.R="CA18044A" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/sysdata.R="82103C52" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/R/xxx.R="3415FF44" C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/README.Rmd="887EDA27" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/README.md="D46A00DB" +C:/Users/trakradmin/OneDrive - University of Leeds/Documents/pkg/respeciate/test/respeciate/man/spec.Rd="8472593F" diff --git a/DESCRIPTION b/DESCRIPTION index 66551aa..97131a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,14 +1,14 @@ Package: respeciate Title: Speciation profiles for gases and aerosols -Version: 0.2.1 -Date: 2023-06-21 -Description: Acess to the US.EPA Speciate (v5.2) tool, to generate speciation for gases and particles. - More details in Simon et al (2010) . +Version: 0.2.3 +Date: 2023-09-23 +Description: Acess to the US.EPA Speciate (v5.2) tool, to generate speciation profiles for + gases and particles. More details in Simon et al (2010) . Type: Package -Authors@R: c( person(given = "Sergio", family = "Ibarra-Espinosa", role = c("aut", "cre"), email = - "sergio.ibarra@usp.br", comment = c(ORCID = "0000-0002-3162-1905")), person(given = "Karl", - family = "Ropkins", role = c("aut"), email = "k.ropkins@its.leeds.ac.uk", comment = c(ORCID - = "0000-0002-0294-6997")) ) +Authors@R: c( person(given = "Sergio", family = "Ibarra-Espinosa", role = c("aut", "cre"), + email = "sergio.ibarra@usp.br", comment = c(ORCID = "0000-0002-3162-1905")), + person(given = "Karl", family = "Ropkins", role = c("aut"), email = + "k.ropkins@its.leeds.ac.uk", comment = c(ORCID = "0000-0002-0294-6997")) ) License: MIT + file LICENSE URL: https://github.com/atmoschem/respeciate BugReports: https://github.com/atmoschem/respeciate/issues @@ -16,3 +16,4 @@ LazyData: yes Depends: R (>= 3.5.0) RoxygenNote: 7.2.3 Encoding: UTF-8 +Imports: data.table diff --git a/NAMESPACE b/NAMESPACE index b640aa1..cb9c27a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,13 +2,80 @@ S3method(plot,respeciate) S3method(print,respeciate) -S3method(print,respeciate.ref) -S3method(print,respeciate.spcs) +S3method(summary,respeciate) export(find_code) +export(pls_fit_parent) +export(pls_plot) +export(pls_plot_profile) +export(pls_plot_species) +export(pls_refit_species) +export(pls_report) +export(sp_average_profile) +export(sp_dcast) +export(sp_dcast_profile) +export(sp_dcast_species) export(sp_find_profile) export(sp_find_species) +export(sp_info) +export(sp_match_profile) +export(sp_melt_wide) +export(sp_pad) +export(sp_pls_profile) export(sp_profile) +export(sp_profile_distance) +export(sp_profile_info) +export(sp_rescale) +export(sp_rescale_profile) +export(sp_rescale_species) +export(sp_species_cor) +export(sp_species_info) export(spec) +export(spq_gas) +export(spq_other) +export(spq_pm) +export(spq_pm.ae6) +export(spq_pm.ae8) +export(spq_pm.cr1) +export(spq_pm.simplified) +export(spx_btex) +export(spx_copy) +export(spx_n_alkane) +import(data.table) +importFrom(grDevices,as.graphicsAnnot) +importFrom(grDevices,cm.colors) +importFrom(grDevices,colorRampPalette) +importFrom(grDevices,dev.flush) +importFrom(grDevices,dev.hold) +importFrom(grDevices,heat.colors) +importFrom(graphics,abline) importFrom(graphics,axis) importFrom(graphics,barplot) +importFrom(graphics,grid) +importFrom(graphics,legend) +importFrom(graphics,lines) +importFrom(graphics,mtext) importFrom(graphics,par) +importFrom(graphics,plot.new) +importFrom(graphics,plot.window) +importFrom(graphics,points) +importFrom(graphics,polygon) +importFrom(graphics,rect) +importFrom(graphics,text) +importFrom(graphics,title) +importFrom(stats,AIC) +importFrom(stats,as.formula) +importFrom(stats,coefficients) +importFrom(stats,cophenetic) +importFrom(stats,cor) +importFrom(stats,cutree) +importFrom(stats,dist) +importFrom(stats,formula) +importFrom(stats,hclust) +importFrom(stats,heatmap) +importFrom(stats,lm) +importFrom(stats,nls) +importFrom(stats,nls.control) +importFrom(stats,predict) +importFrom(stats,sd) +importFrom(utils,head) +importFrom(utils,modifyList) diff --git a/NEWS.md b/NEWS.md index ae27469..91b8ee8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,26 +1,43 @@ -NEWS -=========== +# Version 0.0 - Release Notes -### respeciate 0.2.1 (Release date: 2023-06-21) +* [0.2.3] + * released 2023-09-26 + * added functions: sp_rescale_species, sp_dcast, sp_dcast_species + sp_pad, sp_pls_profile + * added draft spq_ quick profile build functions + * added pls_ functions for sp_pls_profile outputs + * sp_profile update: references not included by default + * sp_match: changed n to matches; added min.n (so more consistent + with other functions); now averages x + * sp_species_cor: added heatmap and key list options -- updated sysdata (now using SPECIATE 5.2) -- added sp\_find\_species -- extended sp\_find\_profile; now allows by='species_name' -- simplified plot.respeciate +* [0.2.2] + * released 2023-08-24 + * now imports data.table; moving to data.table methods for speed + * changed news format because build_news missing previous md text + * added functions: sp_rescale_profile, sp_dcast_profile, sp_species_cor, + sp_profile_dist, sp_match_profile, sp_melt_wide + * reduced object classes + +* [0.2.1] + * released 2023-06-21 + * updated sysdata (now using SPECIATE 5.2) + * added sp_find_species + * extended sp_find_profile; now searches by species_name + * simplified plot.respeciate +* [0.2.0] + * released 2021-05-26 + * Karl Ropkins joined + * moved your sysdata.rda to the package data folder + * reset lazy.data to TRUE + * added an object class + * added sp_find_profile (find_code but making object class) + * added sp_profile (spec but making object class) + * added crude print and plot methods for object classes + * updated date and version + * The new code is in R:speciate.0.2.r -### respeciate 0.2.0 (Release date: 2021-05-26) - -- Karl Ropkins joined -- moved your sysdata.rda to the package data folder -- reset lazy.data to TRUE -- added an object class -- added sp\_find\_profile (find\_code but making object class) -- added sp\_profile (spec but making object class) -- added crude print and plot methods for object classes -- updated date and version -- The new code is in R/speciate.0.2.r - -### respeciate 0.1.0 (Release date: 2020-12-20) - -- Create respeciate +* [0.1.0] + * released 2020-12-20 + * Created respeciate diff --git a/R/respeciate.generics.R b/R/respeciate.generics.R index c44a8d9..1f543be 100644 --- a/R/respeciate.generics.R +++ b/R/respeciate.generics.R @@ -1,75 +1,133 @@ #' @name respeciate.generics #' @title respeciate.generics -#' @importFrom graphics axis barplot par - -#' @description respeciate object classes and generic functions - -#reversed order of documentation, -#started with the find function... +#' @description \code{respeciate} object classes and generic functions. +######################## +#might move all the text to top +# hard to keep style consistent when docs are in between +# multiple functions #' @description When supplied a \code{respeciate} #' object, \code{\link{print}} manages its appearance. +#' @description When supplied a \code{respeciate} +#' object, \code{\link{plot}} provides a basic plot +#' output. This uses base function \code{\link{barplot}}; +#' also see note. #' @param x the \code{respeciate} #' object to be printed, plotted, etc. -#' @param n when plotting or printing a multi-profile, the -#' number(s) of profile(s) to report. +#' @param n when plotting or printing a multi-profile object, the +#' maximum number of profiles to report. (When plotting, \code{n} +#' is ignored if \code{id} is also set.) #' @param ... any extra arguments, mostly ignored except by #' \code{plot} which passes them to \code{\link{barplot}}. +#' @param object like \code{x} but for \code{summary}. +#' @param id numeric, indices of profiles to use when +#' plotting (\code{id=1:6} is equivalent to \code{n=6}). +#' @param order logical, order the species in the +#' profile(s) by relative abundance before plotting. +#' @note \code{respeciate} objects revert to +#' \code{data.frame}s when not doing anything +#' package-specific, so you can still +#' use as previously with \code{lattice} or +#' \code{ggplot2}, useful if you are pulling multiple +#' profiles and you exceed the base \code{\link{barplot}} +#' capacity... + + +#notes +################################## + +#loosing the respeciate.ref and respeciate.spcs classes +# testing merge them into respeciate + +# with different print outputs? +# only plot for class plot, etc??? + #' @rdname respeciate.generics #' @method print respeciate #' @export -print.respeciate <- - function(x, n=6, ...){ - #shifted from REF_Code to PROFILE_Code - # when allowing multipe profiles - # because REF_Code not unique - y <- unique(x$PROFILE_CODE) - report <- paste("respeciate profile(s): count ", - length(y), "\n", sep="") - if(length(y)==0){ - cat(paste(report, - "empty (or bad?) respeciate object\n", - sep="")) - return(invisible(x)) - } - yy <- if(length(y)>n) {y[1:n]} else {y} - for(i in yy){ - report <- paste(report, " ", i, " (checksum: ", - sum(as.numeric(as.character(x[x$PROFILE_CODE==i,]$WEIGHT_PERCENT)), - na.rm = T), ")\n", sep="") +print.respeciate <- function(x, n = NULL, ...){ + test <- rsp_test_respeciate(x, level = 2, silent = TRUE) + if(test == "respeciate.profile.ref"){ + if(is.null(n)){ + n <- 100 } - if(length(y)>n){ - report <- paste(report, " ... not showing last ", - length(y)-n, "\n", sep="") + return(rsp_print_respeciate_profile(x=x, n=n, ...)) + } + if(test == "respeciate.species.ref"){ + if(is.null(n)){ + n <- 10 } - cat(report, sep="") - invisible(x) + return(rsp_print_respeciate_species(x=x, n=n, ...)) + } + if(is.null(n)){ + n <- 10 } + rsp_print_respeciate(x=x, n=n, ...) +} + +## rsp_print functions unexported +## further down +## VVVVVVVVVVVV +## VVVVVVVVVVVV + + -#' @description When supplied a \code{respeciate} -#' object, \code{\link{plot}} provides a basic plot -#' output. This uses base function \code{\link{barplot}}; -#' also see note. -#' @note \code{respeciate} objects revert to -#' \code{data.frame}s when not doing anything -#' package-specific, so you can still -#' use as previously with \code{lattice} or -#' \code{ggplot2}, useful if you are pulling multiple -#' profiles and you exceed the base \code{\link{barplot}} -#' capacity... -#' @param id numeric, indices of profiles to use when -#' plotting (nb: \code{n=6} is equivalent to code{id=1:6}). -#' @param order logical, order the species in the -#' profile(s) by relative abundance before plotting. #' @rdname respeciate.generics #' @method plot respeciate #' @export +########################## +#notes +########################## +#like.... +#better handling of factor axis labels +#better handling of axes and legend font sizes +# (I think previous code may have handled this a little better) +# (but not perfectly...) +#like horiz=total scales to be other way around? +# could also mean rethinking the legend position for this? + +############################ +#added warning/handling for +# duplicate species in profiles (handling merge/mean) +# duplicated profile names (handling make unique) + +#test is now set up to use data.table + + + plot.respeciate <- function(x, n=NULL, id=NULL, order=TRUE, ...){ + #add .value if not there + ## don't think .value works + x <- rsp_tidy_profile(x) + + ##test object type + test <- rsp_test_respeciate(x, level=2, silent=TRUE) + if(test != "respeciate"){ + if(test %in% c("respeciate.profile.ref", "respeciate.species.ref")){ + stop("No plot method for respeciate.reference files.") + } else { + stop("suspect respeciate object!") + } + #don't stop - respeciate profile + } + + ##test something to plot + if(nrow(x)==0){ + ###################### + #think about this + ###################### + #maybe stop() instead??? + #stop("empty respeciate object?") + return(invisible(NULL)) + } + + #hold extra args + # passing to plot .xargs <- list(...) #test number of profiles @@ -91,14 +149,40 @@ plot.respeciate <- "\n", sep="")) } - x <- rsp_test(x) - if(any(x$COUNT>1)){ + x <- rsp_test_profile(x) + + + if(any(x$.n>1)){ + warning(paste("\n\t", + " found duplicate species in profiles (merged and averaged...)", + "\n", sep="")) + } + x$SPECIES_NAME <- rsp_tidy_species_name(x$SPECIES_NAME) + + #################################### + #issue profile names are not always unique + #################################### + test <- x + test$SPECIES_ID <- ".default" + test <- rsp_test_profile(test) + ################### + #rep_test + #can now replace this with data.table version + #BUT check naming conventions for .n + ################### + + #does this need a warning? + if(length(unique(test$PROFILE_NAME))100){ - report <- c(x$PROFILE_CODE[1:100], "...") - comment <- " profiles [showing first 100]\n" + + +# see below about alternative summary output +# including check sum? + +# maybe table +# profile (code); (name); type; n.species (count); checksum; comments +# just show some but send all + +# replacing previous + +summary.respeciate <- + function(object, ...){ + #v0.1 summary + #n <- object$PROFILE_TYPE + #n <- n[!duplicated(object$PROFILE_CODE)] + #summary(factor(n)) + + #v0.3 summary + + xx <- as.data.table(object) + + .xargs <- list(...) + .max.n <- if("max.n" %in% names(.xargs)){ + .xargs$max.n } else { - report <- x$PROFILE_CODE - comment <- " profiles\n" + 10 + } + .silent <- if("silent" %in% names(.xargs)){ + .xargs$silent + } else { + FALSE } - if(xx>0) cat(report, fill=wi) - cat(" > ", xx, comment, sep="") - invisible(x) - } + #check what we have + test <- c("WEIGHT_PERCENT","PROFILE_NAME","PROFILE_TYPE", + "SPECIES_ID") + test <- test[ test %in% colnames(xx)] + if(!"PROFILE_CODE" %in% colnames(xx)){ + xx$PROFILE_CODE <- "{{ICK}}" + } -#' @rdname respeciate.generics -#' @method print respeciate.spcs -#' @export -print.respeciate.spcs <- - function(x, ...){ - xx <- nrow(x) - wi <- getOption("width") - #################################### - #use of cat might need rethinking? - #################################### - cat("respeciate species\n") - if(xx>10){ - report <- c(x$SPECIES_NAME[1:10], "...") - comment <- " species [showing first 10]\n" - } else { - report <- x$SPECIES_NAME - comment <- " species\n" + out <- xx[, + .(#SPECIES_NAME = SPECIES_NAME[1], + #SPEC_MW = SPEC_MW[1], + .checksum = if("WEIGHT_PERCENT" %in% names(xx)){ + sum(WEIGHT_PERCENT, na.rm = TRUE) + } else { + NA + }, + .checkname = if("PROFILE_NAME" %in% names(xx)){ + length(unique(PROFILE_NAME))} + else { + NA + }, + .name = if("PROFILE_NAME" %in% names(xx)){ + PROFILE_NAME[1] + } else { + NA + }, + .type = if("PROFILE_TYPE" %in% names(xx)){ + PROFILE_TYPE[1] + } else { + NA + }, + .nspecies = if("SPECIES_ID" %in% names(xx)){ + length(unique(SPECIES_ID)) + } else { + NA + } + ), + by=.(PROFILE_CODE)] + + #out <- merge(xx, out, by="PROFILE_CODE", all.x=TRUE, all.y=FALSE, + # allow.cartesian=TRUE) + + out$PROFILE_CODE[out$PROFILE_CODE=="{{ICK}}"] <- NA + out <- as.data.frame(out) + if(!.silent){ + if(nrow(out) > .max.n){ + print(head(out[c(1,2,5,6)], n = .max.n)) + cat(" [forestortened - showing ", .max.n, " of ", nrow(out), "]", + sep="") + } else { + print(out[c(1,2,5,6)]) + } } - if(xx>0) cat(report, fill=wi) - cat(" > ", xx, comment, sep="") - invisible(x) + invisible(out) + + + + } + + ################################# #unexported code ################################# + +# like to do something like this for summary +# but code very messy +# AND run time for 100+ profiles is too slow... + +# try with data.table... + + +#rsp_summary_v3 <- function(object, ...){ + +# xx <- as.data.table(object) + +# .xargs <- list(...) +# .max.n <- if("max.n" %in% names(.xargs)){ +# .xargs$max.n +# } else { +# 10 +# } +# .silent <- if("silent" %in% names(.xargs)){ +# .xargs$silent +# } else { +# FALSE +# } + + + #check what we have +# test <- c("WEIGHT_PERCENT","PROFILE_NAME","PROFILE_TYPE", +# "SPECIES_ID") +# test <- test[ test %in% colnames(xx)] +# if(!"PROFILE_CODE" %in% colnames(xx)){ +# xx$PROFILE_CODE <- "{{ICK}}" +# } + +# out <- xx[, +# .(#SPECIES_NAME = SPECIES_NAME[1], +# #SPEC_MW = SPEC_MW[1], +# .checksum = if("WEIGHT_PERCENT" %in% names(xx)){ +# sum(WEIGHT_PERCENT, na.rm = TRUE) +# } else { +# NA +# }, +# .checkname = if("PROFILE_NAME" %in% names(xx)){ +# length(unique(PROFILE_NAME))} +# else { +# NA +# }, +# .name = if("PROFILE_NAME" %in% names(xx)){ +# PROFILE_NAME[1] +# } else { +# NA +# }, +# .type = if("PROFILE_TYPE" %in% names(xx)){ +# PROFILE_TYPE[1] +# } else { +# NA +# }, +# .nspecies = if("SPECIES_ID" %in% names(xx)){ +# length(unique(SPECIES_ID)) +# } else { +# NA +# } +# ), +# by=.(PROFILE_CODE)] + + #out <- merge(xx, out, by="PROFILE_CODE", all.x=TRUE, all.y=FALSE, + # allow.cartesian=TRUE) + +# out$PROFILE_CODE[out$PROFILE_CODE=="{{ICK}}"] <- NA +# out <- as.data.frame(out) +# if(!.silent){ +# if(nrow(out) > .max.n){ +# print(head(out[c(1,2,5,6)], n = .max.n)) +# cat(" [forestortened - showing ", .max.n, " of ", nrow(out), "]", +# sep="") +# } else { +# print(out[c(1,2,5,6)]) +# } +# } +# invisible(out) + +#} + +#rsp_summary_v2 <- +# function(object, ...){ +# #v0.2 summary +# if(!"PROFILE_CODE" %in% names(object)){ +# object$PROFILE_CODE <- "{{NA}}" +# } +# ref <- unique(object$PROFILE_CODE) +# if(length(ref)>10){ +# ref <- ref[1:100] +# } +# .out <- lapply(ref, function(x){ +# .tmp <- subset(object, PROFILE_CODE==x) +# .x <- if(x=="{{NA}}") {NA} else {x} +# .pt <- if("PROFILE_TYPE" %in% names(.tmp)){ +# .tmp$PROFILE_TYPE[1] +# } else { +# NA +# } +# .pn <- if("PROFILE_NAME" %in% names(.tmp)){ +# .tmp$PROFILE_NAME[1] +# } else { +# NA +# } +# .ns <- if("SPECIES_ID" %in% names(.tmp)){ +# length(unique(.tmp$SPECIES_ID)) +# } else { +# NA +# } +# .cs <- if("WEIGHT_PERCENT" %in% names(.tmp)){ +# sum(.tmp$WEIGHT_PERCENT, na.rm=TRUE) +# } else { +# NA +# } +# data.frame(profile=.x, +# type=.pt, +# name=.pn, +# n.species=.ns, +# checksum=.cs) +# }) +# .out <- do.call(rbind, .out) +# print(.out[c("profile", "type", "n.species", "checksum")], max=40) +# return(invisible(.out)) +# } + + + + ################################### #class builds ################################### -rsp_build_respeciate.spcs <- - function(x, ...){ +#rsp_build_respeciate.spcs <- +# function(x, ...){ #build - class(x) <- c("respeciate.spcs", "data.frame") - x - } - -rsp_build_respeciate.ref <- - function(x, ...){ + #add .value +# x <- rsp_tidy_profile(x) +# class(x) <- c("respeciate.spcs", "data.frame") +# x +# } + +#rsp_build_respeciate.ref <- +# function(x, ...){ #build - class(x) <- c("respeciate.ref", "data.frame") - x - } +# class(x) <- c("respeciate.ref", "data.frame") +# x +# } rsp_build_respeciate <- function(x, ...){ @@ -250,50 +517,154 @@ rsp_build_respeciate <- x } + ########################### -#tidy names +#split respeciate by profile ########################### #currently not exported -#quick code to tidy species names +#quick code assumed CODE is unique to profile -#note: not fully tested +#need to test this -rsp_tidy_species_name <- function(x){ +#not sure we are using this any more +# i think rsp_test, then rsp_test.2 replaced +# and code in plot.respeciate.old ??? - #attempts shorten names by remove other versions - #names seem to be in format a (or b || c) - #where (guessing) a is main name and - # b and c are alternatives. +rsp_split_profile <- function(x){ + ref <- unique(x$PROFILE_CODE) + lapply(ref, function(y) x[x$PROFILE_CODE==y,]) +} - #not fully tested, - # might still be more cases this dies on - #gsub("[(].*","", x) failed if name a includes brackets - #example:#"(2-methylpropyl)benzene (or isobutylbenzene)" +######################## +#unexported rsp_print +######################## - #sub("[(][^(]or+$", "", x) failed if b or c includes brackets - #also left space at end so needed sub("^\\s+|\\s+$", "", x) +## like to tidy this/these + +# respeciate profile(s) +# [profile_code] [check sum] [profile_name] < width limited +# ... showing n + +# added profile_name to output + +# could move check sum to summary and +# replace with species count??? + +# doing this... previous +###.msg <- paste(" ", i, " (checksum: ", +##round(sum(as.numeric(as.character(x[x$PROFILE_CODE==i,]$WEIGHT_PERCENT)), +## na.rm = T), digits=2), +##") ", i2, "\n", sep="") + +# could make other respeciate print outputs +# look like this? + +rsp_print_respeciate <- + function(x, n=6, ...){ + #profile_code is (I think) only term unique to a profile + y <- unique(x$PROFILE_CODE) + report <- paste("respeciate profile(s): count ", + length(y), "\n", sep="") + if(length(y)==0){ + cat(paste(report, + "empty (or bad?) respeciate object\n", + sep="")) + return(invisible(x)) + } + .tmp <- getOption("width") + yy <- if(length(y)>n) {y[1:n]} else {y} + for(i in yy){ + if("PROFILE_NAME" %in% names(x)){ + i2 <- x$PROFILE_NAME[x$PROFILE_CODE==i][1] + } else { + i2 <- "[unknown]" + } + if("SPECIES_ID" %in% names(x)){ + .spe <- length(unique(x$SPECIES_ID[x$PROFILE_CODE==i])) + } else { + .spe <- "0!" + } + .msg <- paste(" ", i, " (", .spe, " species) ", + i2, "\n", sep="") + if(nchar(.msg)>.tmp){ + .msg <- paste(substring(.msg, 1, .tmp-3), "...\n") + } + report <- paste(report, .msg, sep="") + } + if(length(y)>n){ + report <- paste(report, " ... not showing last ", + length(y)-n, "\n", sep="") + } + cat(report, sep="") + invisible(x) + } + + +## #' @description When supplied a \code{respeciate.ref} +# #' object, \code{\link{print}} manages its appearance. +# #' @param x the \code{respeciate} or \code{respeciate.ref} +# #' object to be printed, plotted, etc. +# #' @rdname respeciate.generics +# #' @method print respeciate.ref +# #' @export +rsp_print_respeciate_profile <- + function(x, n = 100, ...){ + xx <- nrow(x) + wi <- getOption("width") + #################################### + #use of cat might need rethinking? + #################################### + cat("respeciate profile reference\n") + if(n>xx){ + n <- xx + } + if(xx>n){ + report <- c(x$PROFILE_CODE[1:n], "...") + comment <- paste(" profiles [showing first ", n, + "]\n", sep = "") + } else { + report <- x$PROFILE_CODE + comment <- " profiles\n" + } + if(xx>0) cat(report, fill=wi) + cat(" > ", xx, comment, sep="") + invisible(x) + } + + +# #' @rdname respeciate.generics +# #' @method print respeciate.spcs +# #' @export +rsp_print_respeciate_species <- + function(x, n = 10, ...){ + xx <- nrow(x) + wi <- getOption("width") + #################################### + #use of cat might need rethinking? + #################################### + cat("respeciate species reference\n") + if(n>xx){ + n <- xx + } + if(xx>n){ + report <- c(x$SPECIES_NAME[1:n], "...") + comment <- paste(" species [showing first ", n, + "]\n", sep="") + } else { + report <- x$SPECIES_NAME + comment <- " species\n" + } + if(xx>0) cat(report, fill=wi) + cat(" > ", xx, comment, sep="") + invisible(x) + } - #sometimes it is "( or " - x <- gsub(" [(] or ", " (or ", x) - #next removes from last "(or" onwards - x <- gsub("[(]or .*","", x) - sub("^\\s+|\\s+$", "", x) -} -########################### -#split respeciate by profile -########################### -#currently not exported -#quick code assumed CODE is unique to profile -rsp_split_profile <- function(x){ - ref <- unique(x$PROFILE_CODE) - lapply(ref, function(y) x[x$PROFILE_CODE==y,]) -} @@ -427,26 +798,6 @@ plot.respeciate.old <- } } -rsp_test <- function(x){ - .prf <- unique(x$PROFILE_CODE) - ans <- lapply(.prf, function(y){ - temp <- subset(x, PROFILE_CODE==y) - .spc <- unique(temp$SPECIES_ID) - ans <- lapply(.spc, function(z){ - temp2 <- subset(temp, SPECIES_ID==z) - data.frame(PROFILE_CODE = y, - PROFILE_NAME = temp2$PROFILE_NAME[1], - SPECIES_ID = z, - SPECIES_NAME = temp2$SPECIES_NAME[1], - COUNT = length(temp2$WEIGHT_PERCENT[!is.na(temp2$WEIGHT_PERCENT)]), - TOTAL = sum(temp2$WEIGHT_PERCENT[!is.na(temp2$WEIGHT_PERCENT)]), - SPEC_MW = temp2$SPEC_MW[1], - WEIGHT_PERCENT=mean(temp2$WEIGHT_PERCENT, na.rm=TRUE)) - }) - do.call(rbind, ans) - }) - do.call(rbind, ans) -} diff --git a/R/sp.001.R b/R/sp.001.R deleted file mode 100644 index a57da41..0000000 --- a/R/sp.001.R +++ /dev/null @@ -1,178 +0,0 @@ -#' @name sp.001 -#' @title sp_ functions -#' @aliases sp_find_profile sp_find_species sp_profile - -utils::globalVariables(c("SPECIES_ID")) - -#' @description sp functions for use with (re)speciate data in R... - -#reversed order of documentation, -#started with the find function... - -#' @description \code{sp_find} functions search main data sets -#' in the (re)SPECIATE archive using supplied search terms. -#' \code{\link{sp_find_profile}} searches for profile records and -#' \code{\link{sp_find_species}} searches for species records. -#' @param ... for \code{sp_find} functions, character(s), any -#' search term(s) to use when searching the local (re)SPECIATE archive for -#' relevant records. -#' @param by character, the section of the archive to -#' search, by default \code{'keywords'} for \code{\link{sp_find_profile}} and -#' \code{'species_names'} for \code{\link{sp_find_species}}. -#' @param partial logical, if \code{TRUE} (default) -#' \code{sp_find} functions use partial matching. -#' @return \code{sp_find_profile} returns a object of -#' \code{respeciate.ref} class, a \code{data.frame} of -#' profile information. -#' @examples \dontrun{ -#' profile <- "Ethanol" -#' dt <- sp_find_profile(profile) -#' dt} -#' - -#' @rdname sp.001 -#' @export - -sp_find_profile <- function(..., by = "keywords", partial = TRUE) { - #extract profile info from archive - out <- sysdata$PROFILES - terms <- c(...) - ################################### - #ignoring case because missing loads... - # might want to think about spaces as well??? - #currently same in sp_find_species - # should think about using common code??? - # also error messaging if by is not known??? - ################################### - #how to handle searching a profile - #that contains a specific species - # this is all the profiles that have an entry for species_id 529 - # sysdata$SPECIES$PROFILE_CODE[sysdata$SPECIES$SPECIES_ID==529] - # could use sp_find_species to get species info, then - ################################### - if(tolower(by) %in% c("species_name")){ - ############################ - #special case - #search by species_name - # could add species_id, cas, etc? - ############################### - species <- sysdata$SPECIES - ref <- out$PROFILE_CODE - for(ti in terms){ - ans <- sp_find_species(ti, by=by, partial=partial) - terms <- species$PROFILE_CODE[species$SPECIES_ID %in% ans$SPECIES_ID] - ref <- ref[ref %in% terms] - } - out <- out[out$PROFILE_CODE %in% ref,] - } else { - for(ti in terms){ - ref <- out[,tolower(names(out))==by] - if(nrow(out)>0){ - out <- if(partial){ - out <- out[grep(ti, ref, ignore.case = TRUE),] - } else { - out <- out[tolower(ti)==tolower(ref),] - } - } - } - } - out <- rsp_build_respeciate.ref(out) - return(out) -} - -#' @return \code{sp_find_species} returns a object of -#' \code{respeciate.spcs} class, a \code{data.frame} of -#' species information. -#' @examples \dontrun{ -#' species <- "Ethanol" -#' sp <- sp_find_species(species) -#' sp} -#' - -#' @rdname sp.001 -#' @export - -sp_find_species <- function(..., by = "species_name", partial = TRUE) { - #extract species info from archive - out <- sysdata$SPECIES_PROPERTIES - terms <- c(...) - for(ti in terms){ - ref <- out[,tolower(names(out))==by] - if(nrow(out)>0){ - out <- if(partial){ - out <- out[grep(ti, ref, ignore.case = TRUE),] - } else { - out <- out[tolower(ti)==tolower(ref),] - } - } - } - #out <- PROFILES[grep(term, PROFILES[[by]], ignore.case = TRUE), ] - out <- rsp_build_respeciate.spcs(out) - return(out) -} - - - - - - -#' @description \code{\link{sp_profile}} extracts a -#' SPECIATE profile from the local (re)SPECIATE archive. -#' @param code character or numeric, the SPECIATE code -#' of the required profile (EPA SPECIATE term PROFILE_CODE). -#' @return \code{sp_profile} returns a object of -#' \code{respeciate} class, a \code{data.frame} containing a -#' speciate profile. -#' @references -#' Simon, H., Beck, L., Bhave, P.V., Divita, F., Hsu, Y., Luecken, D., -#' Mobley, J.D., Pouliot, G.A., Reff, A., Sarwar, G. and Strum, M., 2010. -#' The development and uses of EPA SPECIATE database. -#' Atmospheric Pollution Research, 1(4), pp.196-206. -#' @examples \dontrun{ -#' x <- sp_profile(c(8833, 8850)) -#' plot(x)} - -#NOTE - -#get_profile allows you to get multiple profiles -#not sure this is staying - - - -#' @rdname sp.001 -#' @export - -sp_profile <- function(code) { - #handle numerics/characters - ####################### - #could replace code with ...??? - ###################### - if(is.numeric(code)) code <- as.character(code) - if(!is.character(code)) stop("unexpected code class") - - PROFILES <- sysdata$PROFILES - SPECIES <- sysdata$SPECIES - SPECIES_PROPERTIES <- sysdata$SPECIES_PROPERTIES - PROFILE_REFERENCE <- sysdata$PROFILE_REFERENCE - REFERENCES <- sysdata$REFERENCES - - #handle multiple codes - ############################ - #go direct with %in% ??? - #text as sp_profile2 ??? - ############################ - df <- lapply(code, function(x){ - df <- PROFILES[PROFILES$PROFILE_CODE == x, ] - df <- merge(df, SPECIES, by = "PROFILE_CODE") - df <- merge(df, SPECIES_PROPERTIES, by = "SPECIES_ID") - df <- merge(df, PROFILE_REFERENCE, by = "PROFILE_CODE") - df <- merge(df, REFERENCES, by = "REF_Code") - df - }) - #build - df <- do.call(rbind, df) - df <- rsp_build_respeciate(df) - return(df) -} - - diff --git a/R/sp.R b/R/sp.R new file mode 100644 index 0000000..d1f1c8b --- /dev/null +++ b/R/sp.R @@ -0,0 +1,191 @@ +#' @name sp +#' @title sp_ functions +#' @aliases sp_profile + + +#' @description sp function to get profiles from the R (re)SPECIATE archive + +#' @description \code{\link{sp_profile}} extracts a +#' SPECIATE profile from the local (re)SPECIATE archive. +#' @param code character, numeric or data.frame, the SPECIATE code +#' of the required profile (EPA SPECIATE identifier PROFILE_CODE). This is +#' typically one or concatenated character or numeric entries, but can also +#' be a \code{respeciate} object or similar \code{data.frame} containing +#' the \code{code}s as a named \code{PROFILE_NAME} column. +#' @param include.refs logical, include profile reference information when +#' getting the requested profile(s) from the archive, default \code{FALSE}. +#' @return \code{sp_profile} returns a object of +#' \code{respeciate} class, a \code{data.frame} containing a +#' (re)SPECIATE profile. +#' @note The option \code{include.refs} adds profile source reference +#' information to the returned \code{respeciate} data set. The default option +#' is to not include these because some profiles have several associated +#' references and including these replicates records, once per reference. +#' \code{respeciate} code is written to handle this but if you are developing +#' own methods or code and include references in any profile build you may be +#' biasing some analyses in favor of those multiple-reference profile unless +#' you check and account such cases. +#' @references +#' Simon, H., Beck, L., Bhave, P.V., Divita, F., Hsu, Y., Luecken, D., +#' Mobley, J.D., Pouliot, G.A., Reff, A., Sarwar, G. and Strum, M., 2010. +#' The development and uses of EPA SPECIATE database. +#' Atmospheric Pollution Research, 1(4), pp.196-206. +#' @examples \dontrun{ +#' x <- sp_profile(c(8833, 8850)) +#' plot(x)} + +#NOTES +####################### + +#to think about +####################### + +#add functions??? + +## sp_build_profile to make a profile locally +## needs profile_name, profile_code +## species_name, species_id +## weight_percent (and possibly .value) + +## sp_import_profile to import a profile from an external source +## extension of above to import data from specific sources +## might be very code intensive..? + +## local function to pad data using database??? + +#' @rdname sp +#' @export +## (now importing via xxx.r) +## #' @import data.table + +# may need to set data.table specifically?? +# data.table::as.data.table, etc?? + +##################### +#to think about +##################### +# not sure but I think something in the main build: +# (default; include.refs = FALSE) +# PROFILES>>SPECIES>>SPECIES_PROPERTIES +# (full build; include.refs = TRUE) +# PROFILES>>SPECIES>>SPECIES_PROPERTIES>>PROFILE_REFERENCE>>REFERENCES +# is replicating profiles. +# + +#v 0.2 +# based on previous sp_profile but using data.table +# (0.1 version currently unexported sp_profile.old) + +sp_profile <- function(code, include.refs=FALSE) { + + # code currently handles: + # respeciate.ref, numerics and characters characters + + ####################### + #could replace code with ...??? + ###################### + + if(is.data.frame(code) && "PROFILE_CODE" %in% names(code)){ + code <- unique(code$PROFILE_CODE) + } + if(is.numeric(code)) code <- as.character(code) + if(!is.character(code)) { + stop("unexpected 'code' class", + call.=FALSE) + } + + PROFILES <- as.data.table(sysdata$PROFILES) + SPECIES <- as.data.table(sysdata$SPECIES) + SPECIES_PROPERTIES <- as.data.table(sysdata$SPECIES_PROPERTIES) + PROFILE_REFERENCE <- as.data.table(sysdata$PROFILE_REFERENCE) + REFERENCES <- as.data.table(sysdata$REFERENCES) + + ########################## + #testing tolower below + # as a fix for code arg case sensitivity + ########################## + # could test replacing some of this with sp_pad??? + # IF sp_pad stays + df <- PROFILES[tolower(PROFILES$PROFILE_CODE) %in% tolower(code),] + df <- merge(df, SPECIES, by = "PROFILE_CODE", all.y=FALSE, all.x=TRUE, + allow.cartesian=TRUE) + df <- merge(df, SPECIES_PROPERTIES, by = "SPECIES_ID", all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + if(include.refs){ + df <- merge(df, PROFILE_REFERENCE, by = "PROFILE_CODE", all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + df <- merge(df, REFERENCES, by = "REF_Code", all.y=FALSE, all.x=TRUE, + allow.cartesian=TRUE) + } + df <- df[order(df$PROFILE_CODE, decreasing = FALSE),] + + #build + #note: currently adding .value in rsp_build_respeciate + # could do it here? + # leaving there for now... because we would + # still have to do it there for self-build or + # imported profiles... + df <- rsp_build_respeciate(as.data.frame(df)) + return(df) +} + + + + + + + + + + + + +############################# +#unexported & previous code +############################# + +#sp_profile v 0.1 +#now unexported + +rsp_profile.old <- function(code) { + #handle numerics/characters + ####################### + #could replace code with ...??? + ###################### + if(class(code)[1] == "respeciate" && "PROFILE_CODE" %in% names(code)){ + code <- unique(code$PROFILE_CODE) + } + if(is.numeric(code)) code <- as.character(code) + if(!is.character(code)) stop("unexpected code class") + + PROFILES <- sysdata$PROFILES + SPECIES <- sysdata$SPECIES + SPECIES_PROPERTIES <- sysdata$SPECIES_PROPERTIES + PROFILE_REFERENCE <- sysdata$PROFILE_REFERENCE + REFERENCES <- sysdata$REFERENCES + + #handle multiple codes + ############################ + #replace previous lapply with a direct %in% + ## df <- lapply(code, function(x){ + ## df <- PROFILES[PROFILES$PROFILE_CODE == x, ] + ## ... + ## }) + ## df <- do.call(rbind, df) + #testing as sp_profile.2 + #faster with data.table + ############################ + df <- PROFILES[PROFILES$PROFILE_CODE %in% code,] + df <- merge(df, SPECIES, by = "PROFILE_CODE", all.y=FALSE, all.x=TRUE) + df <- merge(df, SPECIES_PROPERTIES, by = "SPECIES_ID", all.y=FALSE, all.x=TRUE) + df <- merge(df, PROFILE_REFERENCE, by = "PROFILE_CODE", all.y=FALSE, all.x=TRUE) + df <- merge(df, REFERENCES, by = "REF_Code", all.y=FALSE, all.x=TRUE) + df <- df[order(df$PROFILE_CODE, decreasing = FALSE),] +## }) + #build + df <- rsp_build_respeciate(df) + return(df) +} + + + diff --git a/R/sp.average.R b/R/sp.average.R new file mode 100644 index 0000000..b8afd74 --- /dev/null +++ b/R/sp.average.R @@ -0,0 +1,142 @@ +#' @name sp.average +#' @title speciate data averaging functions +#' @aliases sp_average_profile + + +#' @description Functions to build composite (re)SPECIATE profiles + + +#' @description \code{sp_average_profile} generates an average composite +#' of a supplied multi-profile \code{respeciate} object. +#' @param x A \code{respeciate} object, a \code{data.frame} of re(SPECIATE) +#' profiles. +#' @param code required character, the unique profile code to assign to the +#' average profile. +#' @param name character, the profile name to assign to the average +#' profile. If not supplied, this defaults to a collapsed list of the codes +#' of all the profiles averaged. +#' @param method numeric, the averaging method to apply: Currently only 1 (default) +#' \code{mean(x)}. +#' @param ... additional arguments, currently ignored +#' @return \code{sp_average_profile} returns a single profile average +#' version of the supplied \code{respeciate} profile. +#' @note In development function; arguments and outputs likely to be subject to +#' change. +#' +#' This is one of the very few \code{respeciate} functions that modifies the +#' \code{WEIGHT_PERCENT} column of the \code{respectiate} \code{data.frame}. + + +#NOTE + +#' @rdname sp.average +#' @export + +## #' @import data.table (in xxx.r) +# may need to set data.table specifically?? +# data.table::as.data.table, etc?? + +###################### +#average data +###################### + +## in development + +## to do + +## padding output - see in function notes + +## plot and output options + +## to think about + +## currently averaging using mean +## do we need to rescale before averaging because not all +## WEIGHT_PERCENT add up to 100 ?? + +## mean.respeciate +# could be a no plot, method = mean version of this function ?? + + +########################## +#issue +############################ + +########################### +#test +########################### + +#aa <- sp_profile(sp_find_profile("ae8", by="profile_type")) +#sp_average_profile(aa) + + +sp_average_profile <- function(x, code = NULL, name = NULL, method = 1, + ...){ + + ################################# + #check x is a respeciate object?? + + #check it has .value + x <- rsp_tidy_profile(x) + + #save class to return as is.. + # thinking about this + tmp <- class(x) + xx <- as.data.table(x) + + #save profiles + test <- unique(x$PROFILE_CODE) + + #extra.args - not sure if we are using these + .xargs <- list(...) + + #get profile terms if supplied + if(is.null(code)){ + stop("need a new profile code") + } + xx$PROFILE_CODE <- code + + xx$PROFILE_NAME <- if(is.null(name)){ + if(length(test)>10){ + "average of multiple cases" + } else { + paste("average of:", paste(test, collapse = ","), sep="") + } + } else { + name + } + + out <- xx[, + .(PROFILE_NAME = PROFILE_NAME[1], + SPECIES_NAME = SPECIES_NAME[1], + SPEC_MW = SPEC_MW[1], + .total = sum(.value, na.rm = TRUE), + .value = mean(.value, na.rm = TRUE), + .n = length(.value[!is.na(.value)]), + .sd = sd(.value, na.rm = TRUE) + ), + by=.(PROFILE_CODE, SPECIES_ID)] + #I said we would NOT do this... + out$WEIGHT_PERCENT <- out$.value + + ######################### + #pad missing info + # check method in sp_profile + # species info + # ignore profile and ref info + # because the user is builder... + ######################### + + #output + + ################ + #plot, report, etc + ################### + + out <- as.data.frame(out) + class(out) <- tmp + out + +} + + diff --git a/R/sp.cluster.R b/R/sp.cluster.R new file mode 100644 index 0000000..722ebbe --- /dev/null +++ b/R/sp.cluster.R @@ -0,0 +1,351 @@ +#' @name sp.cluster +#' @title sp_profile clustering +#' @aliases sp_profile_distance + +#' @description sp_profile functions for studying similarities (or +#' dissimilarities) within multi-profile (re)SPECIATE data sets + +#' @description \code{\link{sp_profile_distance}} calculates the statistical distance +#' between re(SPECIATE) profiles, and clusters profiles according to nearness. +#' @param x A \code{respeciate} object, a \code{data.frame} of re(SPECIATE) +#' profiles. +#' @param output Character vector, required function output: \code{'report'} the +#' calculated distance matrix; \code{'plot'} a heat map of that distance +#' matrix. +#' @note Please note: function in development; structure and arguments may be +#' subject to change. +#' @return Depending on the \code{output} option, \code{sp_profile_distance} returns +#' one or more of the following: the correlation matrix, a heat map of the +#' correlation matrix. + + +######################## +#to think about +######################## + +#think about other functions +# variations on current dist +# kmeans +# others + +# https://www.statology.org/k-means-clustering-in-r/ + +#NOTE + +#' @rdname sp.cluster +#' @export + +# using data.table for dcast + +# start build the code for the matching method +# and the dissim function, etc... + + +#needs a lot of work +# needs thinking through +# needs options/formals + +#output like in sp_species_cor + +#also check through and consider other options in sp_profile_cor + +#currently tracking + +#think about how we handle too-big matrices, e.g. +# aa <- sp_profile(sp_find_profile("ae6", by="profile_type")) +# sp_profile_distance(aa) + + + +#test +###################### + +#aa <- sp_profile(sp_find_profile("ae8", by="profile_type")) +#sp_profile_distance(aa) + + +sp_profile_distance <- function(x, output = c("plot", "report")){ + + #add .value if missing + x <- rsp_tidy_profile(x) + + # make by profile (rows) by species (columns) data.frame + # move profile_code to row.names for heatmap + .x <- sp_dcast(x, widen = "species") + .tmp <- .x[-1:-2] + row.names(.tmp) <- .x[,1] + + #dist calculation + #reset NAs to 0 + # that might not be best option... + .dst <- dist(scale(.tmp, center = TRUE, scale = TRUE)) + .dst[is.na(.dst)] <- 0 + + #currently not using these two... + # should drop or re-include + # (think this is a shortcut to similar when used with image) + .clst <- hclust(.dst, method = "average") + ord <- order(cutree(.clst, k = 3)) + + #print(ord) + #image(as.matrix(.dst)[ord, ord]) + .cop <- cophenetic(.clst) + #names(.dst) <- .x[,1] + #image(as.matrix(.cop)[ord, ord]) + + ####################### + #output + ####################### + + #think about + # could handle this like plot or + # using a heatmap and legend argument??? + # + if("plot" %in% output){ + heatmap(as.matrix(.cop), Rowv = FALSE, Colv=FALSE, scale="none") + } + if("report" %in% output){ + invisible(.cop) + } +} + +#check through below on similarity matrices notes + + + + + +##################### +#unexported +##################### + + +##from https://stackoverflow.com/questions/5639794/in-r-how-can-i-plot-a-similarity-matrix-like-a-block-graph-after-clustering-d + +## MASS is for mvrnorm +#require(MASS) +## seed sets the output +#set.seed(1) +#dat <- data.frame(mvrnorm(100, mu = c(2,6,3), +# Sigma = matrix(c(10, 2, 4, +# 2, 3, 0.5, +# 4, 0.5, 2), ncol = 3))) + +##Compute the dissimilarity matrix of the standardised data using Eucildean distances + +#dij <- dist(scale(dat, center = TRUE, scale = TRUE)) + +##and then calculate a hierarchical clustering of these data using the group average method + +#clust <- hclust(dij, method = "average") + +##Next we compute the ordering of the samples on basis of forming 3 ('k') groups from the dendrogram, but we could have chosen something else here. + +#ord <- order(cutree(clust, k = 3)) + +##Next compute the dissimilarities between samples based on dendrogram, the cophenetic distances: + +# coph <- cophenetic(clust) + +##Here are 3 image plots of: + +## The original dissimilarity matrix, sorted on basis of cluster analysis groupings, +##The cophenetic distances, again sorted as above +##The difference between the original dissimilarities and the cophenetic distances +##A Shepard-like plot comparing the original and cophenetic distances; the better the clustering at capturing the original distances the closer to the 1:1 line the points will lie +##Here is the code that produces the above plots + +#layout(matrix(1:4, ncol = 2)) +#image(as.matrix(dij)[ord, ord], main = "Original distances") +#image(as.matrix(coph)[ord, ord], main = "Cophenetic distances") +#image((as.matrix(coph) - as.matrix(dij))[ord, ord], +# main = "Cophenetic - Original") +#plot(coph ~ dij, ylab = "Cophenetic distances", xlab = "Original distances", +# main = "Shepard Plot") +#abline(0,1, col = "red") +#box() +#layout(1) + +#and testing + +#might be something in here!! + +##looks like I was using ae8 and ae6 +## so 6 and 219 profiles +## full data base is 6845 +## so be useful +#a <- sp_profile.2(find_sp_profile("pm-ae8", by="profile_type")) +#a <- sp_profile.2(find_sp_profile("pm-ae6", by="profile_type")) + +#dat <- data.frame(MASS::mvrnorm(100, mu = c(2,6,3), +# Sigma = matrix(c(10, 2, 4, +# 2, 3, 0.5, +# 4, 0.5, 2), ncol = 3))) +#dat +#dij <- dist(scale(dat, center = TRUE, scale = TRUE)) +#image(dij, main = "Original distances") +#image(as.matrix(dij), main = "Original distances") +#dij +#image(as.matrix(dij[order(dij$V1),]), main = "Original distances") +#dij +#a <- sp_profile.2(find_sp_profile("pm-ae8", by="profile_type")) +#a +#b <- test_wide(a) +#head(b) +#head(b[-1:-2]) +#bb <- b[-1:-2]) +#bb <- b[-1:-2] +#dij <- dist(scale(bb, center = TRUE, scale = TRUE)) +#image(as.matrix(dij), main = "Original distances") +#heatmap(as.matrix(dij), main = "Original distances") +#b$PROFILE_NAME +#heatmap(as.matrix(dij)) +#?heatmap +#a <- sp_profile.2(find_sp_profile("pm-ae6", by="profile_type")) +#b <- test_wide(a) +#bb <- b[-1:-2] +#heatmap(as.matrix(dij)) +#dij <- dist(scale(bb, center = TRUE, scale = TRUE)) +#heatmap(as.matrix(dij)) +#clust <- hclust(dij, method = "average") +#?hclust +#class(dij) +#dij +#summary(dij) +#dij[is.na(dij=0)] +#dij[is.na(dij)] +#dij[is.na(dij)]<- 0 +#heatmap(as.matrix(dij)) +#clust <- hclust(dij, method = "average") +#ord <- order(cutree(clust, k = 3)) +#heatmap(as.matrix(dij)[ord, ord]) +#coph <- cophenetic(clust) +#image(as.matrix(coph)[ord, ord], main = "Cophenetic distances") +#image(as.matrix(dij)[ord, ord]) +#ord <- order(cutree(clust, k = 7)) +#coph <- cophenetic(clust) +#image(as.matrix(dij)[ord, ord]) +#image(as.matrix(coph)[ord, ord], main = "Cophenetic distances") +#image((as.matrix(coph) - as.matrix(dij))[ord, ord], +# main = "Cophenetic - Original") +#plot(coph ~ dij, ylab = "Cophenetic distances", xlab = "Original distances", +# main = "Shepard Plot") +#abline(0,1, col = "red") +#dij <- dist(scale(bb, center = TRUE, scale = TRUE)) +#summary(dij) +#dij[is.na(dij)] +#as.vector(dij) +#dij[is.na(dij)] <- (max(dij, na.rm=TRUE)*5) +#ord <- order(cutree(clust, k = 7)) +#clust <- hclust(dij, method = "average") +#image(as.matrix(coph)[ord, ord], main = "Cophenetic distances") +#image(as.matrix(dij)[ord, ord], main = "Cophenetic distances") +#image(as.matrix(dij), main = "Cophenetic distances") +#dij <- dist(scale(bb, center = TRUE, scale = TRUE)) +#dij[is.na(dij)] <- (max(dij, na.rm=TRUE)*1) +#clust <- hclust(dij, method = "average") +#ord <- order(cutree(clust, k = 7)) +#image(as.matrix(dij)[ord, ord], main = "Cophenetic distances") +#image(as.matrix(dij), main = "Cophenetic distances") +#heatmap(dij) +#heatmap(as.matrix(dij)) +#?heatmap +#max(dij) +#which(max==max(dij)) +#which(max=max(dij)) +#where(max=max(dij)) +#?which +#where(max~max(dij)) +#which(max~max(dij)) +#which(dij==max(dij)) +#summary(dij) +#?dist +#dij[order(dij)] +#dij[order(dij)][1:10] +#a <- sp_profile.2(find_sp_profile("pm-ae8", by="profile_type")) +#b <- test_wide(a) +#bb <- b[-1:-2] +#dij <- dist(scale(bb, center = TRUE, scale = TRUE)) +#plot(dij) +#plot(as.matrix(dij)) +#heatmap(as.matrix(dij)) +#dij +#cor(bb) +#?cor +#cor(bb, use="pairwise.complete.obs") +#heapmap(cor(bb, use="pairwise.complete.obs")) +#heatmap(cor(bb, use="pairwise.complete.obs")) +#a <- sp_profile.2(find_sp_profile("pm-ae8", by="profile_type")) +#b <- test_wide(a) +#bb <- b[-1:-2] +#heatmap(cor(bb, use="pairwise.complete.obs")) +#cor(bb, use="pairwise.complete.obs") +#plot(cor(bb, use="pairwise.complete.obs")) +#is(cor(bb, use="pairwise.complete.obs")) +#o <- cor(bb, use="pairwise.complete.obs") +#o[is.na(o)] <- -2 +#heatmap(o) +#o[o== -2] <- -4 +#heatmap(o) +#o[o== -4] <- -1 +#heatmap(o) +#?heatmap +#heatmap(o, Rowv = FALSE) +#heatmap(o, Rowv = FALSE, Colv=FALSE) +#o +#View(o) +#o[o== -1] <- 0 +#heatmap(o, Rowv = FALSE, Colv=FALSE) +#heatmap(o, Rowv = FALSE, Colv=FALSE, scale="none") +#heatmap(o, scale="none") +#heatmap(dij, scale="none") +#heatmap(as.matrix(dij), scale="none") +#a +#b +#apply(b, 1, function(x) {length(x[!na(x)])}) +#apply(b, 1, function(x) {length(x[!is.na(x)])}) +#apply(b, 2, function(x) {length(x[!is.na(x)])}) +#b$Nickel +#?cor +#longley +#cor(longley) +#heatplot(cor(longley), scale="none") +#heatmap(cor(longley), scale="none") +#symnum(cor(longley)) +#b +#apply(b, 2, function(x) {length(x[!is.na(x)])}) +#oo <- b[apply(b, 2, function(x) {length(x[!is.na(x)])})>3] +#oo +#cor(oo) +#o <- cor(oo, use="pairwise.complete.obs") +#o <- cor(oo[(-1:-2)], use="pairwise.complete.obs") +#heatmap(o, scale="none") +#o +#heatmap(o^2, scale="none") +#heatmap(o, scale="none") +#find_sp_species("Ethanol") +#find_sp_profile("Ethanol", by="species_name") +#find_sp_profile("Ethanol", by="species_name", partial=FALSE) +#find_sp_profile("Ethanne", by="species_name", partial=FALSE) +#find_sp_profile("Ethane", by="species_name", partial=FALSE) +#pr <- sp_profile(find_sp_profile("Ethane", by="species_name", partial=FALSE)) +#pr +#summary(factor(pr$PROFILE_NAME)) +#summary(factor(pr$PROFILE_TYPE)) +#summary(factor(pr$SPECIES_NAME)) +#pr$SPECIES_NAME=="Ethane" +#pr[pr$SPECIES_NAME=="Ethane",] +#pr[pr$SPECIES_NAME=="Ethane",] +#plot(pr[pr$SPECIES_NAME=="Ethane",]$WEIGHT_PERCENT) +#summary(factor(pr$PROFILE_NAME)) +#.pr <- pr[pr$SPECIES_NAME=="Ethane",] +#summary(factor(.pr$PROFILE_NAME)) +#summary(factor(.pr$PROFILE_CODE)) +#.pr <- .pr[.pr$PROFILE_CODE=="0296",] +#.pr +#as.data.frame(.pr) +#View(.pr) +#sysdata$REFERENCES +#test2(.pr) +#test_wide(.pr) +#pr diff --git a/R/sp.cor.R b/R/sp.cor.R new file mode 100644 index 0000000..8c14e9c --- /dev/null +++ b/R/sp.cor.R @@ -0,0 +1,238 @@ +#' @name sp.cor +#' @title (re)SPECIATE Species Correlations +#' @aliases sp_species_cor + +#' @description sp_species functions for studying relationships between +#' species in multi-profile (re)SPECIATE data sets. + +#' @description \code{\link{sp_species_cor}} generates a by-species correlation +#' matrix of the supplied (re)SPECIATE data sets. +#' @param x \code{respeciate} object, a \code{data.frame} of re(SPECIATE) +#' profiles. +#' @param min.n \code{numeric} (default 3), the minimum number of species measurements +#' needed in a profile for the function to use it in correlation calculations. +#' Here, it should be noted that this does not guarantee the three matched +#' pairs of measurements needed to calculate a correlation coefficient because +#' not all profiles contain all species, so there may still be insufficient +#' overlap on a case-by-case basis. +#' @param cols a series of \code{numeric}, \code{character} or other class values +#' that can be translated into a color gradient, used to color valid cases when +#' generating plots and color keys, default \code{c("#80FFFF", "#FFFFFF", "#FF80FF")} +#' equivalent to \code{\link{cm.colors}} output. +#' @param na.col \code{numeric}, \code{character} or other class that can be +#' translated into a single color, used to color \code{NA}s when generating +#' plots and color keys, default grey \code{"#CFCFCF"}. +#' @param heatmap.args \code{logical} or \code{list}, heat map settings. Options +#' include \code{TRUE} (default) to generate the heat map without modification; +#' \code{FALSE} to not plot it; or a list of heat map options to alter the plot +#' default appearance. The plot, a standard heat map with the dendrograms +#' removed, is generated using \code{\link[stats]{heatmap}}, so see associated +#' documentation for valid options. +#' @param key.args \code{logical} or \code{list}, color key settings if plotting +#' the correlation matrix heat map. Options include \code{TRUE} (default) to +#' generate the key without modification; \code{FALSE} to not include the key; +#' or a list of options to alter the key appearance. +#' @param report \code{logical} or \code{character}, the required function +#' output. Options include: \code{'silent'} (default), to return the +#' correlation matrix invisibly; \code{TRUE} to return the matrix +#' (visibly); and, \code{FALSE} to not return it. +#' @return By default \code{sp_species_cor} invisibly returns the calculated +#' correlation matrix a plots it as a heat map, but arguments including +#' \code{heatmap} and \code{report} can be used to modify function outputs. + + +#NOTE + +# provisional list options done for heatmap.args and key.args!!! +# but will need tidying +# PLUS when done maybe for make local function so same +# can be used in other similar functions + +#' @rdname sp.cor +#' @export + +# using data.table for dcast + +###################### +#cor +#generate correlation matrix +###################### + +######################## +#in development +####################### + +#to think about +######################### + +#speed up the correlation calculation +# currently painful using for loop +# can't use stats::cor on whole data set and include min.n option +# but there should be a way to make this a lot faster and retain that... + +#formals for cor calculation control +# include cor:use, etc??? + + +#plot handling could be tidier +# at moment plot handling is in an if/else +# with/without NAs handled differently +# but think this could be passed down into key and plot +# holding for now until key is in other plots +# and have an idea how it'll work +# (see also key) + +#col key +# currently using local function rsp_col_key +# could export or more to another package but would added to depends + +#plot and col key fine control +# suspecting will need better fine control of both +# re plot +# font size, heatplot options +# re col key +# font size, scale position, style/type, range +# currently holding (see plot handling and col key) + +#starting thinking about above +# see output control below + +#output control +# currently uses lots of settings: +# heatmap TRUE/FALSE to plot heatmap of correlation matrix +# or list of heatmap plot settings +# key TRUE/FALSE to add a col key to the heatmap +# or list of col key plot settings +# report "silent" to return the correlation matrix invisibly, +# or TRUE/FALSE to return it visibly or not at all. +# The list options are not yet done... + + +#aa <- sp_profile(sp_find_profile("ae8", by="profile_type")) +#sp_species_cor(aa) + +sp_species_cor <- function(x, min.n = 3, + cols = c("#80FFFF", "#FFFFFF", "#FF80FF"), + na.col = "#CFCFCF", heatmap.args = TRUE, + key.args = TRUE, report = "silent"){ + #if ref missing + .x <- sp_dcast(x, widen="species") + + #no point doing any have less than min.n values? + .test <- apply(.x, 2, function(x) length(.x[!is.na(x)])) + .x <- .x[.test > min.n] + + #any point doing any with only 1 unique value?? + + ############################# + #calculate correlations + ############################# + + #Could not stop bad cors using... + #.cor <- cor(.x[-1:-2], use="pairwise.complete.obs") + + #sd = 0 warnings? + # maybe cases where 1 unique case, e.g. x=c(1,1,1,1,1), y=c(2,2,2,2) + # or where overlap is low <2 x=c(1,1,1,NA,NA), y=c(NA,NA,1,NA,NA) + + #I'm calling this the pokemon cludge... + #got to catch 'em all... + f <- function(x, y) { + test <- !is.na(x) & !is.na(y) + x <- x[test] + y <- y[test] + if(length(x) >= min.n){ + if(length(unique(x))==1 || length(unique(y))==1){ + NA + } else { + cor(x,y, use="pairwise.complete.obs") + } + } else { NA } + } + + .X <- .x[-1:-2] + .cor <- as.data.frame(matrix(NA, nrow=ncol(.X), ncol=ncol(.X))) + for(i in 1:ncol(.cor)) { + for(j in 1:ncol(.cor)) { + .cor[i, j] <- f(.X[, i], .X[, j]) # apply f() to each combination + } + } + row.names(.cor) <- colnames(.cor) <- colnames(.X) + + #cheats to test plot output + #base r graphics is getting painful + + #.cor[.cor>0.5 & .cor<1] <- 0.5 + #.cor[is.na(.cor)] <- -0.5 + #.cor <- .cor/2 + + #NB: plot is heatmap without the dendrograms + #stackover suggests it is hard going modifying + #https://stackoverflow.com/questions/29893630/r-draw-heatmap-with-clusters-but-hide-dendrogram + + if((is.logical(heatmap.args) && heatmap.args) | (is.list(heatmap.args))){ + .cols.max <- 300 + if(max(.cor, na.rm=TRUE) < 1){ + .z <- (1 - max(.cor, na.rm=TRUE))*100 + .cols.max <- .cols.max - .z + } + .tmp <- .cor + .tmp[is.na(.tmp)] <- -2 + cols <- c(rep(na.col, 100), colorRampPalette(cols)(200)) + if(any(is.na(.cor))){ + #plot when + #.cor includes nas + .hm.cols <- cols[1:.cols.max] + .k.na <- na.col + + } else { + #plot when + #.cor DOES NOT includes nas + .cols.min <- 101 + if(min(.cor, na.rm=TRUE) < 1){ + .z <- (1 + min(.cor, na.rm=TRUE))*100 + .cols.min <- .cols.min + .z + #print(.cols.min) + } + .hm.cols <- cols[.cols.min:.cols.max] + .k.na <- NA + } + .hm <- list(x = as.matrix(.tmp), Rowv = NA, Colv = NA, + col = .hm.cols, scale = "none") + if(is.list(heatmap.args)){ + .hm[names(heatmap.args)] <- heatmap.args + } + do.call(heatmap, .hm) + if((is.logical(key.args) && key.args) | (is.list(key.args))){ + .k <- list(key = c(-1,1), cols=cols[101:300], + #type=1, + ticks= -1:1, y=0.75, na.col = .k.na) + if(is.list(key.args)){ + .k[names(key.args)] <- key.args + } + do.call(rsp_col_key, .k) + } + } + + #report handling + + #output + #################### + #currently aiming for + # report (invisible), plot or both + # report class data.frame + # output plot only does not mean anything... + + if((is.logical(report) && report) | (is.character(report))){ + if(is.character(report) && report=="silent"){ + return(invisible(.cor)) + } else { + return(.cor) + } + } else { + return(invisible(NULL)) + } + +} + + diff --git a/R/sp.info.R b/R/sp.info.R new file mode 100644 index 0000000..62f4b79 --- /dev/null +++ b/R/sp.info.R @@ -0,0 +1,174 @@ +#' @name sp.info +#' @title re(SPECIATE) information +#' @aliases sp_info sp_profile_info sp_species_info sp_find_profile +#' sp_find_species + +########################### +#keep think about the names +########################### +# sp_profile_info used to be sp_find_profile +# sp_species_info used to be sp_find_species +# both sp_find_ functions are currently sp_info_ wrappers +# should remove at some point??? + +######################### +#to think about +######################### + +# + + +#' @description Functions that provide (re)SPECIATE +#' source information. +#' \code{sp_info} generates a brief version report for the currently installed +#' (re)SPECIATE data sets. +#' \code{sp_profile_info} searches the currently installed (re)SPECIATE +#' data sets for profile records. +#' \code{sp_species_info} searches the currently installed (re)SPECIATE +#' data sets for species records. + +#' @param ... character(s), any search term(s) to use when searching +#' the local (re)SPECIATE archive for relevant records using +#' \code{sp_profile_info} or \code{sp_species_info}. +#' @param by character, the section of the archive to +#' search, by default \code{'keywords'} for \code{sp_profile_info} and +#' \code{'species_names'} for \code{sp_species_info}. +#' @param partial logical, if \code{TRUE} (default) +#' \code{sp_profile_info} or \code{sp_profile_info} use partial matching. + +#' @return \code{sp_info} provides a brief version information report on the +#' currently installed (re)SPECIATE archive. +#' @return \code{sp_profile_info} returns a \code{data.frame} of +#' profile information, as a \code{respeciate} object. +#' \code{sp_species_info} returns a \code{data.frame} of +#' species information as a \code{respeciate} object. + +#' @examples \dontrun{ +#' profile <- "Ethanol" +#' pr <- sp_find_profile(profile) +#' pr +#' +#' species <- "Ethanol" +#' sp <- sp_find_species(species) +#' sp} +#' + +#might want to replace this with example that +# finds profile containing ethanol? + + +####################### +#sp_info +####################### + +# tidy output??? +# little messy??? + +#like to include summary(factor(sysdata$PROFILES$PROFILE_TYPE)) +# or something like that + +# this is not currently catchable!!!! +# a <- sp_info() #a = NULL + +#' @rdname sp.info +#' @export + +sp_info <- function() { + #extract profile info from archive + .ver <- "source: SPECIATE 5.2\n\t[in (re)SPECIATE since 0.2.0]" + .pro <- length(unique(sysdata$PROFILES$PROFILE_CODE)) + .spc <- length(unique(sysdata$SPECIES_PROPERTIES$SPECIES_ID)) + cat(.ver, "\n\tProfiles: ", .pro, "\n\tSpecies: ", .spc, sep = "") + +} + + +#' @rdname sp.info +#' @export + +sp_profile_info <- function(..., by = "keywords", partial = TRUE) { + #extract profile info from archive + out <- sysdata$PROFILES + terms <- c(...) + ################################### + #ignoring case because missing loads... + # might want to think about spaces as well??? + #currently same in sp_find_species + # should think about using common code??? + # also error messaging if by is not known??? + ################################### + #how to handle searching a profile + #that contains a specific species + # this is all the profiles that have an entry for species_id 529 + # sysdata$SPECIES$PROFILE_CODE[sysdata$SPECIES$SPECIES_ID==529] + # could use sp_find_species to get species info, then + ################################### + if(tolower(by) %in% c("species_name")){ + ############################ + #special case + #search by species_name + # could add species_id, cas, etc? + ############################### + species <- sysdata$SPECIES + ref <- out$PROFILE_CODE + for(ti in terms){ + ans <- sp_find_species(ti, by=by, partial=partial) + terms <- species$PROFILE_CODE[species$SPECIES_ID %in% ans$SPECIES_ID] + ref <- ref[ref %in% terms] + } + out <- out[out$PROFILE_CODE %in% ref,] + } else { + for(ti in terms){ + ref <- out[,tolower(names(out))==by] + if(nrow(out)>0){ + out <- if(partial){ + out <- out[grep(ti, ref, ignore.case = TRUE),] + } else { + out <- out[tolower(ti)==tolower(ref),] + } + } + } + } + out <- rsp_build_respeciate(out) + return(out) +} + +#' @rdname sp.info +#' @export + +#wrapper for above + +sp_find_profile <- function(...){ + sp_profile_info(...) +} + +#' @rdname sp.info +#' @export + +sp_species_info <- function(..., by = "species_name", partial = TRUE) { + #extract species info from archive + out <- sysdata$SPECIES_PROPERTIES + terms <- c(...) + for(ti in terms){ + ref <- out[,tolower(names(out))==by] + if(nrow(out)>0){ + out <- if(partial){ + out <- out[grep(ti, ref, ignore.case = TRUE),] + } else { + out <- out[tolower(ti)==tolower(ref),] + } + } + } + #out <- PROFILES[grep(term, PROFILES[[by]], ignore.case = TRUE), ] + out <- rsp_build_respeciate(out) + return(out) +} + +#' @rdname sp.info +#' @export + +#wrapper for above + +sp_find_species <- function(...){ + sp_species_info(...) +} diff --git a/R/sp.match.R b/R/sp.match.R new file mode 100644 index 0000000..653c415 --- /dev/null +++ b/R/sp.match.R @@ -0,0 +1,323 @@ +#' @name sp.match +#' @title Find nearest matches from reference set of profiles +#' @aliases sp_match_profile + +#' @description \code{sp_match_profile} compares a supplied species +#' (re)SPECIATE profile and a reference set of supplied profiles and +#' attempt to identify nearest matches on the basis of correlation +#' coefficient. +#' @param x A \code{respeciate} object or similar \code{data.frame} containing +#' a species profile to be compared with profiles in \code{ref}. If \code{x} +#' contains more than one profile, these are averaged (using +#' \code{\link{sp_average_profile}}), and the average compared. +#' @param ref A \code{respeciate} object, a \code{data.frame} containing a +#' multiple species profiles, to be used as reference library when identifying +#' nearest matches for \code{x}. +#' @param matches Numeric (default 10), the maximum number of profile matches to +#' report. +#' @param rescale Numeric (default 5), the data scaling method to apply before +#' comparing \code{x} and profiles in \code{ref}: options 0 to 5 handled by +#' \code{\link{sp_rescale}}. +#' @param min.n \code{numeric} (default 8), the minimum number of paired +#' species measurements in two profiles required for a match to be assessed. +#' See also \code{\link{sp_species_cor}}. +#' @param test.x Logical (default FALSE). The match process self-tests by adding +#' \code{x} to \code{ref}, which should generate a perfect fit=1 score. Setting +#' \code{test.x} to \code{TRUE} retains this as an extra record. +#' @return \code{sp_match_profile} returns a fit report: a \code{data.frame} of +#' up to \code{n} fit reports for the nearest matches to \code{x} from the +#' reference profile data set, \code{ref}. + +#NOTE + +#' @rdname sp.match +#' @export + +###################### +#match +#find ref profile 'most similar to x +###################### + +# the sp_dcast uses data.table +# sp_match uses rbindlist from data.table + +#in development + +#to do +######################### + +##aa <- sp_profile(sp_find_profile("composite", by="profile_name")) +##sp_match_profile(sp_profile("41220C"), aa) +##assuming 41220C exists + +##NOTE sp_profile code is case sensitive +## not sure if we can fix this?? + + +#to think about +######################### + +#match is giving warning +# In min(WEIGHT_PERCENT, na.rm = TRUE) : +# no non-missing arguments to min; returning Inf +# when (I guess) nothing there to compare... + +#default for ref +# using sp_profile(sp_find_profile("composite", by="profile_name")) +# in example. + +#could add error if x is more than one profile +# could use sp_profile_mean when written if we want to force to one +# one profile [?? nb: that function name not decided yet] + +#do we want an output option? +# have a basic report table but could + +#option to exclude test? from report + +#how can users make x if not from (re)SPECIATE archive +# currently using rsp_ [code after this function] +# to anonymise a speciate profile +# suggestion: +# identify needed columns, formats and names +# maybe can build from at least: +# species_name and species_id +# weight_percent or .value + +#renaming appears to trip up when profile_name/id are factors +# current fix is to force .tmp.pf.cd and .tmp.pf.nm to character +# when extracting them... +# maybe think about better? + +#at the moment we error (stop()) function if no correlation outputs +# so far this appears to be mostly because number of species in x +# is less than min.bin for correlation. +# 1. this is currently hard-coded +# 2. not really looked at this in detail... +# could return a NULL and warning instead? +# could also do this earlier if min.bin set in formals +# but might need to rethink n, min.bin, etc??? + +sp_match_profile <- function(x, ref, matches=10, rescale=5, + min.n=8, test.x=FALSE){ + + ####################### + #if ref missing + ################## + #to do + # using sp_profile(sp_find_profile("composite", by="profile_name")) + # looked promising + + #add .value if not there + x <- rsp_tidy_profile(x) + + #tidy x for testing + #.x.pr.cd <- as.character(x$PROFILE_CODE) + #.x.pr.nm <- as.character(x$PROFILE_NAME) + + #note + # assuming only one profile + # might think about changing this in future + + if(length(unique(x$PROFILE_CODE))>1){ + x <- sp_average_profile(x, code = "test") + } else { + x <- sp_average_profile(x, code = "test", + name = paste("test>", x$PROFILE_NAME[1], sep="")) + } + + #note + # dropped the force option + # to override min.n if < min.n species in x + #if(length(unique(x$SPECIES_ID)) < min.n & force){ + # min.n <- length(unique(x$SPECIES_ID)) + #} + + ############### + #do test anyway + ############### + #if(test.x){ + matches <- matches + 1 + #} + + x <- as.data.table(x) + ref <- as.data.table(ref) + .tmp <- rbindlist(list(x, ref), fill=TRUE) + + ################# + #think about this + ################# + + #no normalisation option is method = 0 (via sp_profile_rescale) + + #note: think about calling this rescale because it is the rescale method + # (not match method) we are setting + + #rescale data + ################################ + #note: currently outputting data.frame + # (idea is we are not restricting the user to data.table) + # (but it means we have to reset and I am guessing there is + # a time/memory penalty for that??) + + #might handle sp_rescale_species in/output + # either returns object in class it is given + # or errors if not respeciate + # or errors if not respeciate unless forced + # but then maybe need to check requires + # cols are there??? + + .tmp <- as.data.table(sp_rescale_species(.tmp, method=rescale)) + + ################### + #keep species names and ids for renaming + # as.character to stop rename tripping on factors + # not ideal... + ################### + .tmp.pr.nm <- as.character(.tmp$PROFILE_NAME) + .tmp.pr.cd <- as.character(.tmp$PROFILE_CODE) + + + ################## + #dcast to reshape for search + # cols (profiles), rows(species) + ################## + + #testing: using sp_profile_dcast + # code is set up for sp_profile_dcast(.tmp, widen="profile") + + ##previous code + #.tmp <- dcast(.tmp, + # SPECIES_ID + SPECIES_NAME ~PROFILE_CODE, + # mean, + # na.rm=TRUE, + # value.var = ".value") + + .tmp <- as.data.table(sp_dcast(.tmp, widen="profile")) + + #nb: need the as.data.table() because sp_profile_dcast + # currently returns data.frame + # we are using data.table but I am trying to avoid forcing + # users into doing the same... + + #to think about: + # an as.is option in functions to made outputs same as inputs? + # disable the 'as is' assumption in previous code + # BUT now not sure why... + + #ignore first two columns + # species_id and species_name + .cols <- names(.tmp)[3:(ncol(.tmp))] + + #get x back as a test case... + #need a better way to handle test + .test <- .tmp[, "test"] + + ########################### + #fit term + # currently using conventional regression coefficient + + #nb: we see a warning + # think it is just cases when no variance in the xy data + # so only one pair or (all xs same and all ys sames)??? + + ######################### + #min n + ######################### + #to do + # compare this and code in sp_species_cor + # if/when we deal with this stop message this code may need to be updated + + f <- function(x) { + if(length(x[!is.na(x) & !is.na(.test)])>min.n){ + suppressWarnings(cor(x, .test, use ="pairwise.complete.obs")) + } else { + NA + } + } + .out <- .tmp[, (.cols) := lapply(.SD, f), .SDcols = .cols] + + ########################## + #chop down to build report + ########################## + + #wonder if above is non-ideal?? + # rows are replicates + # might be a better way of doing this??? + + .out <- as.data.frame(.out[1, -1:-2]) + .out <- sort(unlist(.out), decreasing = TRUE) + if(length(.out) > matches){ + .out <- .out[1:matches] + } + + #could be better way to do next bit + #just making a lookup for profile info + + #this is a pain because some profile names are replicated + # (several profile codes seem to have same profile name) + + if(length(.out)<1){ + #see notes.... + #sometimes this is because there are less than min.n species in the x profile + stop("sp_match_profile> No (", min.n, " point) matches for x", call. = FALSE) + } + + .tmp <- names(.out) + for(i in 1:length(.tmp)){ + if(.tmp[i] %in% .tmp.pr.cd){ + .tmp[i] <- .tmp.pr.nm[.tmp.pr.cd == names(.out)[i]][1] + } + } + + #row.names fulled form .out if you don't overwrite + + .out <- data.frame(PROFILE_CODE=names(.out), + PROFILE_NAME=.tmp, + fit=.out, + row.names = 1:length(.out)) + + #conflicted!!! + if(!test.x){ + matches <- matches - 1 + if("test" %in% x$PROFILE_CODE){ + .out <- .out[tolower(.out$PROFILE_CODE)!="test",] + } + if(nrow(.out) > (matches)){ + .out <- .out[1:matches,] + } + } + + + + ####################### + #output + ####################### + + #think about options? + rownames(.out) <- NULL + return(as.data.frame(.out)) +} + + + + + +#need something to replace this that helps users build local profiles + +#basic build needs +# profile_name and profile_code +# species_name and species_id +# weight_percent (and possibly .value) + +rsp_ <- function(x){ + .o <- sp_profile(x) + .o$PROFILE_NAME <- paste("test", .o$PROFILE_NAME, sep=">") + .o$PROFILE_CODE <- "test" + .o +} + + + + + diff --git a/R/sp.pad.R b/R/sp.pad.R new file mode 100644 index 0000000..2708923 --- /dev/null +++ b/R/sp.pad.R @@ -0,0 +1,228 @@ +#' @name sp.pad +#' @title (re)SPECIATE profile padding functions +#' @aliases sp_pad + +#' @description Functions for padding \code{respeciate} objects. + +#' @description \code{sp_pad} pads a supplied (re)SPECIATE profile data set +#' with profile and species meta-data. +#' @param x A \code{respeciate} object, a \code{data.frame} of re(SPECIATE) +#' profiles. +#' @param pad character, type of meta data padding, current options +#' \code{'profile'}, \code{'species'}, \code{'weight'}, \code{reference}, +#' \code{'standard'} (default; all but \code{'reference'}), and \code{'all'} +#' ('all'). +#' @param drop.nas logical, discard any rows where \code{WEIGHT_PERCENT} is +#' \code{NA}, default \code{TRUE}. +#' @return \code{sp_pad} returns \code{x}, with requested additional profile +#' and species meta-data added as additional \code{data.frame} columns. +#' See Note. +#' @note Some data handling can remove (re)SPECIATE meta-data, +#' and \code{sp_pad}s provide a quick rebuild/repair. For example, +#' \code{\link{sp_dcast}}ing to a (by-species or by-profile) widened +#' form strips some meta-data, and padding is used as part of the +#' \code{\link{sp_melt_wide}} and padding is used to re-add this meta-data +#' when returning the data set to its standard long form. + +#NOTE + +#' @rdname sp.pad +#' @export +## #' @import data.table +# now done in xxx.r + +#in development + +## think about + +# improving the respeciate class handling +# in -> dcast -> melt -> out + +#I do not think this needs the pad argument!!!! +# or we add an 'all' default??? +# I think it just needs to check with common columns +# and merge by those +# testing this with current version of sp_pad +# true default is sp_pad(x, pad="standard") +# SO refs would be lost with dcast then melt +# could allow pad as logical or character + + +# then a reset function would just remove a column and re-pad??? + +# might want to specify data.frame::as.data.table??? + +#think about a sp_repair_weight function + +#not sure we need drop.nas here... +# think melt is making them... + +#does this work for species_id and profile_name? +#should it.... + +#need to think about respeciate object class +#class have +# "respeciate", ".rsp" +# "respeciate", ".rsp.wp" +# "respeciate", ".rsp.ws" +# "respeciate", ".rsp.pi" +# "respeciate", ".rsp.si" +# with the second not being defined??? +# assuming this does cause a build problem +# it would just be an identifier for the respeciate object type +# standard (long), wide by profile, wide by species, +# profile info, species info... + + +#test +#b <- sp_dcast_species(spq_pm.ae8()) +#c <- b[c("PROFILE_CODE", "SPECIES_NAME", "WEIGHT_PERCENT")] +#x <- sp_pad(c) + + +sp_pad <- function(x, pad = "standard", drop.nas = TRUE){ + + + #should pad allow TRUE/FALSE??? + #should argument within sp_pad be method?? + + #tidy x + x <- rsp_tidy_profile(x) + #save class + .cls <- class(x) + out <- as.data.table(x) + + #profile + if(any(c("profile", "profiles", "standard", "all") %in% tolower(pad))){ + PROFILES <- as.data.table(sysdata$PROFILES) + .tmp <- intersect(names(out), names(PROFILES)) + if(length(.tmp)>0){ + out <- merge(out, PROFILES, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } + } + + #species + if(any(c("species", "standard", "all") %in% tolower(pad))){ + SPECIES_PROPERTIES <- as.data.table(sysdata$SPECIES_PROPERTIES) + .tmp <- intersect(names(out), names(SPECIES_PROPERTIES)) + if(length(.tmp) >0){ + out <- merge(out, SPECIES_PROPERTIES, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } + } + + #species weights + if(any(c("weight", "weights", "standard", "all") %in% tolower(pad))){ + SPECIES <- as.data.table(sysdata$SPECIES) + .tmp <- intersect(names(out), names(SPECIES)) + if(length(.tmp) >0){ + out <- merge(out, SPECIES, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } + } + + #references + if(any(c("reference", "references", "all") %in% tolower(pad))){ + PROFILE_REFERENCE <- as.data.table(sysdata$PROFILE_REFERENCE) + REFERENCES <- as.data.table(sysdata$REFERENCES) + .tmp <- intersect(names(out), names(PROFILE_REFERENCE)) + if(length(.tmp) >0){ + out <- merge(out, PROFILE_REFERENCE, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } + .tmp <- intersect(names(out), names(REFERENCES)) + if(length(.tmp) >0){ + out <- merge(out, REFERENCES, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } + } + + #drop.nas. + if(drop.nas){ + out <- out[!is.na(out$WEIGHT_PERCENT),] + } + + #sp_profile reorders + # not sure if it is a good idea but could add as option + # here would be + # out <- out[order(out$PROFILE_CODE, decreasing = FALSE),] + + #not sure how to handle output... + # could return as input class + # see notes + out <- as.data.frame(out) + rsp_build_respeciate(out) + +} + + + +############################ +#not exporting +############################ + +#earlier versions + +#holding until testing on new code finished + +sp_pad.old <- function(x, pad = "species", drop.nas = TRUE){ + + #tidy x + x <- rsp_tidy_profile(x) + #save class + .cls <- class(x) + out <- as.data.table(x) + + #set up padding for melts... + .long <- "nothing" + if(pad=="species"){ + .long <- "SPECIES_NAME" + } + if(pad=="profile"){ + .long <- "PROFILE_CODE" + } + + PROFILES <- as.data.table(sysdata$PROFILES) + SPECIES_PROPERTIES <- as.data.table(sysdata$SPECIES_PROPERTIES) + if(.long=="PROFILE_CODE"){ + #add in profile then species info + out <- merge(out, PROFILES, by = .long, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + .tmp <- intersect(names(out), names(SPECIES_PROPERTIES)) + out <- merge(out, SPECIES_PROPERTIES, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } + if(.long=="SPECIES_NAME"){ + #add in species then profiles info + out <- merge(out, SPECIES_PROPERTIES, by = .long, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + .tmp <- intersect(names(out), names(PROFILES)) + out <- merge(out, PROFILES, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } + #to get weight_percentage etc + if(pad %in% c("species", "profile", "weight")){ + SPECIES <- as.data.table(sysdata$SPECIES) + .tmp <- intersect(names(out), names(SPECIES)) + out <- merge(out, SPECIES, by = .tmp, all.y=FALSE, + all.x=TRUE, allow.cartesian=TRUE) + } else { + #not great but... + #if not padding WEIGHT_PERCENT has to be .value + out$WEIGHT_PERCENT <- out$.value + } + #drop.nas. + if(drop.nas){ + out <- out[!is.na(out$WEIGHT_PERCENT),] + } + + #not sure how to handle output... + #see notes + out <- as.data.frame(out) + rsp_build_respeciate(out) +} + + + + diff --git a/R/sp.pls.R b/R/sp.pls.R new file mode 100644 index 0000000..e72ffbb --- /dev/null +++ b/R/sp.pls.R @@ -0,0 +1,1045 @@ +#' @name sp.pls +#' @title (re)SPECIATE profile Positive Least Squares +#' @aliases sp_pls_profile pls_report pls_refit_species pls_fit_parent +#' pls_plot pls_plot_species pls_plot_profile + +#' @description Functions for Positive Least Squares (PSL) fitting of +#' (re)SPECIATE profiles + +#' @description +#' \code{sp_pls_profile} builds PSL models for supplied profile(s) using +#' the \code{\link{nls}} function, the 'port' algorithm and a lower +#' limit of zero for all model outputs to enforce the positive fits. The +#' modeled profiles are typically from an external source, e.g. a +#' measurement campaign, and are fit as a linear additive series of reference +#' profiles, here typically from (re)SPECIATE, to provide a measure of +#' source apportionment based on the assumption that the profiles in the +#' reference set are representative of the mix that make up the modeled +#' sample. The \code{pls_} functions work with \code{sp_pls_profile} +#' outputs, and are intended to be used when refining and analyzing +#' these PLS models. See + +#' @param x A \code{respeciate} object, a \code{data.frame} of +#' profiles in standard long form, intended for PLS modelling. +#' @param ref A \code{respeciate} object, a \code{data.frame} of +#' profiles also in standard long form, used as the set of candidate +#' source profiles when fitting \code{x}. +#' @param power A numeric, an additional factor to be added to +#' weightings when fitting the PLS model. This is applied in the form +#' \code{weight^power}, and increasing this, increases the relative +#' weighting of the more heavily weighted measurements. Values in the +#' range \code{1 - 2.5} are sometimes helpful. +#' @param ... additional arguments, currently ignored. +#' @param pls A \code{sp_pls_profile} output, only used by \code{pls_} +#' functions. +#' @param name Character class, for \code{pls_refit_species} only, the +#' name of the species to refit. All other species in the reference +#' profiles are held 'as is' and the \code{name}d species is refit based on +#' previous PLS estimated source contribution time-series, then these are +#' in turn refit using the new profiles generated by re-distributing the +#' \code{name}d species. +#' @param parent (for \code{pls_fit_parent} only) a data.frame of measurements +#' of an additional species. This is treated as the sample parent (or a proxy +#' for this), e.g. the total particulate matter (PM) measurement for +#' (percentage) PM profiles. It is added to \code{x}, dummy entries are added +#' to \code{ref} with a start value of 100 (percent), and the model is refitted. +#' @param start,lower,upper all numeric, the start value and lower and upper +#' boundaries when fitting a parameter. +#' @param species,profile (for \code{pls_plot}s only) numeric or character +#' identifying the species or profile to plot. If numeric, these are treated +#' as indices of the species or profile, respectively, in the PLS model; if +#' character, species is treated as the name of species and profile is treated +#' as the profile code. Both can be concatenation to product multiple plots and +#' the special case \code{-1} is a short cut to all species or profiles, +#' respectively. +#' @param type (for \code{pls_plot}s only) numeric, the plot type if +#' multiple options are available. +#' @param log (for \code{pls_plot_profile} only) logical, if \code{TRUE} this +#' applies 'log' scaling to the primary Y axes of the plot. + +######################### +# need to check terminology for this... +# The zero handling is a based on offset in plot(..., log="y", off.set) +# but automatically estimated... + +#' @return \code{sp_pls_profile} returns a list of nls models, one per +#' profile/measurement set in \code{x}. The \code{pls_} functions work with +#' these outputs. \code{pls_report} generates a \code{data.frame} of +#' model outputs, and is used of several of the other \code{pls_} +#' functions. \code{pls_refit_species} and \code{pls_fit_parent} +#' return the supplied \code{sp_pls_profile} output, updated on the +#' basis of the \code{pls_} function action. \code{pls_plot}s produce +#' various plots commonly used in source apportionment studies. + +#' @note This implementation of PLS applies the following modeling constraints: +#' +#' 1. The generate a model of \code{x} that is positively constained linear +#' product of the profiles in \code{ref}, so outputs can only be +#' zero or more. Although the model is generated using \code{\link{nls}}, +#' which is a Nonlinear Least Squares (NLS) model, the fitting term applied +#' in this case is linear. +#' +#' 2. The number of species in \code{x} must be more that the number of +#' profiles in \code{ref}, to reduce the likelihood of over-fitting. +#' +#' + +# GENERAL NOTES + +# TO DO +# link to CMB as crude form of CMB and reference? + +# these all need code tidying + +# check individual function notes + + +############################ +############################ +## sp_pls_profile +############################ +############################ + +#' @rdname sp.pls +#' @export + +## now imports from xxx.r +## #' @import data.table + +#This is version 2 + +#version 1 combined version2 and pls_report +#now separated because it simplified pls model reworking + +#currently keeping the function args +# might not need to do this BUT +# model does not seem to be tracking them when ... + +# check power handling is right + +sp_pls_profile <- function(x, ref, + power = 1, + ...){ + + ################## + #from rough code + ################## + + ######################## + #only allowing profiles < species + if(length(unique(ref$PROFILE_CODE)) >= length(unique(x$SPECIES_ID))){ + stop("sp_pls: need #.species > #.profiles, more species or less profiles??", + call. = FALSE) + } + + x.args <- list(...) + + #################### + #make sure we only have one species / profile + #################### + #tidying + .pr.cd <- unique(x$PROFILE_CODE) + ## .xx <- respeciate:::rsp_tidy_profile(x) + .xx <- lapply(.pr.cd, function(y){ + .x <- x[x$PROFILE_CODE==y,] + .x <- sp_average_profile(.x, y, .x$PROFILE_NAME[1]) + .x + }) + .xx <- data.table::rbindlist(.xx) + #should be same! redundant + .pr.cd <- unique(.xx$PROFILE_CODE) + + #################### + #reduce ref to just species in x + ################### + #no point to look at any species not in x + ref <- subset(ref, SPECIES_ID %in% unique(.xx$SPECIES_ID)) + + ################### + #nudge + ################### + #dropping nudge from version 2 + ## + #nb: method was nudge before analysis + #and a nudge back after + # nudge(identified.species)->pls->report->nudge back(identified.species) + + #if(!is.null(nudge)){ + # for(i in nudge){ + # #ref might have both WEIGHT_PERCENT and .value + # ref[ref$SPECIES_NAME==i, "WEIGHT_PERCENT"] <- + # ref[ref$SPECIES_NAME==i, "WEIGHT_PERCENT"] * 10 + # .xx[.xx$SPECIES_NAME==i, "WEIGHT_PERCENT"] <- + # .xx[.xx$SPECIES_NAME==i, "WEIGHT_PERCENT"] * 10 + # .xx[.xx$SPECIES_NAME==i, ".value"] <- + # .xx[.xx$SPECIES_NAME==i, ".value"] * 10 + # } + #} + + ############################## + #main step/ once per profile + ############################## + #can we replace this with data.table + ans <- lapply(.pr.cd, function(y){ + .test <- try({ + #need to try this because it does not always work + .x <- as.data.frame(.xx[.xx$PROFILE_CODE==y,]) + .x <- sp_average_profile(.x, "test", "1_test") + + #might not need one of this-and-same-above + #might be better doing it here... + .tmp <- subset(ref, ref$SPECIES_ID %in% unique(.x$SPECIES_ID)) + + #could change this with rbindlist version?? + .ref <- intersect(names(.x), names(.tmp)) + .out <- rbind(.x[.ref], .tmp[.ref]) + .out <- sp_dcast_profile(.out) + + #build formula and model args + .tmp <- names(.out) + .tmp <- .tmp[!.tmp %in% c("SPECIES_ID", "SPECIES_NAME", "test")] + #zero cases for port function + .ls <- paste("m_", .tmp, sep="") + .ls2 <- lapply(.ls, function(x){0}) + names(.ls2) <- .ls + .for <- paste("(m_", .tmp, "*`", .tmp, "`)", sep="", collapse = "+") + .for <- as.formula(paste("test~", .for)) + .wt <- 1/.out$test + ############################ + #note + ############################ + #nls wants lower and upper as vectors + #but seems to handle lists + # should check how this is done? + # might not translate sesnibly... + # pass upper, default INF??? + args <- list(formula = .for, + data=.out, + start=.ls2, + lower=.ls2, + weights=.wt, + algorithm="port", + control=nls.control(tol=1e-5)) + args <- modifyList(args, x.args[names(x.args) %in% names(args)]) + args$weights <- args$weights^power + x.args <- list(power=power) + + #run nls/pls + ##################### + mod <- do.call(nls, args) +# mod <- nls(.for, data=.out, +# weights = (1/.out$test)^power, # think about weighting +# start=.ls2, lower=.ls2, +# algorithm="port", +# control=nls.control(tol=1e-5) #think about tolerance +# ) + + #if we need to calculate AIC on a case-by-case basis... + #for model, I think we need to use stats:::logLik.nls for AIC calc... + #see + #https://stackoverflow.com/questions/39999456/aic-on-nls-on-r + #(currently calculating AIc on the lm model on the overall fit on + # all species in all profiles as part of pls_report) + + ################################### + #currently all-data stats in pls_report + # and returning list of models + ################################### + ##.tmp <- summary(mod)$coefficients + ##.p.mod <- .tmp[,4] + ##names(.p.mod) <- gsub("m_", "p_", names(.p.mod)) + ##.out <- data.frame(PROFILE_CODE = y, + ## t(.tmp[,1]), + ## t(.p.mod)) + ##.out + + #output list of mod + data + ################################ + #could add args? + # then drop power from pls_ function formals + # or allow as an overwrite only... + list(mod=mod, #model outputs + args=args, #model args + x.args=x.args) #rsp args + }, silent = TRUE) + if(class(.test)[1]=="try-error"){ + NULL + } else { + .test + } + }) + names(ans) <- .pr.cd + + #returns the list of nls models + #(assuming all viable, one per profile_code in x) + + #to think about object type??? + return(ans) + +} + + +############################# +############################# +## pls_report +############################# +############################# + +#' @rdname sp.pls +#' @export + +## now imports from xxx.r +## #' @import data.table + +# this is the model report table +# other pls_ functions use output +# so take care if changes + +pls_report <- function(pls){ + + ans <- lapply(names(pls), function(x){ + .xx <- pls[[x]] + .out <- .xx$args$data + .tmp <- summary(.xx$mod)$coefficients + .p.mod <- .tmp[,4] + names(.p.mod) <- gsub("m_", "p_", names(.p.mod)) + .out <- data.frame(PROFILE_CODE = x, + .out, + t(.tmp[,1]), + t(.p.mod), + pred = predict(.xx$mod, newdata=.xx$args$data), + check.names=FALSE) + .out + }) + ans <- data.table::rbindlist(ans, use.names=TRUE, fill=TRUE) + ##################### + #think about + ##################### + # adding x_[profile] (m_[profile] * profile) calculations here + # currently done on fly in some plots... + ans$.value <- ans$test + mod2 <- lm(pred~.value, data=ans) + ans$adj.r.sq <- summary(mod2)$adj.r.squared + ans$slope <- summary(mod2)$coefficients[2,1] + ans$p.slope <- summary(mod2)$coefficients[2,4] + ans$intercept <- summary(mod2)$coefficients[1,1] + ans$p.intercept <- summary(mod2)$coefficients[1,4] + ########### + #(also noted in sp_pls_profile) + #if we need to calculate it on a case-by-case basis... + #need to read this: + #https://stackoverflow.com/questions/39999456/aic-on-nls-on-r + #see stats:::logLik.nls for AIC calc... + ans$AIC <- AIC(mod2) + + as.data.frame(ans) +} + + + +########################### +########################### +## pls_refit_species +########################### +########################### + +#' @rdname sp.pls +#' @export + +## now imports from xxx.r +## #' @import data.table + +# like to drop power from formals +# maybe ignore or pass overwrites via ...? + +# need to update the model handling so it is like sp_pls_profile +# this would sort power issue above +# also means the user can change setting themselves +# THINK ABOUT THIS +# they could make a pls that was not positively constrained + + + +pls_refit_species <- function(pls, name, power=1, + ...){ + .xx <- pls_report(pls) + #name might want to be case-non-sensitive at some point + #think about how to do this one... + .data <- .xx[.xx$SPECIES_NAME==name,] + #get and hold all the m_ values + #update profile contributions for named species + .ms <- names(.data)[grepl("^m_", names(.xx))] + .xs <- gsub("^m_", "", .ms) + .for <- paste("(`", .ms, "`*`", .xs, "`)", + sep="", collapse = "+") + .for <- as.formula(paste("test~", .for)) + .da <- .data[!names(.data) %in% .xs] + + + .ls <- lapply(.xs, function(x){0}) + names(.ls) <- .xs + + ################# + #user might want to set this??? + + .ls2 <- lapply(.xs, function(x){.data[1, x]}) + names(.ls2) <- .xs + + mod <- nls(.for, data=.da, + #weights = 1/(.out$test^push), # think about weighting + start=.ls2, lower=.ls, + algorithm="port", + control=nls.control(tol=1e-5) #think about tolerance + ) + + .data[.xs] <- data.frame(t(coefficients(mod))) + + #lazy + .ans <- .data + + for(i in .ans$PROFILE_CODE){ + .ii <- subset(.ans, PROFILE_CODE==i) + .ii <- .ii[names(.ii) %in% names(pls[[i]]$args$data)] + .sp.ord <- unique(pls[[i]]$args$data$SPECIES_ID) + pls[[i]]$args$data <- subset(pls[[i]]$args$data, SPECIES_NAME!=name) + pls[[i]]$args$data <- rbind(pls[[i]]$args$data, .ii) + #put back in right order + pls[[i]]$args$data <- + pls[[i]]$args$data[order(ordered(pls[[i]]$args$data$SPECIES_ID, + levels=.sp.ord)),] + #rebuild model + .for <- as.character(formula(pls[[i]]$mod)) + .for <- as.formula(paste(.for[2], .for[1], .for[3], sep="")) + .ms <- names(pls[[i]]$args$data) + .ms <- .ms[!.ms %in% c("SPECIES_ID", "SPECIES_NAME", "test")] + .ls <- lapply(.ms, function(x){0}) + names(.ls) <- paste("m_", .ms, sep="") + .da <- pls[[i]]$args$data + + pls[[i]]$mod <- nls(.for, data=.da, + weights = (1/.da$test)^power, # think about weighting + start=.ls, lower=.ls, + algorithm="port", + control=nls.control(tol=1e-5, + warnOnly = TRUE) #think about tolerance + ) + } + + pls + +} + + + + +#################################### +#################################### +## pls_fit_parent +#################################### +#################################### + + +#' @rdname sp.pls +#' @export + +## now imports from xxx.r +## #' @import data.table + +############################# +#this needs a lot of work +############################# + + +# (like pls_refit_species) +# like to drop power from formals +# maybe ignore or pass overwrites via ...? + +# need to update the model handling so it is like sp_pls_profile +# this would sort power issue above +# also means the user can change setting themselves +# THINK ABOUT THIS +# they could make a pls that was not positively constrained +# this would also remove the start, lower and upper options +# from the formals... + +# parent could already be in x +# then parent could just be the name of parent??? + +# also a case for using this to add a non-parent to x +# e.g. pls_fit_unknown_species... +# to fit a species to the existing model as a source apportion of +# that unknown... +# in which case maybe this should just be a wrapper for that +# with the start, lower and upper like below + +# if we are setting start and lower +# start = lower if start is missing might be safer... + + +pls_fit_parent <- function(pls, parent, power=1, + start=100, + lower=50, upper=200, ...){ + + .out <- pls_report(pls) + #parent should only have one species + #and have same profiles as pls model data + #and its contribution to all sources is set by .value + .out <- subset(.out, SPECIES_ID == unique(.out$SPECIES_ID)[1]) + .test <- c("PROFILE_CODE", ".value", "WEIGHT_PERCENT") + .test <- names(parent)[names(parent) %in% .test] + .data <- parent[.test] + names(.data)[2] <- "parent" + .data <- merge(.out, .data[c(1:2)]) + + #formula + .ms <- names(.data)[grepl("^m_", names(.out))] + .for <- paste("(`", .ms, "`*`", gsub("^m_", "n_", .ms), "`)", + sep="", collapse = "+") + .for <- as.formula(paste("parent~", .for)) + + .ns <- .ms + names(.ns) <- gsub("^m_", "n_", .ms) + .ls <- lapply(.ns, function(x){start}) + .ls2 <- lapply(.ns, function(x){lower}) + .ls3 <- lapply(.ns, function(x){upper}) + + mod <- nls(.for, data=.data, + #weights = (1/.out$test)^power, # think about weighting + start=.ls, + lower=.ls2, + upper=.ls3, + algorithm="port", + control=nls.control(tol=1e-5) #think about tolerance + ) + .ans <- data.frame( + PROFILE_CODE = .data$PROFILE_CODE, + SPECIES_ID = parent$SPECIES_ID[1], + SPECIES_NAME = parent$SPECIES_NAME[1], + t(coefficients(mod)), + test = .data$parent + ) + names(.ans) <- gsub("^n_", "", names(.ans)) + for(i in .ans$PROFILE_CODE){ + .ii <- subset(.ans, PROFILE_CODE==i) + .ii <- .ii[names(.ii) != "PROFILE_CODE"] + pls[[i]]$args$data <- + rbind(pls[[i]]$args$data, .ii) + #rebuild model + .for <- as.character(formula(pls[[i]]$mod)) + .for <- as.formula(paste(.for[2], .for[1], .for[3], sep="")) + .ms <- names(pls[[i]]$args$data) + .ms <- .ms[!.ms %in% c("SPECIES_ID", "SPECIES_NAME", "test")] + .ls <- lapply(.ms, function(x){0}) + names(.ls) <- paste("m_", .ms, sep="") + .da <- pls[[i]]$args$data + + pls[[i]]$mod <- nls(.for, data=.da, + weights = (1/.da$test)^power, # think about weighting + start=.ls, lower=.ls, + algorithm="port", + control=nls.control(tol=1e-5) #think about tolerance + ) + } + + pls + +} + + +#################################### +################################### +## pls_plots +################################### +################################### + +#these are all draft + + +#################################### +#################################### +## pls_plot +#################################### +#################################### + + +#' @rdname sp.pls +#' @export + +## now imports from xxx.r +## #' @import data.table + +############################# +#this needs a lot of work +############################# + +# this uses unexported rsp_profile_pie function below... +# both pls_plot and rsp_profile_pie need work... + + +pls_plot <- function(pls, species=1, type=1,...){ + + #main summary plot + #require(data.table) + + #to do + ############################ + + #general + + #tidy code; + # this was put together fast... + # lots of code can be tidied/simplified... + + #needs external plot control... + # arg passing to barplot in ...??? + # at least col control??? + + #type 1 + ############################ + #like x axes to look like a conventionally numeric + #like option for x axes to be mapped onto another + # measurement + # currently an index based on PROFILE_CODE + # maybe as.x or map.x, etc... + + + #type =2 for pie plot... + ############################# + #using modification of pie + # unexported/below + # see notes there... + + + + dat <- pls_report(pls) + .sp.ref <- unique(dat$SPECIES_NAME) + if(missing(species)){ + species <- .sp.ref[1] + } + if(is.numeric(species)){ + if(all(species == -1)){ + species <- .sp.ref + } else { + species <- .sp.ref[species] + } + } + + .sp.ord <- unique(dat$SPECIES_ID) #only need this if I + .sp.m.pro <- names(dat)[grep("^m_", names(dat))] + .sp.pro <- gsub("^m_", "", .sp.m.pro) + + #could put this is pls_report??? + for(i in .sp.pro){ + dat[, paste("x_", i, sep="")] <- dat[, paste("m_", i, sep="")] * dat[,i] + } + + .sp.x.pro <- names(dat)[grep("^x_", names(dat))] + + .rep <- dat[c("SPECIES_NAME", "SPECIES_ID", "PROFILE_CODE", .sp.x.pro)] + .rep <- melt(as.data.table(.rep), + id=c("SPECIES_ID", "SPECIES_NAME", "PROFILE_CODE")) + + .tot <- as.data.table(dat) + .cs <- c(".value", "pred", .sp.x.pro) + .tot <- .tot[, lapply(.SD, function(x) sum(x, na.rm=TRUE)), + .SDcols= .cs, by=c("SPECIES_ID", "SPECIES_NAME")] + + for(i in species){ + + .rep2 <- as.data.frame(subset(.rep, SPECIES_NAME==i)) + .rep2$.index <- as.character(.rep2$PROFILE_CODE) + .tot2 <- as.data.frame(subset(.tot, SPECIES_NAME==i)) + + if(1 %in% type){ + .rep2$.index <- as.numeric(ordered(.rep2$.index, levels=unique(.rep2$.index))) + .rep2$.index <- ordered(.rep2$.index, levels=unique(.rep2$.index)) + #.cols <- palette.colors(n = length(.sp.x.pro), palette = "Okabe-Ito", + # recycle = FALSE) + .cols <- heat.colors(n=length(.sp.x.pro)) + .ncol <- ceiling(length(.sp.x.pro)/3) + .leg.text <- gsub("^x_", "", .sp.x.pro) + .bar <- barplot(value~variable + .index, .rep2, col=.cols, + main = i, ylab="Measurement", + xlab="Sample [index]", + legend.text=.leg.text, + args.legend=list(cex=0.8, + ncol=.ncol)) + .dat <- subset(dat, SPECIES_NAME==i) + lines(.bar, .dat$.value) + } + if(2 %in% type){ + .cols <- heat.colors(n=length(.sp.x.pro)) + .vals <- unlist((.tot2[,.sp.x.pro]/.tot2$.value) * 100) + + .labs=paste(gsub("x_", "", .sp.x.pro), "\n", signif(.vals, 3) , "%", sep="") + print(sum(.vals)) + rsp_profile_pie(.vals, col=.cols, edges=1000, + labels=.labs, main=i) + } + + } + #output + #not sure if dat, .rep or output is most useful?? + invisible(.rep) +} + + +#################################### +#################################### +## pls_plot_species +#################################### +#################################### + + +#' @rdname sp.pls +#' @export + +## now imports from xxx.r +## #' @import data.table + +############################# +#this needs a lot of work +############################# + + +pls_plot_species <- function(pls, species, type=1, ...){ + + #general + + #tidy code; + # this was put together fast... + # lots of code can be tidied/simplified... + + #needs external plot control... + # arg passing to barplot in ...??? + # at least col control??? + + #type 1 + ############################ + + #various pls model plots for species + dat <- pls_report(pls) + .sp.ref <- unique(dat$SPECIES_NAME) + if(missing(species)){ + species <- .sp.ref[1] + } + if(is.numeric(species)){ + if(all(species == -1)){ + species <- .sp.ref + } else { + species <- .sp.ref[species] + } + } + for(i in species){ + d2 <- subset(dat, SPECIES_NAME==i) + lims <- range(c(d2$.value, d2$pred)) + mod <- lm(pred~0+.value, d2) + .sum <- paste("y = ", signif(summary(mod)$coefficients[1,1], 3), + "x (adj.R2 = ", signif(summary(mod)$adj.r.squared, 3), + ")", sep="") + if(1 %in% type){ + plot(d2$.value, d2$pred, type="n", + main=i, + xlab="Measurement", + ylab="Model", + xlim=lims, ylim=lims) + grid() + abline(a=0, b=1, col="grey") + points(d2$.value, d2$pred) + abline(mod, col="red", lty=2) + text(lims[1], lims[2], .sum, adj=c(0,1), cex=0.75) + } + if(2 %in% type){ + plot(d2$.value, type="n", + main=i, + ylab="Measurement", + xlab="Sample [index]", + ylim=lims) + lines(d2$.value) + lines(d2$pred, col="red") + } + } + invisible(NULL) +} + + + +#################################### +#################################### +## pls_plot_profile +#################################### +#################################### + + +#' @rdname sp.pls +#' @export + +## now imports from xxx.r +## #' @import data.table + +############################# +#this needs a lot of work +############################# + + +pls_plot_profile <- function(pls, profile, log=FALSE, ...){ + + #general + + #tidy code; + # this was put together fast... + # lots of code can be tidied/simplified... + + #needs external plot control... + # arg passing to barplot in ...??? + # at least col control??? + + #type + ############################ + #(currently only one so not needed...) + # + + #log logical ONLY + ############################ + # very fiddly... + # + + #require(data.table) + + dat <- pls_report(pls) + #need to reset species order after calculating stats + # think about making first half of this function a dedicated function?? + + #need to think this through + # there is a lot here that is very painful... + + #could pass plot commands via the ... args + + .sp.ord <- unique(dat$SPECIES_ID) + .sp.m.pro <- names(dat)[grep("^m_", names(dat))] + .sp.pro <- gsub("^m_", "", .sp.m.pro) + if(missing(profile)){ + profile <- .sp.pro[1] + } + if(is.numeric(profile)){ + if(all(profile == -1)){ + profile <- .sp.pro + } else { + profile <- .sp.pro[profile] + } + } + m_profile <- paste("m_", profile, sep="") + + #just profiles asked for + dat <- dat[c("SPECIES_ID", "SPECIES_NAME", "PROFILE_CODE", + profile, m_profile, "pred", ".value")] + #calculate percentage of species + for(i in profile){ + dat[, paste("x_", i, sep="")] <- dat[, paste("m_", i, sep="")] * dat[,i] + } + .rep <- data.table::as.data.table(dat) + .cols <- c(".value", "pred", paste("x_", profile, sep="")) + .rep <- .rep[, lapply(.SD, function(x) sum(x, na.rm=TRUE)), + .SDcols= .cols, by=c("SPECIES_ID", "SPECIES_NAME")] + .rep <- as.data.frame(.rep) + for(i in profile){ + .rep[, paste("pc_", i, sep="")] <- (.rep[, paste("x_", i, sep="")] / .rep$pred) * 100 + } + + dat <- dat[dat$PROFILE_CODE==unique(dat$PROFILE_CODE)[1],] + dat <- merge(.rep, dat[c("SPECIES_ID", "SPECIES_NAME", "PROFILE_CODE", profile)],) + dat <- dat[order(ordered(dat$SPECIES_ID, levels=.sp.ord)), ] + rownames(dat) <- 1:nrow(dat) + + + for(i in profile){ + d2 <- dat[c("SPECIES_ID", "SPECIES_NAME", "PROFILE_CODE", + i, paste("pc_", i, sep=""))] + + if(log){ + #plotting first axis as logs + .f.cex = 0.8 + .x <- d2$SPECIES_NAME + .m.x <- max(nchar(.x), na.rm=T)/1.3*.f.cex + .y <- d2[,i] + .min <- min(.y[.y>0], na.rm=TRUE)/2 + .y[.y<.min] <- .min + .y[is.na(.y)] <- .min + .y <- log10(.y) + .y1.at <- pretty(.y) + .y1.at <- .y1.at[.y1.at == round(.y1.at)] + #need to go back in and recalculate min as + #bottom of pretty... + .y[.y==log10(.min)] <- .y1.at[1] + + #.y1.lb <- as.character(10^(.y1.at)) + .y1.lb <- c(format(10^(.y1.at), drop0trailing = TRUE, scientific=FALSE)) + .y <- .y - min(.y1.at) + .y1.at <- .y1.at - min(.y1.at) + + .yl <- "Source Loading" + .y2 <- d2[, paste("pc_", i, sep="")] + .y2 <- ((.y2/100) * (max(.y1.at) - min(.y1.at))) + min(.y1.at) + .y2.lb <- seq(0, 100, by=20) + .y2.at <- ((.y2.lb/100) * (max(.y1.at) - min(.y1.at))) + min(.y1.at) + + } else { + #plotting first axis as is + .f.cex = 0.8 + .x <- d2$SPECIES_NAME + .m.x <- max(nchar(.x), na.rm=T)/1.3*.f.cex + .y <- d2[,i] + .y1.at <- pretty(.y) + .y1.lb <- .y1.at + .yl <- "Source Loading" + .y2 <- d2[, paste("pc_", i, sep="")] + .y2 <- ((.y2/100) * (max(.y1.at) - min(.y1.at))) + min(.y1.at) + .y2.lb <- seq(0, 100, by=20) + .y2.at <- ((.y2.lb/100) * (max(.y1.at) - min(.y1.at))) + min(.y1.at) + + } + + #lims <- range(d2[,4], na.rm=TRUE) + #.min <- min(d2[,4][d2[,4]>0], na.rm=TRUE) + + .old.par <- par(no.readonly = TRUE) + par(mar = c(.m.x, 4, 4, 4) + 0.3) + .bar <- barplot(.y, + names.arg=.x, + ylim = range(.y1.at), + main=i, + cex.lab=.f.cex, + cex.names=.f.cex, + axes=FALSE, + ylab=.yl, + las=2 + ) + points(.bar, .y2, col="red", pch=19) + axis(2, ylim=range(.y1.at), at=.y1.at, labels=.y1.lb, cex=.f.cex, + cex.axis=.f.cex, las=2) + axis(4, ylim=range(0, max(.y2.at)), at=.y2.at, labels=.y2.lb, + col="red", col.axis="red", cex=.f.cex, + cex.axis=.f.cex, las=2) + mtext("Percent of Total Contribution (%)", side=4, line=3, col ="red", + cex=.f.cex) + par(.old.par) + + + } + invisible(dat) +} + + + + + + + + + + + + + +################ +################ +## unexported +################ +################ + + +#think about +####################################### +# printing amount missing as a segment +# adding plot arg control like in plot.respeciate +# adding args to change the displacement of labels + +rsp_profile_pie <- function (x, labels = names(x), edges = 200, radius = 0.8, + clockwise = FALSE, + init.angle = if (clockwise) 90 else 0, + density = NULL, angle = 45, col = NULL, + border = NULL, lty = NULL, main = NULL, ...) +{ + #this is graphics::pie with a couple of modifications... + #many thanks to... + #R Core Team (2023). _R: A Language and Environment for Statistical Computing_. R Foundation + #for Statistical Computing, Vienna, Austria. . + + if (!is.numeric(x) || any(is.na(x) | x < 0)) + stop("'x' values must be positive.") + if (is.null(labels)) + labels <- as.character(seq_along(x)) + else labels <- as.graphicsAnnot(labels) + + #added to remove any source with a zero contribution + if(any(x==0)){ + labels <- labels[x!=0] + x <- x[x!=0] + } + my.tot <- sum(x) + if(my.tot < 99){ + x <- c(x, 100-my.tot) + labels <- c(labels, "[hide]") + col <- c(col, NA) + init.angle <- init.angle + (((100-my.tot)/200)*360) + } + + x <- c(0, cumsum(x)/sum(x)) + dx <- diff(x) + nx <- length(dx) + plot.new() + pin <- par("pin") + xlim <- ylim <- c(-1, 1) + if (pin[1L] > pin[2L]) + xlim <- (pin[1L]/pin[2L]) * xlim + else ylim <- (pin[2L]/pin[1L]) * ylim + dev.hold() + on.exit(dev.flush()) + plot.window(xlim, ylim, "", asp = 1) + if (is.null(col)) + col <- if (is.null(density)) + c("white", "lightblue", "mistyrose", "lightcyan", + "lavender", "cornsilk") + else par("fg") + if (!is.null(col)) + col <- rep_len(col, nx) + if (!is.null(border)) + border <- rep_len(border, nx) + if (!is.null(lty)) + lty <- rep_len(lty, nx) + angle <- rep(angle, nx) + if (!is.null(density)) + density <- rep_len(density, nx) + twopi <- if (clockwise) + -2 * pi + else 2 * pi + t2xy <- function(t) { + t2p <- twopi * t + init.angle * pi/180 + list(x = radius * cos(t2p), y = radius * sin(t2p)) + } + for (i in 1L:nx) { + + if(!as.character(labels[i]) == "[hide]"){ + n <- max(2, floor(edges * dx[i])) + P <- t2xy(seq.int(x[i], x[i + 1], length.out = n)) + #changed shape to include hole + polygon(c(P$x, rev(P$x*0.5)), c(P$y, rev(P$y*0.5)), density = density[i], angle = angle[i], + border = border[i], col = col[i], lty = lty[i]) + P <- t2xy(mean(x[i + 0:1])) + lab <- as.character(labels[i]) + if (!is.na(lab) && nzchar(lab)) { + # 1.1 and 1.2 are the extenders when moving labels way from + # the pie plot itself + lines(c(1, 1.1) * P$x, c(1, 1.1) * P$y) + text(1.2 * P$x, 1.2 * P$y, labels[i], xpd = TRUE, + adj = ifelse(P$x < 0, 1, 0), ...) + } + } + } + + text(0,0, label=paste("sum\n",signif(my.tot, 3), "%", sep="")) + title(main = main, ...) + invisible(NULL) +} + + diff --git a/R/sp.rescale.R b/R/sp.rescale.R new file mode 100644 index 0000000..f617c4e --- /dev/null +++ b/R/sp.rescale.R @@ -0,0 +1,330 @@ +#' @name sp.rescale +#' @title (re)SPECIATE profile rescaling functions +#' @aliases sp_rescale sp_rescale_profile sp_rescale_species + +#' @description Functions for rescaling + +#' @description \code{sp_rescale} rescales the percentage weight records in +#' a supplied (re)SPECIATE profile data set. This can be by profile or species +#' subsets, and \code{sp_rescale_profile} and \code{sp_rescale_species} provide +#' short-cuts to these options. +#' @param x A \code{respeciate} object, a \code{data.frame} of re(SPECIATE) +#' profiles. +#' @param method numeric, the rescaling method to apply: +#' 1 \code{x/total(x)}; +#' 2 \code{x/mean(x)}; +#' 3 \code{x-min(x)/max(x)-min(x)}; +#' 4 \code{x-mean(x)/sd(x)}; +#' 5 \code{x/max(x)}. +#' The alternative 0 returns the records to their original +#' values. +#' @param by character, when rescaling \code{x} with +#' \code{\link{sp_rescale}}, the data type to group and rescale, +#' currently \code{'species'} (default) or \code{'profile'}. +#' @return \code{sp_rescale} and \code{sp_rescale} return the +#' \code{respeciate} profile with the percentage weight records rescaled using +#' the requested method. See Note. +#' @note Data sometimes needs to be normalised, e.g. when applying some +#' statistical analyses. Rather than modify the EPA records in the +#' \code{WEIGHT_PERCENT} column, \code{respeciate} creates a duplicate column +#' \code{.value} which is modified by operations like \code{sp_rescale_profile} +#' and \code{sp_rescale_species}. This means rescaling is always applied to +#' the source information, rather than rescaling an already rescaled value, +#' and the EPA records are retained unaffected. So, the original source +#' information can be easily recovered. +#' @references +#' Dowle M, Srinivasan A (2023). data.table: Extension of `data.frame`. +#' R package version 1.14.8, \url{https://CRAN.R-project.org/package=data.table}. + +#NOTE + +#' @rdname sp.rescale +#' @export +## #' @import data.table +# now done in xxx.r + +# may need to set data.table specifically?? +# data.table::as.data.table, etc?? + + +###################### +#rescale by profile +###################### + +########################## +#issues +############################ + +#think about... + +# maybe rethink options for sp_rescale() when user sets by but NOT method... + +# not actually sure anyone would want anything but method 1 for profiles... + + + +sp_rescale <- function(x, method = 2, by = "species"){ + + ################################# + #check x is a respeciate object?? + + #check it has .value + x <- rsp_tidy_profile(x) + + #save to return as is.. + # thinking about this + tmp <- class(x) + + #was testing/thinking about + ################################# + #backdoor + ################################# + #could save species and profile identifiers, and + #weight_percent then overwrite weight_percent with .value here + #then copy back weight_percent after calculations + # then you could do multiple rescales without touching + # weight_percent??? + + + ################################# + #calculate stats + xx <- as.data.table(x) + #remove stats if there... + test <- c(".min",".max",".total", ".mean", ".na", ".n", ".sd") + test <- test[ test %in% colnames(xx)] + if(length(test)>0){ + xx <- xx[, (test) := NULL] + } + + #and recalculate + + #stop if by option not known. + if(!by %in% c("species", "profile")){ + stop("unknown 'by' option", + call.= FALSE) + } + + if(by == "profile"){ + out <- xx[, + .(#SPECIES_NAME = SPECIES_NAME[1], + #SPEC_MW = SPEC_MW[1], + .min = min(WEIGHT_PERCENT, na.rm = TRUE), + .max = max(WEIGHT_PERCENT, na.rm = TRUE), + .total = sum(WEIGHT_PERCENT, na.rm = TRUE), + .mean = mean(WEIGHT_PERCENT, na.rm = TRUE), + .na = length(WEIGHT_PERCENT[is.na(WEIGHT_PERCENT)]), + .n = length(WEIGHT_PERCENT[!is.na(WEIGHT_PERCENT)]), + .sd = sd(WEIGHT_PERCENT, na.rm = TRUE) + ), + by=.(PROFILE_CODE)] + + out <- merge(xx, out, by="PROFILE_CODE", all.x=TRUE, all.y=FALSE, + allow.cartesian=TRUE) + } + if(by == "species"){ + out <- xx[, + .(.min = min(WEIGHT_PERCENT, na.rm = TRUE), + .max = max(WEIGHT_PERCENT, na.rm = TRUE), + .total = sum(WEIGHT_PERCENT, na.rm = TRUE), + .mean = mean(WEIGHT_PERCENT, na.rm = TRUE), + .na = length(WEIGHT_PERCENT[is.na(WEIGHT_PERCENT)]), + .n = length(WEIGHT_PERCENT[!is.na(WEIGHT_PERCENT)]), + .sd = sd(WEIGHT_PERCENT, na.rm = TRUE) + ), + by=.(SPECIES_ID)] + + out <- merge(xx, out, by="SPECIES_ID", all.x=TRUE, all.y=FALSE, + allow.cartesian=TRUE) + } + + + ################# + #need to decide how to handle this + ################### + #this is a little messy + + #rather leave weight-percent untouched + # make a local .value column if not there + # then have rescale reset that + # using weight-percent as source + + #not sure of a best plan for method arg + # how to handling...? + # how much user control...? + + #other options + # any more??? + + #might need an option to get all WEIGHT_PERCENT back from database... + + if(!method %in% c(0:5)){ + stop("unknown method") + } + if(method==0){ + out$.value<- out$WEIGHT_PERCENT + } + if(method==1){ + out$.value<- out$WEIGHT_PERCENT / out$.total + } + if(method==2){ + out$.value<- out$WEIGHT_PERCENT /out$.mean + } + if(method==3){ + #might be an issue if only one value + out$.value <- (out$WEIGHT_PERCENT - out$.min) / (out$.max - out$.min) + } + if(method==4){ + #might be an issue if only one value + out$.value <- (out$WEIGHT_PERCENT - out$.mean) / out$.sd + } + if(method==5){ + out$.value<- out$WEIGHT_PERCENT / out$.max + } + + ############################ + #backdoor end + ############################ + #this would need careful thinking about + # handling for method = 0 might be an issue + # maybe a sp_reset() would be better + # remove weight_percent and apply sp_pad(out, "weight")? + + #output + # + out <- as.data.frame(out) + class(out) <- tmp + out +} + + +#' @rdname sp.rescale +#' @export + +sp_rescale_profile <- function(x, method = 1, by ="profile"){ + sp_rescale(x=x, method=method, by=by) +} + +#' @rdname sp.rescale +#' @export + +sp_rescale_species <- function(x, method = 2, by ="species"){ + sp_rescale(x=x, method=method, by=by) +} + + + + +################### +#unexported +################## + + +###################### +#old.rescale data by species +###################### + +########################## +#issues +############################ + +# may need to set data.table specifically?? +# data.table::as.data.table, etc?? + +# may need to think about additional local scaling +# e.g. within in profile [species conc]/[sum of all species concs] + +rsp_rescale_species <- function(x, method = 2){ + + ################################# + #check x is a respeciate object?? + + #check it has .value + x <- rsp_tidy_profile(x) + + #save to return as is.. + # thinking about this + tmp <- class(x) + + ################################# + #calculate stats + xx <- as.data.table(x) + #remove stats if there... + test <- c(".min",".max",".total", ".mean", ".na", ".n", ".sd") + test <- test[ test %in% colnames(xx)] + if(length(test)>0){ + xx <- xx[, (test) := NULL] + } + #and recalculate + out <- xx[, + .(#SPECIES_NAME = SPECIES_NAME[1], + #SPEC_MW = SPEC_MW[1], + .min = min(WEIGHT_PERCENT, na.rm = TRUE), + .max = max(WEIGHT_PERCENT, na.rm = TRUE), + .total = sum(WEIGHT_PERCENT, na.rm = TRUE), + .mean = mean(WEIGHT_PERCENT, na.rm = TRUE), + .na = length(WEIGHT_PERCENT[is.na(WEIGHT_PERCENT)]), + .n = length(WEIGHT_PERCENT[!is.na(WEIGHT_PERCENT)]), + .sd = sd(WEIGHT_PERCENT, na.rm = TRUE) + ), + by=.(SPECIES_ID)] + + out <- merge(xx, out, by="SPECIES_ID", all.x=TRUE, all.y=FALSE, + allow.cartesian=TRUE) + + ################# + #need to decide how to handle this + ################### + #this is a little messy + + #rather leave weight-percent untouched + # make a local .value column if not there + # then have rescale reset that + # using weight-percent as source + + #not sure of a best plan for method arg + # how to handling...? + # how much user control...? + + #other options + # no rescale + # x/sum(x) so all out of 1? + # that might want to only for unique profile codes??? + + if(!method %in% c(0:5)){ + stop("unknown method") + } + if(method==0){ + out$.value<- out$WEIGHT_PERCENT + } + if(method==1){ + out$.value<- out$WEIGHT_PERCENT / out$.total + } + if(method==2){ + out$.value<- out$WEIGHT_PERCENT /out$.mean + } + if(method==3){ + #might be an issue if only one value + out$.value <- (out$WEIGHT_PERCENT - out$.min) / (out$.max - out$.min) + } + if(method==4){ + #might be an issue if only one value + out$.value <- (out$WEIGHT_PERCENT - out$.mean) / out$.sd + } + if(method==5){ + out$.value<- out$WEIGHT_PERCENT / out$.max + } + + #output + # + out <- as.data.frame(out) + class(out) <- tmp + out +} + + + + + + + + diff --git a/R/sp.reshape.R b/R/sp.reshape.R new file mode 100644 index 0000000..4c4d371 --- /dev/null +++ b/R/sp.reshape.R @@ -0,0 +1,369 @@ +#' @name sp.reshape +#' @title (re)SPECIATE profile reshaping functions +#' @aliases sp_dcast sp_dcast_profile sp_dcast_species sp_melt_wide + +#' @description Functions for reshaping (re)SPECIATE profiles + +#' @description \code{sp_dcast} and \code{sp_melt_wide} reshape supplied +#' (re)SPECIATE profile(s). \code{sp_dcast} converts these from their supplied +#' long form to a widened form, \code{dcast}ing the data set by either species +#' or profiles depending on the \code{widen} setting applied. +#' \code{sp_dcast_profile} and \code{sp_dcast_species} are wrappers for these +#' options. \code{sp_melt_wide} attempts to return a previously widened data +#' set to the original long form. +#' @param x A \code{respeciate} object, a \code{data.frame} of re(SPECIATE) +#' profiles in standard long form or widened form for +#' \code{\link{sp_dcast}} and \code{\link{sp_melt_wide}}, respectively. +#' @param widen character, when widening \code{x} with +#' \code{\link{sp_dcast}}, the data type to \code{dcast}, +#' currently \code{'species'} (default) or \code{'profile'}. See Note. +#' @param pad logical or character, when \code{melt}ing a previously widened +#' data set, should output be re-populated with species and/or profile +#' meta-data, discarded when widening. This is currently handled by +#' \code{\link{sp_pad}}. The default \code{TRUE} applies standard settings, +#' so does not include profile sources reference meta-data. (See +#' \code{\link{sp_pad}} for other options). +#' @param drop.nas logical, when \code{melt}ing a previously widened +#' data set, should output be stripped of any rows containing empty +#' weight/value columns. Because not all profile contains all species, the +#' \code{dcast}/\code{melt} process can generate empty rows, and this step +#' attempt account for that when working with standard re(SPECIATE) +#' profiles. It is, however, sometimes useful to check first, e.g. when +#' building profiles yourself. +#' @return \code{sp_dcast} returns the wide form of the supplied +#' \code{respeciate} profile. \code{sp_melt_wide} +#' returns the (standard) long form of a previously widened profile. + +#' @note Conventional long-to-wide reshaping of data, or \code{dcast}ing, can +#' be slow and memory inefficient. So, \code{respeciate} uses the +#' \code{\link[data.table:dcast]{data.table::dcast}} +#' method. The \code{sp_dcast_species} method, +#' applied using \code{widen='species'}, is effectively: +#' +#' \code{dcast(..., PROFILE_CODE+PROFILE_NAME~SPECIES_NAME, value.var="WEIGHT_PERCENT")} +#' +#' And, the alternative \code{widen='profile'}: +#' +#' \code{dcast(..., SPECIES_ID+SPECIES_NAME~PROFILE_CODE, value.var="WEIGHT_PERCENT")} +#' +#' Although, \code{respeciate} uses a local version of \code{WEIGHT_PERCENT} called +#' \code{.value}, so the EPA source information can easily be recovered. See also +#' \code{\link{sp_rescale_profile}}. +#' +#' @references +#' Dowle M, Srinivasan A (2023). _data.table: Extension of `data.frame`_. +#' R package version 1.14.8, . + +#NOTE + +#' @rdname sp.reshape +#' @export + +## now imports from xxx.r +## #' @import data.table + +# may need to set data.table specifically?? +# data.table::as.data.table, etc?? + +#in development sp_melt_wide below + +#maybe think about padding sp_dcast_profile + +###################### +#dcast +#long_to_wide reshape +###################### + +sp_dcast <- function(x, widen = "species"){ + + #################### + #see ?data.table::dcast for examples + #################### + + # currently running with any x + # but likely errors out with anything but a respeciate object... + + #note: should this handle non-respeciate objects? + # maybe not but with a force option to (at own risk) override?? + + #note: thinking about adding formal to set the wide term + # so user can set the dcast term, e.g. ~ species or profile + # name key? wide? variable.name? + + #adds .value if missing + ## using .value rather the WEIGHT_PERCENT in case rescaled + x <- rsp_tidy_profile(x) + + #save class + cls <- class(x) + + xx <- as.data.table(x) + + #stop if widen option not known. + if(!widen %in% c("species", "profile")){ + stop("unknown widen option") + } + if(widen=="species"){ + out <- dcast(xx, + PROFILE_CODE + PROFILE_NAME ~ SPECIES_NAME, + mean, + na.rm=TRUE, + value.var = ".value") + } + if(widen=="profile"){ + out <- dcast(xx, + SPECIES_ID + SPECIES_NAME ~PROFILE_CODE, + mean, + na.rm=TRUE, + value.var = ".value") + } + + #maybe use species_id in species dcast?? + ######################################## + + #then add matched species_names, making this unique?? + # ~species_name makes labeling easier but will be + # merging any duplicated species_names... + # could also add back in other profile info but not species info... + # which means we will loose that it you melt(dcast(...)) + + #output + # just outputting as data.frame at moment + # not sure if this can be respeciate and + # don't want a respeciate wide class + # could use an output arg??? as.is, data.frame, etc... + + out <- as.data.frame(out) + #class(out) <- cls + out +} + +#' @rdname sp.reshape +#' @export + +sp_dcast_profile <- function(x, widen = "profile"){ + sp_dcast(x=x, widen=widen) +} + + + +#' @rdname sp.reshape +#' @export + +sp_dcast_species <- function(x, widen = "species"){ + sp_dcast(x=x, widen=widen) +} + + + + +#' @rdname sp.reshape +#' @export + +## now imports from xxx.r +## #' @import data.table + +# may need to set data.table specifically?? +# data.table::as.data.table, etc?? + +# wanted melt to go with dcast +# curently just to reverse the dcast action +# see e.g. +# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-reshape.html + +#padding needs work +# + +###################### +#melt +#wide_to_long reshape +###################### + +sp_melt_wide <- function(x, pad = TRUE, drop.nas = TRUE){ + + #################### + #see ?data.table::melt for examples + #################### + + #adds .value if missing + ## using .value rather the WEIGHT_PERCENT in case rescaled + x <- rsp_tidy_profile(x) + + #save class + cls <- class(x) + + xx <- as.data.table(x) + + ################# + #test log/wide + #################### + #should be able to simplify this a lot + .test <- c("PROFILE_NAME", "PROFILE_CODE", "SPECIES_ID", "SPECIES_NAME", + "WEIGHT_PERCENT", ".value") + .test <- .test[.test %in% names(xx)] + if(length(.test)>2){ + stop("sp_melt_wide halted; x already looks like a long profile.", call.=FALSE) + } + .test.sp <- length(grep("PROFILE", .test)) + .test.pr <- length(grep("SPECIES", .test)) + if(.test.pr>0 & .test.sp>0){ + stop("sp_melt_wide halted; x already looks suspect.", call.=FALSE) + } + .long <- "bad" + if(.test.pr>0 & length(.test)==.test.pr){ + .id.vars <- .test + .long <- "PROFILE_CODE" + } + if(.test.sp>0 & length(.test)==.test.sp){ + .id.vars <- .test + .long <- "SPECIES_NAME" + } + if(.long=="bad"){ + stop("sp_melt_wide halted; x already looks suspect.", call.=FALSE) + } + + #should only be species.wide or profile.wide + # if we get to here + + out <- melt(xx, id.vars = .id.vars) + names(out)[names(out)=="variable"] <- .long + names(out)[names(out)=="value"] <- ".value" + + #out$WEIGHT_PERCENT <- out$.value + + #merge if padding + ##################### + #might not be best way of doing it + #testing sp_pad as an alternative to previous remarked code??? + # first need to standardise method, decide where to drop.nas, + # finalise formals, decide best data.table methods, etc + + if(is.logical(pad) && pad){ + pad <- "standard" + } + if(is.character(pad)){ + + out <- sp_pad(out) + +# PROFILES <- as.data.table(sysdata$PROFILES) +# SPECIES_PROPERTIES <- as.data.table(sysdata$SPECIES_PROPERTIES) +# if(.long=="PROFILE_CODE"){ +# out <- merge(out, PROFILES, by = .long, all.y=FALSE, +# all.x=TRUE, allow.cartesian=TRUE) +# .tmp <- intersect(names(out), names(SPECIES_PROPERTIES)) +# out <- merge(out, SPECIES_PROPERTIES, by = .tmp, all.y=FALSE, +# all.x=TRUE, allow.cartesian=TRUE) +# } else { +# #.long must be "SPECIES_NAME" +# out <- merge(out, SPECIES_PROPERTIES, by = .long, all.y=FALSE, +# all.x=TRUE, allow.cartesian=TRUE) +# .tmp <- intersect(names(out), names(PROFILES)) +# out <- merge(out, PROFILES, by = .tmp, all.y=FALSE, +# all.x=TRUE, allow.cartesian=TRUE) +# } +# #to get weight_percentage etc +# SPECIES <- as.data.table(sysdata$SPECIES) +# .tmp <- intersect(names(out), names(SPECIES)) +# print(.tmp) +# out <- merge(out, SPECIES, by = .tmp, all.y=FALSE, +# all.x=TRUE, allow.cartesian=TRUE) +# } else { +# #not great but... +# #if not padding WEIGHT_PERCENT has to be .value +# out$WEIGHT_PERCENT <- out$.value + } + #drop.nas... + if(drop.nas){ + if(".value" %in% names(out)){ + out <- out[!is.na(out$.value),] + } else { + if("WEIGHT_PERCENT" %in% names(out)){ + out <- out[!is.na(out$WEIGHT_PERCENT),] + } + #do we want to warn if nothing to strip + #if so, in else here?? + } + } + out <- as.data.frame(out) + + #need to rationalise outputs!!! + rsp_build_respeciate(out) + +} + + +######################## +#unexported, old code and text notes +####################### + +#test data 219 records +#aa <- sp_profile(sp_find_profile("ae6", by="profile_type")) + +#idiot-test reference. + +#I don't think I am using it anywhere... + +######################### +#testing getting rid of this +######################### + +#rsp_build_wide_profile <- function(x){ +# .usp <- unique(x$SPECIES_NAME) +# .upr <- unique(x$PROFILE_CODE) +# ref <- data.frame(t(rep(NA, length(.usp)))) +# names(ref) <- .usp +# ans <- lapply(.upr, function(.pr){ +# temp <- x[x$PROFILE_CODE==.pr,] +# out <- data.frame(t(temp$WEIGHT_PERCENT)) +# names(out) <- temp$SPECIES_NAME +# out<- modifyList(ref, out) +# out$PROFILE_CODE <- .pr +# out$PROFILE_NAME <- temp$PROFILE_NAME[1] +# out +# #ref +# }) +# do.call(rbind, ans) +#} + +#require(dplyr) + +#aa %>% group_by(PROFILE_CODE, SPECIES_ID) %>% +# summarise(PROFILE_NAME = PROFILE_NAME[1], +# SPECIES_NAME = SPECIES_NAME[1], +# SPEC_MW = SPEC_MW[1], +# total = sum(WEIGHT_PERCENT, na.rm=T), +# mean = mean(WEIGHT_PERCENT, na.rm=T), +# sd = sd(WEIGHT_PERCENT, na.rm=T), +# n = length(WEIGHT_PERCENT[!is.na(WEIGHT_PERCENT)])) + +#require(data.table) + +#test_wide <- function(x){ + ##as above + ##################### + ##see ?data.table::dcast for examples + ##################### + #xx <- as.data.table(x) + #out <- dcast(xx, + # PROFILE_CODE + PROFILE_NAME ~SPECIES_NAME, + # mean, + # na.rm=TRUE, + # value.var = "WEIGHT_PERCENT") + ##maybe use species id in dcast + ##then add matched species names, making unique?? + ##you can keep profile but not species info... + #as.data.frame(out) +#} + +#plot(test_wide(aa)$Iron, rsp_build_wide_profile(aa)$Iron) +## ~ 2 seconds + +#aa <- aa <- sp_profile(find_sp_profile("gas", by="profile_type")) +## 2641 profiles +## (full archive 6855) +#plot(test_wide(aa)$Formaldehyde) +## much faster than +#plot(rsp_build_wide_profile(aa)$Formaldehyde) +#plot(test_wide(aa)$Formaldehyde, rsp_build_wide_profile(aa)$Formaldehyde) +##straight y=x +# diff --git a/R/spq.R b/R/spq.R new file mode 100644 index 0000000..7be81c1 --- /dev/null +++ b/R/spq.R @@ -0,0 +1,89 @@ +#' @name spq +#' @title spq_ quick access to common re(SPECIATE) sub-samples +#' @aliases spq_gas spq_other spq_pm spq_pm.ae6 spq_pm.ae8 spq_pm.cr1 +#' spq_pm.simplified + +#' @description \code{spq_} functions are quick access wrappers to commonly +#' requested re(SPECIATE) sub-samples. +#' @return \code{spq_} functions typically return a \code{respeciate} +#' \code{data.frame} of the requested profiles. +#' +#' For example: +#' +#' \code{sqr_gas} returns all gaseous profiles (\code{PROFILE_TYPE == 'GAS'}). +#' +#' \code{sqr_pm} returns all particulate matter (PM) profiles not classified +#' as a special PM type (\code{PROFILE_TYPE == 'PM'}). +#' +#' The special PM types are subsets profiles intended for special +#' applications, and these include \code{sqr_pm.ae6} (type \code{PM-AE6}), +#' \code{sqr_pm.ae8} (type \code{PM-AE8}), \code{sqr_pm.cr1} (type +#' \code{PM-CR1}), \code{sqr_pm.simplified} (type \code{PM-Simplified}) +#' and \code{sqr_other} (\code{PROFILE_TYPE == 'OTHER'}). +#' + + +############################# +#NOTE +############################ + +#might not be keeping these + +#profile types +#GAS, OTHER, PM, PM-AE6 PM-AE8 PM-CR1 PM-Simplified + +#spq_gas, + + +#' @rdname spq +#' @export + +spq_gas <- function(){ + sp_profile(sp_profile_info("gas", by = "profile_type", partial=FALSE)) +} + +#' @rdname spq +#' @export + +spq_other <- function(){ + sp_profile(sp_profile_info("other", by = "profile_type", partial=FALSE)) +} + +#' @rdname spq +#' @export + +spq_pm <- function(){ + sp_profile(sp_profile_info("pm", by = "profile_type", partial=FALSE)) +} + +#' @rdname spq +#' @export + +spq_pm.ae6 <- function(){ + sp_profile(sp_profile_info("pm-ae6", by = "profile_type", partial=FALSE)) +} + +#' @rdname spq +#' @export + +spq_pm.ae8 <- function(){ + sp_profile(sp_profile_info("pm-ae8", by = "profile_type", partial=FALSE)) +} + +#' @rdname spq +#' @export + +spq_pm.cr1 <- function(){ + sp_profile(sp_profile_info("pm-cr1", by = "profile_type", partial=FALSE)) +} + +#' @rdname spq +#' @export + +spq_pm.simplified <- function(){ + sp_profile(sp_profile_info("pm-simplified", by = "profile_type", partial=FALSE)) +} + + + + diff --git a/R/spx.R b/R/spx.R new file mode 100644 index 0000000..7b9a76c --- /dev/null +++ b/R/spx.R @@ -0,0 +1,262 @@ +#' @name spx +#' @title spx_ functions for grouping and subsetting +#' @aliases spx_ spx_copy spx_n_alkane spx_btex + + +#' @description \code{spx_} functions generate a vector of assignment +#' terms and can be used to subset or condition a supplied re(SPECIATE) +#' \code{data.frame}. +#' +#' Most commonly, the \code{spx_} functions accept a single input, a +#' re(SPECIATE) \code{data.frame} and return a logical vector of +#' length \code{nrow(x)}, identifying species of interest as +#' \code{TRUE}. So, for example, they can be used when +#' \code{\link{subset}}ting in the form: +#' +#' \code{subset(x, spx_n_alkane(x))} +#' +#' ... to extract just n-alkane records from a \code{respeciate} object +#' \code{x}. +#' +#' However, some accept additional arguments. For example, \code{spx_copy} +#' also accepts a reference data set, \code{ref}, and a column identifier, +#' \code{by}, and tests \code{x$by \%in\% unique(ref$by)}. +#' +#' @param x a \code{respeciate} object, a \code{data.frame} of re(SPECIATE) +#' profiles. +#' @param ref (\code{spx_copy} only) a second \code{respeciate} object, to +#' be used as reference when testing \code{x}. +#' @param by (\code{spx_copy} only) character, the name of the column +#' in \code{ref} to copy when testing \code{x}. +#' @return \code{spx_copy} outputs can be modified but, by default, it +#' identifies all species in the supplied reference data set. +#' +#' \code{spx_n_alkane} identifies C1 to C40 n-alkanes. +#' +#' \code{spx_btex} identifies the BTEX group of aromatic hydrocarbons +#' (benzene, toluene, ethyl benzene, and M-, O- and P-xylene). + +############################# +#NOTE +############################ + +#still not sure this is best way of doing this +# but it did not seem to be slowing things down +# and other approaches seems likely to get messy +# really quick... + +# others to do???? + +# the BTEXs - doing/testing + +# others to consider??? + +# PAHs different groups +# VOCs HC network. +# elementals??? +# monitoring network relevant subsets of species + +# special cases??? + +# spx_ref(x, ref, by="") +# where x is respeciate object, ref is a reference +# by is column in ref; case is x$by %in% unique(ref$by) + +# could ref also be a vector of terms??? + + +#' @rdname spx +#' @export + +spx_copy <- function(x, ref = NULL, by="species_id"){ + + #maybe warn??? + if(is.null(ref)){ + ref <- x + } + names(x) <- tolower(names(x)) + names(ref) <- tolower(names(ref)) + .tmp <- unique(ref[, by]) + x[, by] %in% .tmp + +} + +##################### +#spx_n_alkanes +####################### + + +#source names +# from https://en.wikipedia.org/wiki/List_of_straight-chain_alkanes +# (might be duplicates) +# (some not using standard names) + +# could try smiles, molecular formula, cas numbers??? +# should be one entry/species if they are unique??? + +#test +## a <- sysdata$SPECIES_PROPERTIES +## b <- subset(a, spx_n_alkane(a)) +## b[order(b$SPEC_MW),] + +#' @rdname spx +#' @export + +spx_n_alkane <- function(x){ + #group x by is/isn't n-alkane + tolower(x$SPECIES_NAME) %in% c("methane", #C1 + "ethane", + "propane", + "n-butane", + "n-pentane", #C5 + "n-hexane", + "n-heptane", + "n-octane", + "n-nonane", + "n-decane", #C10 + "n-undecane", + "n-dodecane", + "n-tridecane", + "n-tetradecane", "tetradecane", + "n-pentadecane", "pentadecane", + "n-hexadecane", "hexadecane", + "n-heptadecane", "heptadecane", + "n-octadecane", "octadecane", + "n-nonadecane", "nonadecane", + "n-icosane", "eicosane", #C20 + "n-heneicosane", + "n-docosane", + "n-tricosane", + "n-tetracosane", + "n-pentacosane", #C25 + "n-hexacosane", + "n-heptacosane", + "n-octacosane", + "n-nonacosane", + "n-triacontane", #C30 + "n-hentriacontane", + "n-dotriacontane", "dotriacontane", + "n-tritriacontane", + "n-tetratriacontane", + "n-pentatriacontane", #C35 + "n-hexatriacontane", + "n-heptatriacontane", + "n-octatriacontane", + "n-nonatriacontane", + "n-tetracontane" #C40 + ) +} + + + + + +##################### +#spx_n_btex +####################### + +# Benzene, toluene, ethylbenzene, 3 xylenes isomers + +#ref +#https://www.eea.europa.eu/help/glossary/eper-chemicals-glossary/benzene-toluene-ethylbenzene-xylenes-as-btex +# like better... + +# also have btx group which is same minus ethylbenzene... + +#notes +##################### + +#might need more thinking about + +# toluene is sometimes reported as a mixture of toulene and another compound +# maybe not separated by GC??? + +# ethyl benzene and xylenes might have several names +# subset(a, a$Molecular.Formula=="C8H10") + +# are xylenes in as 1,2-, 1,3- and 1,4-dimethyl benzene??? +# are spaces an issue??? + +# if several names for btex, might need to think about how to +# sample (CAS? etc), merge and compare??? + + + +#tests +######################### +## a <- sysdata$SPECIES_PROPERTIES +## b <- subset(a, spx_btex(a)) +## b[order(b$SPEC_MW),] + +## to do + +#' @rdname spx +#' @export + +spx_btex <- function(x){ + #group x by is/isn't btex + tolower(x$SPECIES_NAME) %in% c( + #to test/check... + "benzene", + "toluene", + "ethylbenzene", + "m-xylene", + "o-xylene", + "p-xylene" + ) +} + + + + + + +############################## +#unexported, old, test notes +############################## + +# other options seemed a lot of work... + +#spx_n_alkane <- function(x){ +# x <- spg_n_alkane(x) +# x <- x[x$spg_n_alkane,] +# x[names(x) != "spg_n_alkane"] +#} + + +#subset. <- function(x, subset, select, ...){ +# mf <- match.call(expand.dots = FALSE) +# m <- match(c("x", "subset", "select"), names(mf), 0L) +# mf <- mf[c(1L, m)] +# mf$drop.unused.levels <- TRUE +# .x <- as.character(deparse(mf$x)) +# .subset <- as.character(deparse(mf$subset)) +# .select <- as.character(deparse(mf$select)) +# .subs <- strsplit(.subset, "&")[[1]] +# for(i in 1:length(.subs)){ +# .i <- gsub(" ", "", .subs[i]) +# if(is.function(try(get(.i), silent=TRUE))){ +# .subs[i] <- paste(.i, "(", .x, ")", sep="") +# } +# } +# .subs <- paste(.subs, collapse = " & ", sep="") +# +# +#print(.subs) +#print(.select) +# .eq <- paste("subset(as.data.frame(", .x, +# "), subset = ", .subs, ")", +# sep="") +# print(.eq) +# #mf <- eval(mf, parent.frame()) +# rsp_build_respeciate( +# eval(parse(text=.eq), +# parent.frame())) +# +## cl <- sys.call() +## f <- get(as.character(cl[[1]]), mode="function", sys.frame(-1)) +## cl <- match.call(definition=f, call=cl) +## x<- as.list(cl)[-1] +## deparse(x$subset) +# +#} diff --git a/R/sysdata.R b/R/sysdata.R index bd7979b..d95abd8 100644 --- a/R/sysdata.R +++ b/R/sysdata.R @@ -3,20 +3,37 @@ ############################################ #' #' @name sysdata -#' @description splat_packed data [to be described] +#' @description the re(SPECIATE) sysdata, a local versions of SPECIATE, +#' the EPA's repository of organic gas and particulate matter (PM) speciation +#' profiles of air pollution sources. #' -#' @format A ( 5 long) 'list' object +#' @format A (long) list, containing: #' \describe{ -#' \item{PROFILES}{[to document]} -#' \item{SPECIES}{[to document]} -#' \item{SPECIES_PROPERTIES}{[to document]} -#' \item{PROFILE_REFERENCE}{[to document]} -#' \item{REFERENCES}{[to document]} +#' \item{PROFILES}{The main \code{data.frame} of profile-specific meta-data, +#' with one row per profile, key term \code{PROFILE_CODE}.} +#' \item{SPECIES}{The main \code{data.frame} of individual record meta-data, +#' with one row per species in each profile, key terms \code{PROFILE_CODE} +#' and \code{SPECIES_ID} linking \code{PROFILES} and +#' \code{SPECIES_PROPERTIES}.} +#' \item{SPECIES_PROPERTIES}{The main \code{data.frame} of species-specific +#' meta-data, with one row per species, key term \code{SPECIES_ID}.} +#' \item{PROFILE_REFERENCE}{The \code{data.frame} linking profile and +#' reference meta-data, one row per references per profile, key terms +#' \code{PROFILE_CODE} and \code{REF_Code}.} +#' \item{REFERENCES}{The main \code{data.frame} of references for profile +#' source meta-data, one row per reference, key term \code{REF_Code}.} +#' \item{And others}{Currently not documented.} #' } -globalVariables(c("sysdata", "PROFILE_CODE")) #don't like this... +################################## +################################## +## IMPORTANT +################################## +################################## -#' @source [to doc] +# this documentation needs doing + +#' @source https://www.epa.gov/air-emissions-modeling/speciate #' @references #' Simon, H., Beck, L., Bhave, P.V., Divita, F., Hsu, Y., Luecken, D., #' Mobley, J.D., Pouliot, G.A., Reff, A., Sarwar, G. and Strum, M., 2010. diff --git a/R/xxx.R b/R/xxx.R new file mode 100644 index 0000000..0de879a --- /dev/null +++ b/R/xxx.R @@ -0,0 +1,629 @@ +############################## +#setup code, misc code, +#testing code, etc +############################## + +#currently no hooks, etc... + +##################### +#to check +##################### + +# I think all build/check issues associate with +# xxx_test and its depends... +# (not keeping unless we can get it to work better) + + +utils::globalVariables(c("sysdata", + "PROFILE_CODE", "PROFILE_NAME", "PROFILE_TYPE", + "SPECIES_ID", "SPECIES_NAME", + "SPEC_MW", "WEIGHT_PERCENT", ".", ".value")) + +######################## +#to think about... +####################### + +# all @import here +# in case we have to move to data.table::as.data.table, etc... + +#' @import data.table + +# data.table used by: +# rsp_test_profile, +# sp_dcast_profile, and those that use dcast? +# sp_species_cor +# sp_profile_distance +# and others??? +# need to identify them + +#' @importFrom stats sd cophenetic cor cutree dist hclust heatmap AIC +#' as.formula coefficients formula lm nls nls.control predict +#' @importFrom utils modifyList head +#' @importFrom graphics axis barplot par legend lines rect text abline +#' grid mtext plot.new plot.window points polygon title +#' @importFrom grDevices cm.colors colorRampPalette as.graphicsAnnot +#' dev.flush dev.hold heat.colors + + + +#might be able to drop legend? +# check plot.respeciate + + +############################## +#common unexported +############################## + + +############################ +#unexported function to +#test is x respeciate + +##aa <- sp_profile(sp_find_profile("ae8", by="profile_type")) +##bb <- sp_find_profile("ethanol") +##bb <- sp_find_species("butane") + +##rsp_test_respeciate(aa, level=1) +## level 1 TRUE for respeciate only (needs to look like respeciate) +## level likewise but TRUE for respeciate, respeciate.ref and respeciate.spcs + +##might need to rethink this if I want to use it as is.respeciate method +## is only allows 1 argument. + +## could also test for .value + +rsp_test_respeciate <- function(x, level = 1, + silent = FALSE){ + test <- class(x) + out <- "bad" + test <- test[1]=="respeciate" + #order below matters + #maybe tidy?? + if(all(c("SPECIES_NAME", "SPECIES_ID") %in% names(x))){ + out <- "respeciate.species.ref" + } + if(all(c("PROFILE_NAME", "PROFILE_CODE") %in% names(x))){ + out <- "respeciate.profile.ref" + } + if(all(c("SPECIES_NAME", "SPECIES_ID", "PROFILE_NAME", "PROFILE_CODE", + "WEIGHT_PERCENT") %in% names(x))){ + out <- "respeciate" + } + if(!silent){ + if(test && out=="bad"){ + warning("suspect respeciate object!") + } + if(nrow(x)==0){ + warning("empty respeciate object!") + } + } + if(level==1){ + return(test) + } + return(out) +} + +####################### +#tidy profile + +#I am thinking of using a .value column as my value column +#then WEIGHT_PERCENT remains as EPA made it even if we rescale... + +## testing this idea at the moment +## make .value using rsp_tidy_profile +## enabled in plot.respeciate, sp_profile_rescale, sp_profile_dcast +## rsp_test_profile + +rsp_tidy_profile <- function(x){ + #.value is local version of weight + if(!".value" %in% names(x)){ + x$.value <- x$WEIGHT_PERCENT + } + x +} + +########################### +#tidy names + +#currently not exported +#quick code to tidy species names + +#currently used in plot.respeciate + +#note: not fully tested + + +rsp_tidy_species_name <- function(x){ + + #attempts shorten names by remove other versions + #names seem to be in format a (or b || c) + #where (guessing) a is main name and + # b and c are alternatives. + + #not fully tested, + # might still be more cases this dies on + + #gsub("[(].*","", x) failed if name a includes brackets + #example:#"(2-methylpropyl)benzene (or isobutylbenzene)" + + #sub("[(][^(]or+$", "", x) failed if b or c includes brackets + #also left space at end so needed sub("^\\s+|\\s+$", "", x) + + #sometimes it is "( or " + x <- gsub(" [(] or ", " (or ", x) + #next removes from last "(or" onwards + x <- gsub("[(]or .*","", x) + sub("^\\s+|\\s+$", "", x) +} + + +################################ +#rsp_test_profile + +#file:///C:/Users/trakradmin/Downloads/datatable.pdf + + +##rsp_test_profile(aa) + +rsp_test_profile <- function(x){ + + #set up .value if not there + x <- rsp_tidy_profile(x) + + ####################################### + #track and return original class? + # testing + ####################################### + tmp <- class(x) + xx <- as.data.table(x) + out <- xx[, + .(PROFILE_NAME = PROFILE_NAME[1], + SPECIES_NAME = SPECIES_NAME[1], + SPEC_MW = SPEC_MW[1], + .total = sum(.value, na.rm = TRUE), + .value = mean(.value, na.rm = TRUE), + .n = length(.value[!is.na(.value)]), + .sd = sd(.value, na.rm = TRUE) + ), + by=.(PROFILE_CODE, SPECIES_ID)] + + #might regret .value + out$WEIGHT_PERCENT <- out$.value + #note: I *think* this is fine as long as x is never permanently replaced + # with the rsp_test_profile without a warning + # might be cases where we want to change WEIGHT_PERCENT + + # change case making an average profile + + #output + #might regret this... + # but would like to leave user free to use e.g. dplyr rather than + # data.table if that is their preference + out <- as.data.frame(out) + class(out) <- tmp + out +} + +# now replaced with rsp_test_profile + +#rsp_test (old version) + +#base case idiot test + +#rsp_test <- function(x){ +# .prf <- unique(x$PROFILE_CODE) +# ans <- lapply(.prf, function(y){ +# temp <- subset(x, PROFILE_CODE==y) +# .spc <- unique(temp$SPECIES_ID) +# ans <- lapply(.spc, function(z){ +# temp2 <- subset(temp, SPECIES_ID==z) +# data.frame(PROFILE_CODE = y, +# PROFILE_NAME = temp2$PROFILE_NAME[1], +# SPECIES_ID = z, +# SPECIES_NAME = temp2$SPECIES_NAME[1], +# COUNT = length(temp2$WEIGHT_PERCENT[!is.na(temp2$WEIGHT_PERCENT)]), +# TOTAL = sum(temp2$WEIGHT_PERCENT[!is.na(temp2$WEIGHT_PERCENT)]), +# SPEC_MW = temp2$SPEC_MW[1], +# WEIGHT_PERCENT=mean(temp2$WEIGHT_PERCENT, na.rm=TRUE)) +# }) +# do.call(rbind, ans) +# }) +# do.call(rbind, ans) +#} + +#require(dplyr) +#test1 <- function(x){ +# x %>% +# group_by(PROFILE_CODE, SPECIES_ID) %>% +# summarise(PROFILE_NAME = PROFILE_NAME[1], +# SPECIES_NAME = SPECIES_NAME[1], +# SPEC_MW = SPEC_MW[1], +# total = sum(WEIGHT_PERCENT, na.rm=T), +# mean = mean(WEIGHT_PERCENT, na.rm=T), +# sd = sd(WEIGHT_PERCENT, na.rm=T), +# n = length(WEIGHT_PERCENT[!is.na(WEIGHT_PERCENT)])) +#} +#aa <- sp_profile(sp_find_profile("ae6", by="profile_type")$PROFILE_CODE) +#require(data.table) +#test2 <- function(x){ +# ####################################### +# #could track and return original class? +# ####################################### +# xx <- as.data.table(x) +# out <- xx[, +# .(PROFILE_NAME = PROFILE_NAME[1], +# SPECIES_NAME = SPECIES_NAME[1], +# SPEC_MW = SPEC_MW[1], +# .total = sum(WEIGHT_PERCENT, na.rm = TRUE), +# WEIGHT_PERCENT = mean(WEIGHT_PERCENT, na.rm = TRUE), +# .n = length(WEIGHT_PERCENT[!is.na(WEIGHT_PERCENT)]), +# .sd = sd(WEIGHT_PERCENT, na.rm = TRUE) +# ), +# by=.(PROFILE_CODE, SPECIES_ID)] +# #output +# #currently data.frame +# #could be respeciate +# as.data.frame(out) +#} + + + + + + + + + +##################### +#testing +##################### + +#playing + +#function(x, subset,...){ +# ans <- match.call() +# ans <- as.character(ans) +# return(ans) +#} + +#ggplot example +#require(ggplot2) +#ggplot() + geom_col(aes(y=SPECIES_NAME, x=WEIGHT_PERCENT), data=aa) + facet_grid(.~PROFILE_NAME) + + +############################ +#color key +############################ + +######################## +#using this in: +######################## + +#sp_species_cor + + +#started with: +#https://stackoverflow.com/questions/9314658/colorbar-from-custom-colorramppalette + +#removed dev.new + +#see also regarding adding to existing plot +#https://www.statmethods.net/advgraphs/layout.html + +# One figure in row 1 and two figures in row 2 +# row 1 is 1/3 the height of row 2 +# column 2 is 1/4 the width of the column 1 +##attach(mtcars) +##layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE), +## widths=c(3,1), heights=c(1,2)) +##hist(wt) +##hist(mpg) +##hist(disp) + +# Add boxplots to a scatterplot +##par(fig=c(0,0.8,0,0.8), new=TRUE) +##plot(mtcars$wt, mtcars$mpg, xlab="Car Weight", +## ylab="Miles Per Gallon") +##par(fig=c(0,0.8,0.55,1), new=TRUE) +##boxplot(mtcars$wt, horizontal=TRUE, axes=FALSE) +##par(fig=c(0.65,1,0,0.8),new=TRUE) +##boxplot(mtcars$mpg, axes=FALSE) +##mtext("Enhanced Scatterplot", side=3, outer=TRUE, line=-3) + +# Function to plot color bar + +#currently not exporting + +#notes +#change min and max or .min and .max +#move title into plot? +#add key.style? +#add key.position? +#how to set plot range when na is included verus excluded +# needs to be a function of min/max range not 0.5, 1.5, etc +# maybe 0.25 ish??? maybe tick or half tick range???? + +#key is the data to be color-scaled or its range +# currently just used for range + +#cols is col.palette to apply to key range +# the col ramp is messy... seems to need border +# that could be issue if transparency using... + +#x, y are key (proportional, 0 to 1) positions + +#na.col color of na's +#na.cex scaling for na box width +# do we need any other NA controls?? +# e.g. na.border if needed to be different to main border??? + +#axes to do +# not using at moment +# but would separate axes and border colours +# not sure it is wanted/needed... + +#ticks, nticks axes tick positions and number of ticks, respectively +# think about these + +#title still work in progress +# not nice version on type=1 + + +#have type = 1/2 for horizontal/vertical +# think about types 3 and 4 +# horizontal with text at top +# vertical with text on other side + +#think about exporting this once sorted??? +# or passing to another package/better home +# (not sure how that fits with package remit) + + + +rsp_col_key <- function(key, cols, x, y = NULL, + ticks, nticks, + na.col = "grey", na.cex = 0.25, + title = "", axes, bg, border, + type = 2, + ...){ + + #setup + op <- par(no.readonly = TRUE) + + if(missing(x)){ + #currently just doing this option + #like key.pos "top-left", key.style = 1 (horizontal, annotation below) + x <- 0.1 + } + if(is.null(y)){ + y <- 0.9 + } + .min <- min(key, na.rm=TRUE) + .max <- max(key, na.rm=TRUE) + if(missing(ticks)){ + ticks <- pretty(c(.min, .max), 3) + } + if(missing(nticks)){ + nticks <- length(ticks) + } + .na.width <- na.cex * (.max-.min) + if(missing(bg)){ + bg <- "white" + } + if(missing(border)){ + border <- "black" + } + scale <- (length(cols)-1)/(.max-.min) + +#print(.max-.min) +#print(.na.width) + #print(.min) + #print(.max) + + #key.style 1 + if(type==1){ + #horizontal, header before, annotation after + #margins + .mai <- c(0.1,0.1,0.1,0.1) + if(title ==""){ + #no title + .fig <- c(x-0.1, x+0.1, y-0.04, y+0.1) + .wdt <- 12 + } else { + #title + .fig <- c(x-0.1, x+0.1, y-0.12, y+0.1) + .wdt <- 15 + } + + if(is.na(na.col)){ + .brd <- c(.na.width, .na.width) + } else { + .brd <- c(.na.width, .na.width*2) + } + + #position col key + par(fig = .fig, mai = .mai, new=TRUE) + + #plot col key + + #region + plot(c(.min -.brd[2], .max+(.brd[1]*0.5)), c(-1, .wdt), + type='n', bty='n', xaxt='n', xlab='', + yaxt='n', ylab='', main="", font.main = 1) + #bg + border + rect(.min -.brd[2], -1, .max+(.brd[1]*0.5), .wdt, + col=bg, border=border) + #title + if(title !=""){ + text(.min+((.max-.min)/2)+(.na.width*0.75), 13, labels=title, col="black", cex=0.75) + } + #col scale + for (i in 1:(length(cols)-0)) { + x <- (i-1)/scale + .min + rect(x,5,x+1/scale,10,col=cols[i], border=NA) + } + #axes + lines(c(.min, .max), c(5,5), col="black") + for (i in ticks) { + lines(c(i,i), c(5,4),col="black") + } + #axes annotation + text(ticks, rep(2, length(ticks)), labels=ticks, + cex=0.75, adj=0.5) + #na block + if(!is.na(na.col)){ + rect(.min-(.na.width*0.5), 5,.min-(.na.width*1.5), 10, col=na.col, border="black") + text(.min-.na.width, 2, labels="NA", col="black", cex=0.75) + } + + } + + if(type==2){ + #horizontal, header before, annotation after + #margins + .mai <- c(0.1,0.1,0.1,0.1) + if(title ==""){ + #no title + .fig <- c(x-0.05, x+0.05, y-0.2, y+0.1) + .wdt <- 12 + } else { + #title + .fig <- c(x-0.05, x+0.05, y-0.2, y+0.1) + .wdt <- 15 + } + + if(is.na(na.col)){ + .brd <- c(.na.width, .na.width) + } else { + .brd <- c(.na.width, .na.width*2) + } + + #position col key + par(fig = .fig, mai = .mai, new=TRUE) + + #plot col key + + #region + plot(c(-1, .wdt), c(.min-.brd[2], .max+(.brd[1]*0.5)), + type='n', bty='n', xaxt='n', xlab='', + yaxt='n', ylab='', main="", font.main = 1) + #bg + border + rect(-1, .min-.brd[2], .wdt, .max+(.brd[1]*0.5), + col=bg, border=border) + #title + if(title !=""){ + text(.min+((.max-.min)/2)+(.na.width*0.75), 13, labels=title, + col="black", cex=0.75) + } + + #for (i in 1:(length(lut)-1)) { + # y = (i-1)/scale + min + # rect(0,y,10,y+1/scale, col=lut[i], border=NA) + #} + + #col scale + #################### + #note + #################### + #this needs work because rect needs colored border + #which kills transparent ranges... + #does that matter + for (i in 1:(length(cols))) { + y <- (i-1)/scale + .min + rect(5,y-(1/scale),10,y,col=cols[i], border=cols[i]) + } + #axes + lines(c(5,5), c(.min, .max), col="black") + for (i in ticks) { + lines(c(5,4), c(i,i), col="black") + } + #axes annotation + text(rep(2, length(ticks)), ticks, labels=ticks, + cex=0.75, adj=0.5) + #na block + if(!is.na(na.col)){ + rect(5, .min-(.na.width*0.5), 10, .min-(.na.width*1.5), col=na.col, border="black") + text(2, .min-.na.width, labels="NA", col="black", cex=0.75) + } + + } + + par(op) +} + +#plot(iris$Sepal.Length, iris$Sepal.Width) +#rsp_col_key(c(1,-1), colorRampPalette(c("light green", "yellow", "orange", "red"))(100), title="testing") + + + + + + + + +########################### +#diagnostic +########################## + +####################### +##rethink this!!!! +####################### + +#parking for now + + +#https://www.rpubs.com/NYang/ANLY530-IRIS + +#devtools::unload(); devtools::load_all() +#a <- xxx_test(); subset(a, a$.ref)$PROFILE_NAME + +#xxx_test <- function(){ +# +# .tmp <- sysdata$PROFILES +# .out <-as.data.table(sp_dcast_profile(sp_profile(.tmp$PROFILE_CODE))) +# .tmp <- .tmp[c("PROFILE_CODE", "Keywords")] +# .tmp$.ref <- grepl("wildfire", tolower(.tmp$Keywords)) | +# grepl("burning", tolower(.tmp$Keywords)) +# .out <- merge(.out, .tmp) +# .out <- .out[-1599,] +# #glfit<-glm(as.numeric(.out$.ref)~.out$`Organic carbon` + .out$Nitrate + +# # .out$Sulfate, family = 'binomial') +# glfit<-glm(as.numeric(.out$.ref)~.out$Calcium + .out$Lead + +# .out$Zinc +.out$Manganese, family = 'binomial') +# print(summary(glfit)) +# .out$.pred <- NA +# .out$.pred[as.numeric(names(predict(glfit)))] <- predict(glfit, type="response") +# .out +# #glfit +#} + + + +###################################### +#matching +###################################### + +#Camp Fire + +#pb (lead) 0.04 - 0.13 ug/m3 +#zinc 0.15 - 0.45 +#manganese 0.008 - 0.012 +#calcium 0.1-0.2 +#organic carbon 20-100 + +# + + + +######################################## +#pca +######################################## + +#see notes at +#https://www.ime.usp.br/~pavan/pdf/PCA-R-2013 +#on r pca methods + +#https://stats.stackexchange.com/questions/59213/how-to-compute-varimax-rotated-principal-components-in-r +#https://stats.stackexchange.com/questions/56089/using-varimax-rotated-pca-components-as-predictors-in-linear-regression +#https://stackoverflow.com/questions/34944179/doing-pca-with-varimax-rotation-in-r +#on varimax rotation diff --git a/README.Rmd b/README.Rmd index 1ac5a4c..43aeb7a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -51,7 +51,7 @@ Plotting a profile plot(p) ``` -... or multiple profiles using barplot syntax +... using barplot syntax ```{r fig.width=12, fig.height=5, fig.align="center", dpi=400} #using base barplot diff --git a/README.md b/README.md index 749a4a0..974d2a3 100644 --- a/README.md +++ b/README.md @@ -22,14 +22,8 @@ library(respeciate) x <- sp_find_profile("Ethanol") x #> respeciate profile reference -#> 0291 1070 1071 1132 1149 1301 1302 1303 1304 1314 5473 5474 5475 5477 5478 5479 -#> 5481 5482 5483 5485 5486 5487 5489 5490 5491 5493 5494 5495 5496 5497 5498 5499 -#> 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 -#> 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 -#> 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 -#> 8200 8210 8215 8733 8736 8751 8751a 8752 8754 8755 8757 8758 8760 8761 8763 -#> 8764 8765 8766 8767 8768 ... -#> > 160 profiles [showing first 100] +#> 0291 1070 1071 1132 1149 1301 1302 1303 1304 1314 ... +#> > 160 profiles [showing first 10] ``` ## speciate @@ -49,7 +43,7 @@ plot(p) -… or multiple profiles using barplot syntax +… using barplot syntax ``` r #using base barplot diff --git a/docs/news/index.html b/docs/news/index.html index adccae4..c172a4d 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -17,7 +17,7 @@ respeciate - 0.2.1 + 0.2.2 @@ -47,6 +47,37 @@

Changelog

Source: NEWS.md +
+ +
  • [0.2.2] +
    • released 2023-06-28
    • +
    • now imports data.table; moving to data.table methods
    • +
    • changed news format because build_news missing previous…
    • +
  • +
  • [0.2.1] +
    • released 2023-06-21
    • +
    • updated sysdata (now using SPECIATE 5.2)
    • +
    • added sp_find_species
    • +
    • extended sp_find_profile; now searches by species_name
    • +
    • simplified plot.respeciate
    • +
  • +
  • [0.2.0] +
    • released 2021-05-26
    • +
    • Karl Ropkins joined
    • +
    • moved your sysdata.rda to the package data folder
    • +
    • reset lazy.data to TRUE
    • +
    • added an object class
    • +
    • added sp_find_profile (find_code but making object class)
    • +
    • added sp_profile (spec but making object class)
    • +
    • added crude print and plot methods for object classes
    • +
    • updated date and version
    • +
    • The new code is in R:speciate.0.2.r
    • +
  • +
  • [0.1.0] +
    • released 2020-12-20
    • +
    • Created respeciate
    • +
  • +