diff --git a/.Rbuildignore b/.Rbuildignore index 4d0c508..fc8bb42 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,6 @@ ^.*\.Rproj$ ^\.Rproj\.user$ assets +^data-raw$ +_pkgdown.yml +rsconnect \ No newline at end of file diff --git a/.gitignore b/.gitignore index 7b732e7..f77d4d0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ .RData .Ruserdata .DS_Store +inst/doc +rsconnect \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 8ddee0e..ef97986 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,4 +1,4 @@ -Package: calicomics +Package: claman Type: Package Title: Streamlined methods for analysis of metabolomics and lipidomics data at Calico Life Sciences LLC Version: 0.1.0 @@ -27,8 +27,9 @@ Imports: glue, purrr, readr, - rlang, reshape2, + rlang, + RSQLite, stringr, tibble, tidyr (>= 1.0.0) @@ -36,16 +37,14 @@ Suggests: authutils, dbplyr, fgsea, - gargle, - googledrive (>= 1.0.0), - googlesheets4, grDevices, + kableExtra, knitr, lubridate, plotly, qvalue, rmarkdown, - RMySQL, + romic, testthat Encoding: UTF-8 LazyData: true diff --git a/NAMESPACE b/NAMESPACE index 7896f66..620f136 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,30 +1,22 @@ # Generated by roxygen2: do not edit by hand export(collapse_injections) -export(configure_db_access) export(diffex_mzroll) export(filter_groupIds) -export(filter_peakgroups_quo) -export(filter_samples_quo) -export(find_tracking_sheet) +export(find_pathway_enrichments) export(floor_peaks) -export(hclust_order) -export(import_sample_sheet) export(lipid_components) -export(mzroll_table) +export(merge_compounds_tbl) +export(merge_samples_tbl) export(normalize_peaks) -export(pathway_enrichments) +export(nplug_mzroll) export(plot_barplot) -export(plot_heatmap) +export(plot_compare_injection) +export(plot_pvalues) +export(plot_volcano) export(process_mzroll) export(process_mzroll_multi) -export(pvalue_histograms) -export(query_systematic_compounds) export(remove_constant_name) -export(summarize_compound_pathways) -export(volcano_plot) -export(write_tidy_list_augmented) -export(write_tidy_list_triple) -export(write_tidy_list_wide) +export(util_pretty_khead) import(ggplot2) importFrom(dplyr,"%>%") diff --git a/R/claman.R b/R/claman.R new file mode 100644 index 0000000..2af041a --- /dev/null +++ b/R/claman.R @@ -0,0 +1,146 @@ +#' \code{claman} package +#' +#' Claman (Calico Lipidomics And Metabolomics ANalysis) can read .mzrollDB +#' files created using MAVEN or Quahog into an mzroll_list. These lists +#' can then be modified through normalization, signal flooring, and filtering. +#' Differential abundance analysis of metabolites can then be performed along +#' with a variety of visualizations. +#' +#' @docType package +#' @name calicomics +#' @importFrom dplyr %>% +#' @import ggplot2 +NULL + +## quiets concerns of R CMD check re: the .'s that appear in pipelines +utils::globalVariables(c( + ".", + ":=", + ".center", + ".group_center", + ".loess_fit", + ".loess_shift", + ".reference", + ".reference_diff", + "abund_mean", + "abund_se", + "abundance", + "anova_signif", + "centered_log2_abundance", + "char", + "compoundId", + "compoundName", + "databaseId", + "datetime", + "diff_to_median", + "displayName", + "double_1", + "double_2", + "double_3", + "double_4", + "enrichment_plot", + "ES", + "FA1", + "FA2", + "FA3", + "FA4", + "fdr_summary", + "field", + "filename", + "focus_pathway", + "fgsea_results", + "groupData", + "groupId", + "gsea_results", + "gsea_table_grob", + "id", + "ID string entry", + "lm_fits", + "is_discovery", + "is_focus_pathway", + "is_tracking_sheet", + "is_unknown", + "label", + "leadingEdge", + "lipidClass", + "log2_abundance", + "median", + "median_abund", + "method_tag", + "MS ID string", + "MS ID string alternative", + "mz_delta", + "mz_median", + "mzmax", + "mzmax_adj", + "mzmin", + "mzmin_adj", + "mzroll_db_path", + "n", + "n_entries", + "n_unique_records", + "name", + "name_tibble", + "NES", + "new_compoundName", + "newSampleId", + "nMoreExtreme", + "num_sn_chains", + "OH_1", + "OH_2", + "OH_3", + "OH_4", + "old_sampleId", + "ordered_groupId", + "ordered_sampleId", + "one_peak_data", + "p.value", + "p.value.trans", + "padj", + "pathway", + "pathway_entries", + "pathwayId", + "pathwayName", + "peak_label", + "peakAreaTop", + "peakMz", + "plasmalogen_type", + "position", + "processed_mzroll", + "pval", + "qvalue", + "r_position", + "rt", + "rt_delta", + "rt_median", + "rtmax", + "rtmax_adj", + "rtmin", + "rtmin_adj", + "sampleId", + "sample description", + "samples", + "scaling_factor", + "single_1", + "single_2", + "single_3", + "single_4", + "size", + "sn1", + "sn2", + "sn3", + "sn4", + "sn_chains", + "statistic", + "sumComposition", + "systematicCompoundId", + "term", + "term_data", + "total_double", + "total_OH", + "total_single", + "tube", + "tube label", + "variable", + "weights" +)) diff --git a/R/compounds_metadata.R b/R/compounds_metadata.R new file mode 100644 index 0000000..0d83858 --- /dev/null +++ b/R/compounds_metadata.R @@ -0,0 +1,63 @@ +#' Merge Compounds Table +#' +#' Merge a table of compound information with an existing mzroll_list +#' +#' @inheritParams test_mzroll_list +#' @param compounds_tbl Table of compound metadata +#' @param join_by variable shared by mzroll peakgroups and compounds_tbl +#' to merge results by. +#' +#' @examples +#' mzroll_list <- process_mzroll(nplug_mzroll()) +#' compounds_tbl <- nplug_compounds +#' merge_compounds_tbl(mzroll_list, compounds_tbl) +#' +#' @export +merge_compounds_tbl <- function( + mzroll_list, + compounds_tbl, + join_by = "compoundName" +) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + checkmate::assertDataFrame(compounds_tbl) + checkmate::assertString(join_by) + + # ensure that the join_by variable is unique in compounds_tbl + # (it doesn't have to be in the features table of the mzroll_list) + duplicated_compounds <- compounds_tbl %>% + dplyr::group_by(!!rlang::sym(join_by)) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::distinct(!!rlang::sym(join_by)) + + if (nrow(duplicated_compounds) > 0) { + stop(glue::glue( + "{nrow(duplicated_compounds)} {join_by} entries were not unique: + {paste(duplicated_compounds[[join_by]], collapse = ', ')}") + ) + } + + unmatched_features <- mzroll_list$features %>% + dplyr::anti_join(compounds_tbl, by = join_by) + + if (nrow(unmatched_features) == nrow(mzroll_list$features)) { + stop(glue::glue( + "No compounds could be matched to mzroll features using {join_by}" + )) + } + if (nrow(unmatched_features) > 0) { + warning(glue::glue( + "{nrow(unmatched_features)} features did not match entries + in the compound_tbl" + )) + } + + augmented_mzroll_list <- romic::update_tomic( + mzroll_list, + mzroll_list$features %>% + dplyr::left_join(compounds_tbl, by = join_by) + ) + + return(augmented_mzroll_list) +} \ No newline at end of file diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..10d433d --- /dev/null +++ b/R/data.R @@ -0,0 +1,93 @@ +#' Growth-Limiting Metabolites +#' +#' The NPLUG experiment has a full factorial design exploring relationship +#' between nutrient limitation and growth rate in yeast. +#' +#' @details +#' The experiment contains 25 primary experimental conditions: +#' \itemize{ +#' \item{5 limiting nutrients: nitrogen, phosphorous, leucine, uracil and +#' glucose (i.e., NPLUG!)} +#' \item{5 dilution rates (growth rate / ln(2)): from 0.05-0.3 1/hrs} +#' } +#' +#' Metabolomics of +#' limited yeast cultures growing at different rates. +#' +#' @return path to the NPLUG .mzrollDB dataset +#' +#' @family nplug +#' +#' @export +nplug_mzroll <- function() { + + path <- system.file("extdata", "nplug.mzrollDB", package = "claman") + if (path == "" || !file.exists(path)) { + stop("nplug_mzroll was not found") + } + + return(path) +} + + +#' NPLUG samples +#' +#' A table of meta-data for all NPLUG samples with variables: +#' \describe{ +#' \item{sample_name}{unique descriptor of each sample} +#' \item{month}{month of sample generation} +#' \item{replicate}{culture replicate} +#' \item{DR}{dilution rate of culture - this is proportional to cells' +#' growth rate} +#' \item{limitation}{nutrient limiting growth +#' \itemize{ +#' \item{GLU - carbon} +#' \item{LEU - leucine (in a Leu4 auxotroph)} +#' \item{NH4 - nitrogen} +#' \item{PO4 - phosphorous} +#' \item{URA - uracil (in a Ura2 auxotroph)} +#' } +#' } +#' \item{exp_ref}{experimental or reference condition} +#' \item{extraction}{filter- or pellet-based extraction} +#' \item{condition}{Integer value for each unique condition. Here, that is +#' a unique values of (month, limitation, DR, and extraction) for +#' experimental samples, and unique values of (month and extraction) for +#' reference samples.} +#' \item{reference}{Condition number that this sample should be compared to.} +#' } +#' +#' @family nplug +"nplug_samples" + +#' NPLUG samples +#' +#' A table of meta-data for all NPLUG features with variables: +#' \describe{ +#' \item{compoundName}{compound name corresponding to the compoundName +#' variable in the nplug_mzroll's features table.} +#' \item{pathway}{A (rough) categorization of compounds into metabolic +#' pathways.} +#' } +#' +#' @family nplug +"nplug_compounds" + +#' NPLUG MzRoll Augmented +#' +#' \link{nplug_mzroll} formatted with \link{process_mzroll} with sample +#' (\link{nplug_samples}) and compound (\link{nplug_samples}) merged. +#' +#' @family nplug +"nplug_mzroll_augmented" + +#' NPLUG MzRoll Normalized +#' +#' \link{nplug_mzroll_augmented} with injections collapsed using +#' \link{collapse_injections}, followed by reference-sample normalization +#' using \link{normalize_peaks}. Finally, reference samples and samples +#' extracting using the pellet method were removed and sample names +#' were cleaned up. These steps are described in the NPLUG vignette. +#' +#' @family nplug +"nplug_mzroll_normalized" \ No newline at end of file diff --git a/R/differential_expression.R b/R/differential_expression.R index 9e76681..8bec173 100644 --- a/R/differential_expression.R +++ b/R/differential_expression.R @@ -1,7 +1,7 @@ #' Differential Expression Analysis on Mzroll List #' #' @inheritParams test_mzroll_list -#' @inheritParams plot_heatmap +#' @param value_var measurement variable #' @param test_model a RHS formula for regression #' @param null_model if provided a null RHS formula to compare to #' \code{test_model} using the likelihood-ratio test. @@ -9,55 +9,69 @@ #' @return a tibble of significance tests for each feature #' #' @examples -#' library(dplyr) -#' -#' mzroll_list <- readRDS( -#' authutils::get_clamr_assets("X0083-mzroll-list.Rds") -#' ) %>% -#' floor_peaks(12) -#' -#' diffex_mzroll(mzroll_list, test_model = "mutation + sex") +#' diffex_mzroll( +#' nplug_mzroll_normalized, +#' "normalized_log2_abundance", +#' "limitation + limitation:DR + 0" +#' ) +#' +#' diffex_mzroll( +#' nplug_mzroll_normalized, +#' "normalized_log2_abundance", +#' "limitation + 0", +#' "0" +#' ) #' @export -diffex_mzroll <- function(mzroll_list, - value.var = "centered_log2_abundance", - test_model, - null_model = NULL) { +diffex_mzroll <- function( + mzroll_list, + value_var, + test_model, + null_model = NULL + ) { + test_mzroll_list(mzroll_list) - quant_vars <- colnames(mzroll_list$peaks)[ - purrr::map_chr(mzroll_list$peaks, class) == "numeric" + quant_vars <- colnames(mzroll_list$measurements)[ + purrr::map_chr(mzroll_list$measurements, class) == "numeric" ] - valid_quant_vars <- setdiff(quant_vars, c("peakId", "groupId")) - if (!(value.var %in% valid_quant_vars)) { + valid_quant_vars <- setdiff(quant_vars, c("groupId", "sampleId")) + if (!(value_var %in% valid_quant_vars)) { stop( - value.var, - " not present in \"mzroll_list$peaks\", valid numeric variables are:", + value_var, + " not present in \"mzroll_list$measurements\", valid numeric variables + are:", paste(valid_quant_vars, collapse = ", ") ) } - # setup data - - nested_peaks <- mzroll_list$peaks %>% - dplyr::left_join(mzroll_list$samples, "sampleId") %>% - tidyr::nest(one_peak_data = -groupId) - # setup and validate formulas - viable_sample_fields <- setdiff( - colnames(mzroll_list$samples), - c("sampleId", "tube label", "MS ID string", "MS ID string alternative") - ) + viable_sample_fields <- setdiff(colnames(mzroll_list$samples), "sampleId") test_model_formula <- validate_formulas( test_model, - value.var, + value_var, viable_sample_fields ) + # setup data + + nested_peaks <- mzroll_list$measurements %>% + dplyr::left_join(mzroll_list$samples, "sampleId") %>% + tidyr::nest(one_peak_data = -groupId) + + # determine how many distinct conditions there are from a feature + # with the most samples + n_conditions <- ncol(stats::model.matrix( + test_model_formula, + nested_peaks %>% + dplyr::mutate(n = purrr::map_int(one_peak_data, nrow)) %>% + dplyr::arrange(desc(n)) %>% + dplyr::slice(1) %>% + tidyr::unnest(one_peak_data) + )) + # flag peakgroups which cannot be fit due to too few samples - n_conditions <- length(unique(mzroll_list$samples$`condition #`)) - underspecified_groups <- nested_peaks %>% dplyr::filter(purrr::map_dbl(one_peak_data, nrow) < n_conditions + 1) @@ -85,7 +99,7 @@ diffex_mzroll <- function(mzroll_list, } else { null_model_formula <- validate_formulas( null_model, - value.var, + value_var, viable_sample_fields ) @@ -102,15 +116,27 @@ diffex_mzroll <- function(mzroll_list, # FDR control - peakgroup_signif %>% + output <- peakgroup_signif %>% tidyr::nest(term_data = -term) %>% dplyr::mutate(fdr_summary = purrr::map(term_data, diffex_fdr)) %>% dplyr::select(-term_data) %>% - tidyr::unnest(fdr_summary) + tidyr::unnest(fdr_summary) %>% + dplyr::mutate( + signif_qual = dplyr::case_when( + qvalue < 0.001 ~ " ***", + qvalue < 0.01 ~ " **", + qvalue < 0.1 ~ " *", + TRUE ~ "" + ), + diffex_label = glue::glue("{round(statistic, 3)}{signif_qual}") + ) %>% + dplyr::select(-signif_qual) + + return(output) } -validate_formulas <- function(model, value.var, viable_sample_fields) { +validate_formulas <- function(model, value_var, viable_sample_fields) { if (stringr::str_detect(model, "~")) { stop("your regression model: ", model, " should not include an \"~\"") } @@ -129,7 +155,7 @@ validate_formulas <- function(model, value.var, viable_sample_fields) { ) } - full_formula <- stats::as.formula(paste(value.var, "~", model)) + full_formula <- stats::as.formula(paste(value_var, "~", model)) return(full_formula) } @@ -166,8 +192,8 @@ diffex_fdr <- function(term_data) { q_values <- q_values[-length(q_values)] warning( - "q-value calculation initially failed but calicomics - was able to recover results" + "q-value calculation initially failed due to too many small p-values + but claman was able to recover results" ) } @@ -177,14 +203,28 @@ diffex_fdr <- function(term_data) { #' Volcano plot #' -#' @param regression_significance output of diffex_mzroll +#' @param regression_significance returned by \code{\link{diffex_mzroll}}; +#' a tibble of tests performed. #' @param max_p_trans maximum value of -log10 pvalues to plot #' @param FDR_cutoff FDR cutoff to label for significance #' +#' @returns a grob +#' +#' @examples +#' regression_significance <- diffex_mzroll( +#' nplug_mzroll_normalized, +#' "normalized_log2_abundance", +#' "limitation + limitation:DR + 0" +#' ) +#' +#' plot_volcano(regression_significance, 10, 0.1) +#' #' @export -volcano_plot <- function(regression_significance, - max_p_trans = 10, - FDR_cutoff = 0.1) { +plot_volcano <- function( + regression_significance, + max_p_trans = 10, + FDR_cutoff = 0.1 + ) { checkmate::assertDataFrame(regression_significance) stopifnot("term" %in% colnames(regression_significance)) @@ -219,10 +259,19 @@ trans_pvalues <- function(p, max_p_trans = 10) { #' P-value Histogram #' -#' @inheritParams volcano_plot +#' @inheritParams plot_volcano +#' +#' @examples +#' regression_significance <- diffex_mzroll( +#' nplug_mzroll_normalized, +#' "normalized_log2_abundance", +#' "limitation + limitation:DR + 0" +#' ) +#' +#' plot_pvalues(regression_significance) #' #' @export -pvalue_histograms <- function(regression_significance) { +plot_pvalues <- function(regression_significance) { stopifnot("data.frame" %in% class(regression_significance)) stopifnot("term" %in% colnames(regression_significance)) diff --git a/R/export.R b/R/export.R deleted file mode 100644 index 1d5111e..0000000 --- a/R/export.R +++ /dev/null @@ -1,202 +0,0 @@ - -#' Write flat tidy list -#' -#' @inheritParams test_mzroll_list -#' @param dir_path path to save outputs -#' @param name_preamble start of output file name -#' -#' @return -#' Export three tables: -#' \itemize{ -#' \item{features: one row per features measured (i.e., a metabolite)} -#' \item{sample: one row per sample} -#' \item{measurements: one row per measurement (i.e., one metabolite in -#' one sample)} -#' } -#' -#' @export -write_tidy_list_triple <- function(mzroll_list, dir_path, name_preamble) { - test_mzroll_list(mzroll_list) - - stopifnot(class(dir_path) == "character", length(dir_path) == 1) - if (!file.exists(dir_path)) { - stop("output directory does not exist: ", dir_path) - } - - stopifnot(class(name_preamble) == "character", length(name_preamble) == 1) - - for (k in names(mzroll_list)) { - readr::write_tsv( - mzroll_list[[k]], - path = file.path(dir_path, paste0(name_preamble, "_", k, ".tsv")) - ) - } - - invisible(0) -} - -#' Write flat tidy list -#' -#' @inheritParams write_tidy_list_triple -#' -#' @return -#' Export one table which is one row per peak, which includes all feature and -#' sample attributes. -#' -#' @export -write_tidy_list_augmented <- function(mzroll_list, dir_path, name_preamble) { - test_mzroll_list(mzroll_list) - - stopifnot(class(dir_path) == "character", length(dir_path) == 1) - if (!file.exists(dir_path)) { - stop("output directory does not exist: ", dir_path) - } - - stopifnot(class(name_preamble) == "character", length(name_preamble) == 1) - - mega_table <- mzroll_list$peaks %>% - dplyr::left_join(mzroll_list$peakgroups, by = "groupId") %>% - dplyr::left_join(mzroll_list$samples %>% - dplyr::select(!!!rlang::syms(setdiff(colnames(.), "method_tag"))), - by = "sampleId" - ) - - readr::write_tsv( - mega_table, - path = file.path(dir_path, paste0(name_preamble, "_augmented_table.tsv")) - ) - - invisible(0) -} - -#' Write wide output -#' -#' abundances form a matrix with metabolites as rows and samples as columns. -#' Use transpose to treat samples as rows -#' -#' @inheritParams write_tidy_list_triple -#' @inheritParams plot_heatmap -#' @param transpose if TRUE then samples will be stored as rows -#' -#' @return -#' Export one table which contains metabolites as rows and samples as columns. -#' -#' @export -write_tidy_list_wide <- function(mzroll_list, - dir_path, - name_preamble, - value.var = "log2_abundance", - transpose = FALSE) { - test_mzroll_list(mzroll_list) - - stopifnot(class(dir_path) == "character", length(dir_path) == 1) - if (!file.exists(dir_path)) { - stop("output directory does not exist: ", dir_path) - } - - stopifnot(class(name_preamble) == "character", length(name_preamble) == 1) - stopifnot(class(transpose) == "logical", length(transpose) == 1) - - # structure abundances - if (transpose) { - cast_formula <- stats::as.formula("sampleId ~ groupId") - } else { - cast_formula <- stats::as.formula("groupId ~ sampleId") - } - - abundance_matrix <- mzroll_list$peaks %>% - reshape2::acast(formula = cast_formula, value.var = value.var) - - if (transpose) { - feature_labels <- rownames(abundance_matrix) - sample_labels <- colnames(abundance_matrix) - } else { - feature_labels <- colnames(abundance_matrix) - sample_labels <- rownames(abundance_matrix) - } - - # create a top-left-null section - n_peakgroup_attr <- ncol(mzroll_list$peakgroups) - n_sample_attr <- ncol(mzroll_list$samples) - - top_left_void <- matrix(nrow = n_sample_attr, ncol = n_peakgroup_attr) - if (transpose) { - top_left_void <- t(top_left_void) - } - # remove one row to leave space for row attribute labels - top_left_void <- top_left_void[-1, , drop = FALSE] - - # setup sample and peakgroup attributes - - ordered_samples <- mzroll_list$samples %>% - dplyr::mutate(sampleId = factor(sampleId, levels = feature_labels)) %>% - dplyr::arrange(sampleId) %>% - dplyr::mutate_all(as.character) - - ordered_groups <- mzroll_list$peakgroups %>% - dplyr::mutate(groupId = factor(groupId, levels = sample_labels)) %>% - dplyr::arrange(groupId) %>% - dplyr::mutate_if(is.numeric, round, 3) %>% - dplyr::mutate_all(as.character) - - if (transpose) { - left_matrix <- rbind( - top_left_void, - matrix(colnames(ordered_samples), nrow = 1), - unname(as.matrix(ordered_samples)) - ) - - - top_right_matrix <- rbind( - matrix(colnames(ordered_groups), nrow = 1), - unname(as.matrix(ordered_groups)) - ) %>% - t() - - bottom_right_matrix <- cbind( - matrix(rownames(abundance_matrix), ncol = 1), - abundance_matrix - ) - - right_matrix <- rbind( - top_right_matrix, - bottom_right_matrix - ) - - output <- cbind(left_matrix, right_matrix) - } else { - left_matrix <- rbind( - top_left_void, - matrix(colnames(ordered_groups), nrow = 1), - unname(as.matrix(ordered_groups)) - ) - - - top_right_matrix <- rbind( - matrix(colnames(ordered_samples), nrow = 1), - unname(as.matrix(ordered_samples)) - ) %>% - t() - - bottom_right_matrix <- cbind( - matrix(rownames(abundance_matrix), ncol = 1), - abundance_matrix - ) - - right_matrix <- rbind( - top_right_matrix, - bottom_right_matrix - ) - - output <- cbind(left_matrix, right_matrix) - } - - output %>% - as.data.frame() %>% - readr::write_tsv( - path = file.path(dir_path, paste0(name_preamble, "_", "wide.tsv")), - col_names = FALSE - ) - - invisible(0) -} diff --git a/R/filter_mzroll_list.R b/R/filter_mzroll_list.R index 284e67d..b80c6a0 100644 --- a/R/filter_mzroll_list.R +++ b/R/filter_mzroll_list.R @@ -7,90 +7,26 @@ #' @param invert if TRUE then remove provided groupIds; if FALSE then retain #' provided groupIds #' +#' @examples +#' filter_groupIds(nplug_mzroll_normalized, 1:10) +#' #' @export filter_groupIds <- function(mzroll_list, groupIds, invert = FALSE) { checkmate::assertNumeric(groupIds) checkmate::assertLogical(invert, len = 1) if (invert) { - filter_groupIds <- setdiff(mzroll_list$peakgroups$groupId, groupIds) + filter_groupIds <- setdiff(mzroll_list$measurements$groupId, groupIds) } else { filter_groupIds <- groupIds } - mzroll_list <- filter_peakgroups_quo( + mzroll_list <- romic::filter_tomic( mzroll_list, - rlang::quo(groupId %in% filter_groupIds) + filter_type = "quo", + filter_table = "features", + filter_value = rlang::quo(groupId %in% filter_groupIds) ) return(mzroll_list) } - -#' Filter Calicomics Samples -#' -#' Remove samples based on sample table variables -#' -#' @inheritParams test_mzroll_list -#' @param filter_quosure quosure to use to filter samples -#' -#' @return an \code{mzroll_list} with samples (and corresponding peaks) removed -#' -#' @examples -#' mzroll_list <- readRDS(authutils::get_clamr_assets("X0083-mzroll-list.Rds")) -#' filter_quosure <- rlang::quo(sex == "Male") -#' filter_samples_quo(mzroll_list, filter_quosure) -#' @export -filter_samples_quo <- function(mzroll_list, filter_quosure) { - test_mzroll_list(mzroll_list) - stopifnot("formula" %in% class(filter_quosure)) - - mzroll_list$samples <- mzroll_list$samples %>% - dplyr::filter(!!filter_quosure) - - reconciled_list <- reconcile_mzroll_list(mzroll_list) - - reconciled_list -} - -#' Filter Calicomics Peakgroups -#' -#' Remove peakgroups based on peakgroups table variables -#' -#' @inheritParams test_mzroll_list -#' @param filter_quosure quosure to use to filter peakgroups -#' -#' @return an \code{mzroll_list} with peakgroups (and corresponding peaks) -#' removed -#' -#' @examples -#' mzroll_list <- readRDS(authutils::get_clamr_assets("X0083-mzroll-list.Rds")) -#' filter_quosure <- rlang::quo(focus_pathway %in% "Pyrimidine metabolism") -#' filter_peakgroups_quo(mzroll_list, filter_quosure) -#' @export -filter_peakgroups_quo <- function(mzroll_list, filter_quosure) { - test_mzroll_list(mzroll_list) - stopifnot("formula" %in% class(filter_quosure)) - - mzroll_list$peakgroups <- mzroll_list$peakgroups %>% - dplyr::filter(!!filter_quosure) - - reconciled_list <- reconcile_mzroll_list(mzroll_list) - - reconciled_list -} -#' Reconcile Mzroll List -#' -#' If some samples, peakgroups or peaks have been dropped; peaks and peakgroups -#' which may now be irrelevant -#' -#' @inheritParams test_mzroll_list -reconcile_mzroll_list <- function(mzroll_list) { - mzroll_list$peaks <- mzroll_list$peaks %>% - dplyr::semi_join(mzroll_list$samples, by = "sampleId") %>% - dplyr::semi_join(mzroll_list$peakgroups, by = "groupId") - - mzroll_list$peakgroups <- mzroll_list$peakgroups %>% - dplyr::semi_join(mzroll_list$peaks, by = "groupId") - - mzroll_list -} diff --git a/R/import_mzroll.R b/R/import_mzroll.R index ff9bd40..0b359f7 100644 --- a/R/import_mzroll.R +++ b/R/import_mzroll.R @@ -1,198 +1,48 @@ #' Process mzRoll #' #' @param mzroll_db_path path to mzroll DB file -#' @param standard_databases connections to Calico standards and systematic -#' compound databases - generated with \link{configure_db_access}. -#' @param sample_sheet_list list of googlesheets information describing -#' experimental design - generated with \link{import_sample_sheet}. -#' @param method_tag what method was run (for the purpose of matching -#' meta-data and aggregating). #' @param only_identified TRUE/FALSE, filter to only features which were #' identified. #' @param validate TRUE/FALSE, use meta-data to only name the subset of #' features which were manually validated. +#' @param method_tag what method was run (for the purpose of matching +#' meta-data and aggregating). #' -#' @return an mzroll_list containing three tibbles: +#' @return a **triple_omic** from **romic** containing three tibbles: #' \itemize{ -#' \item{peakgroups: one row per unique analyte (defined by a unique +#' \item{features: one row per unique analyte (defined by a unique #' groupId)}, #' \item{samples: one row per unique sample (defined by a unique sampleId)}, -#' \item{peaks: one row per peak (samples x peakgroups)} +#' \item{measurements: one row per peak (samples x peakgroups)} #' } +#' And, a design list which tracks the variables in each table. #' #' @export #' #' @examples -#' library(dplyr) -#' -#' authutils::get_clamr_assets("X0083-M001A.mzrollDB") %>% -#' process_mzroll( -#' standard_databases = NULL, -#' method_tag = "M001A", -#' only_identified = TRUE, -#' validate = FALSE -#' ) -process_mzroll <- function(mzroll_db_path, standard_databases = NULL, - sample_sheet_list = NULL, method_tag = "", - only_identified = TRUE, validate = FALSE) { - stopifnot(any(class(sample_sheet_list) %in% c("NULL", "list"))) - if ("list" %in% class(sample_sheet_list)) { - stopifnot(all( - names(sample_sheet_list) == c("tracking_sheet_id", "sample_sheet") - )) - } - checkmate::assertString(method_tag) +#' process_mzroll(nplug_mzroll()) +process_mzroll <- function( + mzroll_db_path, + only_identified = TRUE, + validate = FALSE, + method_tag = "" + ) { + + checkmate::assertFileExists(mzroll_db_path) checkmate::assertLogical(only_identified, len = 1) checkmate::assertLogical(validate, len = 1) - - mzroll_db_con <- authutils::create_sqlite_con(mzroll_db_path) - - # to do check validity of all sql connections - - peakgroup_positions <- dplyr::tbl(mzroll_db_con, "peaks") %>% - dplyr::collect() %>% - dplyr::group_by(groupId) %>% - dplyr::summarize( - mz = sum(peakMz * peakAreaTop) / sum(peakAreaTop), - rt = sum(rt * peakAreaTop) / sum(peakAreaTop) - ) - - peakgroups <- dplyr::tbl(mzroll_db_con, "peakgroups") %>% - dplyr::collect() %>% - dplyr::select(-compoundId) %>% - tidyr::separate(compoundName, - into = c("compoundName", "smiles"), - sep = ": ", extra = "drop", fill = "right" - ) %>% - dplyr::left_join(peakgroup_positions, by = "groupId") - - # Issue 300: case where displayName is not NA and compoundName is NA will - # currently be treated as NA. Unlikely to occur, to avoid set - # validate = FALSE - if (validate) { - peakgroups <- peakgroups %>% - dplyr::mutate( - compoundName = dplyr::case_when( - is.na(compoundName) ~ NA_character_, - searchTableName == "Bookmarks" & - !stringr::str_detect(string = label, pattern = "b") ~ compoundName, - # backwards compatibility - searchTableName == "rumsDB" & - stringr::str_detect(string = label, pattern = "g") ~ compoundName, - searchTableName == "clamDB" & - stringr::str_detect(string = label, pattern = "g") ~ compoundName, - # directly added EICs - searchTableName == "EICs" ~ compoundName, - TRUE ~ NA_character_ - ), - is_unknown = ifelse(is.na(compoundName), TRUE, FALSE) - ) - } - - if (only_identified) { - - # Issue 300: case of displayName != NA and compoundName == NA filtered - # out here. Only a problem for annotations added to peakdetector IDs - # instead of clamdb IDs (or clamDB IDs analyzed in MAVEN not using - # correct msp library) - # - # TODO: revisit if this comes up - - peakgroups <- peakgroups %>% - dplyr::filter(!is.na(compoundName)) - } else { - unknown_groups <- peakgroups %>% - dplyr::filter(is.na(compoundName)) %>% - { - .$groupId - } - - # label unknowns based on m/z and rt - - unknown_names <- peakgroups %>% - dplyr::filter(groupId %in% unknown_groups) %>% - dplyr::mutate(new_compoundName = as.character(glue::glue( - "unk {round(mz, 3)} @ {round(rt,1)}" - ))) %>% - dplyr::select(groupId, new_compoundName) - - peakgroups <- peakgroups %>% - dplyr::left_join(unknown_names, by = "groupId") %>% - dplyr::mutate( - is_unknown = ifelse(is.na(compoundName), TRUE, FALSE), - compoundName = dplyr::case_when( - !is.na(new_compoundName) ~ new_compoundName, - TRUE ~ compoundName - ) - ) %>% - dplyr::select(-new_compoundName) - } - - # add standard and systematic standard data if available - - if (class(standard_databases) != "NULL") { - - # match compounds to standards - compounds <- dplyr::tbl( - standard_databases$mass_spec_standards_con, - "compounds" - ) %>% - dplyr::collect() - - peakgroup_compounds <- peakgroups %>% - dplyr::left_join(compounds %>% - dplyr::select(compoundName, compoundId, systematicCompoundId), - by = "compoundName" - ) - - # add pathway of each compound - peakgroup_compound_pathways <- peakgroup_compounds %>% - dplyr::left_join( - query_systematic_compounds( - unique(peakgroup_compounds$systematicCompoundId) %>% - .[!is.na(.)], - standard_databases$systematic_compounds_con - ) %>% - summarize_compound_pathways( - min_pw_size = 5L, - focus_pathways = c( - "Glycolysis / Gluconeogenesis", - "Citrate cycle (TCA cycle)", - "Pentose phosphate pathway", - "Biosynthesis of amino acids", - "Purine metabolism", - "Pyrimidine metabolism" - ) - ), - by = "systematicCompoundId" - ) %>% - dplyr::mutate( - pathway = ifelse(is.na(pathway), "Other", pathway), - focus_pathway = ifelse(is.na(focus_pathway), "Other", focus_pathway) - ) - - if (!only_identified) { - peakgroup_compound_pathways <- peakgroup_compound_pathways %>% - dplyr::mutate( - pathway = ifelse(is_unknown, "Unidentified", pathway), - focus_pathway = ifelse(is_unknown, "Unidentified", focus_pathway) - ) - } - - annotated_peakgroups <- peakgroup_compound_pathways - } else { - annotated_peakgroups <- peakgroups - } - - # Issue 300: prefer manually overridden compound name - if ("displayName" %in% colnames(annotated_peakgroups)) { - annotated_peakgroups <- annotated_peakgroups %>% - dplyr::mutate(compoundName = ifelse( - is.na(displayName), - compoundName, - displayName - )) - } + checkmate::assertString(method_tag) + + # connect to SQLite .mzrollDB database + mzroll_db_con <- create_sqlite_con(mzroll_db_path) + + # add peakgroup m/z and rt + peakgroups <- process_mzroll_load_peakgroups(mzroll_db_con) + # if validate is true, use peakgroup labels to rename unknowns + peakgroups <- process_mzroll_validate_peakgroups(peakgroups, validate) + # if only_identified is true, only named compounds are retained, if false + # unknowns are labeled by m/z and rt + peakgroups <- process_mzroll_identify_peakgroups(peakgroups, only_identified) # reduce to smaller number of peakgroups features @@ -206,8 +56,6 @@ process_mzroll <- function(mzroll_db_path, standard_databases = NULL, "mz", "rt", "systematicCompoundId", - "pathway", - "focus_pathway", "compoundDB", "searchTableName", "label" @@ -215,11 +63,11 @@ process_mzroll <- function(mzroll_db_path, standard_databases = NULL, detected_peakgroup_reduced_vars <- intersect( possible_peakgroup_reduced_vars, - colnames(annotated_peakgroups) + colnames(peakgroups) ) - if (nrow(annotated_peakgroups) == 0) { - reduced_peakgroups <- annotated_peakgroups %>% + if (nrow(peakgroups) == 0) { + reduced_peakgroups <- peakgroups %>% dplyr::select(!!!rlang::syms(detected_peakgroup_reduced_vars)) %>% dplyr::mutate( peak_label = "", @@ -228,7 +76,7 @@ process_mzroll <- function(mzroll_db_path, standard_databases = NULL, warning("No named compounds were found; an empty dataset will be returned") } else { - reduced_peakgroups <- annotated_peakgroups %>% + reduced_peakgroups <- peakgroups %>% dplyr::select(!!!rlang::syms(detected_peakgroup_reduced_vars)) %>% dplyr::group_by(compoundName) %>% dplyr::mutate(peak_label = dplyr::case_when( @@ -236,7 +84,9 @@ process_mzroll <- function(mzroll_db_path, standard_databases = NULL, TRUE ~ paste0(compoundName, " (", 1:dplyr::n(), ")") )) %>% dplyr::ungroup() %>% - dplyr::mutate(method_tag = method_tag) + dplyr::mutate(method_tag = method_tag) %>% + dplyr::arrange(groupId) %>% + dplyr::mutate(groupId = factor(groupId, levels = groupId)) } debugr::dwatch( @@ -245,189 +95,229 @@ process_mzroll <- function(mzroll_db_path, standard_databases = NULL, # summarize distinct peaks and samples - peaks <- dplyr::tbl(mzroll_db_con, dbplyr::sql("SELECT * FROM peaks")) %>% + samples <- dplyr::tbl( + mzroll_db_con, + dbplyr::sql("SELECT sampleId, name, filename FROM samples") + ) %>% dplyr::collect() %>% - dplyr::semi_join(annotated_peakgroups, by = "groupId") %>% + dplyr::arrange(sampleId) %>% + dplyr::mutate( + sampleId = factor(sampleId, levels = sampleId) + ) + + debugr::dwatch( + msg = "Summarized samples. [calicomics::process_mzroll]\n" + ) + + peaks <- dplyr::tbl( + mzroll_db_con, + dbplyr::sql("SELECT groupId, sampleId, peakAreaTop FROM peaks") + ) %>% + dplyr::collect() %>% + dplyr::semi_join(peakgroups, by = "groupId") %>% dplyr::group_by(groupId) %>% dplyr::mutate( log2_abundance = log2(peakAreaTop), centered_log2_abundance = log2_abundance - mean(log2_abundance) ) %>% dplyr::select( - peakId, groupId, sampleId, log2_abundance, centered_log2_abundance ) %>% dplyr::ungroup() %>% - dplyr::mutate(sampleId = as.character(sampleId)) - - debugr::dwatch( - msg = "Summarized peaks. [calicomics::process_mzroll]\n" - ) - - samples <- dplyr::tbl( - mzroll_db_con, - dbplyr::sql("SELECT * FROM samples") - ) %>% - dplyr::collect() %>% - dplyr::select(sampleId, name, filename) %>% dplyr::mutate( - sampleId = as.character(sampleId), - method_tag = method_tag - ) + groupId = factor( + groupId, + levels = levels(reduced_peakgroups$groupId) + ), + sampleId = factor( + sampleId, + levels = levels(samples$sampleId) + )) debugr::dwatch( - msg = "Summarized samples. [calicomics::process_mzroll]\n" + msg = "Summarized peaks. [calicomics::process_mzroll]\n" ) # disconnect for sql-lite DBI::dbDisconnect(mzroll_db_con) - # add sample meta-data - - if (class(sample_sheet_list) == "list") { - samples <- augment_samples_with_samplesheet(samples, sample_sheet_list) - - # drop data from unmatched samples - peaks <- peaks %>% - dplyr::semi_join(samples, by = "sampleId") - reduced_peakgroups <- reduced_peakgroups %>% - dplyr::semi_join(peaks, by = "groupId") - } - debugr::dwatch( msg = "About to create list... [calicomics::process_mzroll]\n" ) - mzroll_list <- list( - peakgroups = reduced_peakgroups, - samples = samples, - peaks = peaks + mzroll_list <- romic::create_triple_omic( + measurement_df = peaks, + feature_df = reduced_peakgroups, + sample_df = samples, + feature_pk = "groupId", + sample_pk = "sampleId", + omic_type_tag = "mzroll" ) - + debugr::dwatch( msg = "Created list. [calicomics::process_mzroll]\n" ) - debugr::dwatch(msg = paste( - "number of samples in mzroll_list$samples:", - base::toString(nrow(mzroll_list$samples)) - )) + debugr::dwatch(msg = glue::glue( + "number of samples in mzroll_list$samples: {nrow(mzroll_list$samples)}" + )) + test_mzroll_list(mzroll_list) + return(mzroll_list) } -#' Augment Samples with Samplesheet -#' -#' @param samples samples table generated within \link{process_mzroll}. -#' @inheritParams process_mzroll -#' -#' @return samples with added metadata from sample_sheet_list -augment_samples_with_samplesheet <- function(samples, sample_sheet_list) { - ms_id_strings <- sample_sheet_list$sample_sheet %>% - dplyr::select(`tube label`, `MS ID string`, `MS ID string alternative`) %>% - dplyr::mutate( - `MS ID string` = as.character(`MS ID string`), - `MS ID string alternative` = as.character(`MS ID string alternative`) - ) %>% - tidyr::gather("ID string column", "ID string entry", -`tube label`) %>% - dplyr::filter(!is.na(`ID string entry`)) - - # check for sample duplication - duplicated_samples <- samples %>% - dplyr::count(name) %>% - dplyr::filter(n > 1) - - if (nrow(duplicated_samples) != 0) { - stop(nrow(duplicated_samples), " samples were duplicated likely due to - https://github.com/calico/mass_spec/issues/348, please patch your dataset - with calicomics::issue_348_patch(mzroll_db_path)") +#' Create SQLite Connection +#' +#' Open a connection to an SQLite database +#' +#' @param sqlite_path path to sqlite file +#' +#' @return sqlite connection +create_sqlite_con <- function(sqlite_path) { + + checkmate::assertFileExists(sqlite_path) + + if ("authutils" %in% utils::installed.packages()) { + sqlite_con <- authutils::create_sqlite_con(sqlite_path) + } else { + sqlit_con <- DBI::dbConnect( + RSQLite::SQLite(), + sqlite_path, + synchronous = NULL + ) } - - #' Issue 297: This block was adjusted, and ultimately reverted - #' Make sure there are only 1-1 substring matches between samples im - #' dataset and sample sheet - #' - #' e. g. if samples are named "C_200_B.mzML" and "C_2.mzML", "C_2" - #' as an MS ID string will match to both samples. - #' To avoid this problem in this case, the MS ID string could be - #' "C_2." or "C_2.mzML" instead of "C_2". - #' - sample_ms_id_string_matches <- samples %>% - tidyr::crossing(ms_id_strings) %>% - dplyr::filter(stringr::str_detect(name, `ID string entry`)) - - sample_multimatch <- sample_ms_id_string_matches %>% - dplyr::count(name) %>% - dplyr::filter(n > 1) - - if (nrow(sample_multimatch) != 0) { - multimatch_details <- sample_ms_id_string_matches %>% - dplyr::semi_join(sample_multimatch, by = "name") %>% - dplyr::arrange(name) %>% - dplyr::mutate(out_string = glue::glue( - "name: {name}, match_tube: {`tube label`}, id string column: {`ID string column`}, id string entry: {`ID string entry`}" - )) %>% - { - paste(.$out_string, collapse = "\n") - } - - stop( - nrow(sample_multimatch), - " experimental samples matched multiple MS ID strings: ", - paste(sample_multimatch$name, collapse = ", "), - "\nDetails:\n", - multimatch_details + + return(sqlite_con) +} + + +#' Process MzRoll - Load Peakgroups +#' +#' Load peakgroups table and add mz and rt variables to peakgroups based on +#' peak-level data. +#' +#' @param mzroll_db_con an SQLite connection to an mzrollDB database +#' +#' @return a table of peakgroups +process_mzroll_load_peakgroups <- function(mzroll_db_con){ + + peakgroup_positions <- dplyr::tbl(mzroll_db_con, "peaks") %>% + dplyr::collect() %>% + dplyr::group_by(groupId) %>% + dplyr::summarize( + mz = sum(peakMz * peakAreaTop) / sum(peakAreaTop), + rt = sum(rt * peakAreaTop) / sum(peakAreaTop) ) + + peakgroups <- dplyr::tbl(mzroll_db_con, "peakgroups") %>% + dplyr::collect() %>% + dplyr::select(-compoundId) %>% + # peel off smiles if they are present + tidyr::separate( + compoundName, + into = c("compoundName", "smiles"), + sep = ": ", extra = "drop", fill = "right" + ) %>% + dplyr::left_join(peakgroup_positions, by = "groupId") + + # if provided, prefer display name over compoundName + if ("displayName" %in% colnames(peakgroups)) { + peakgroups <- peakgroups %>% + dplyr::mutate(compoundName = ifelse( + is.na(displayName), + compoundName, + displayName + )) } + + return(peakgroups) +} - condition_multimatch <- sample_ms_id_string_matches %>% - dplyr::count(`tube label`) %>% - dplyr::filter(n > 1) - - if (nrow(condition_multimatch) != 0) { - multimatch_details <- sample_ms_id_string_matches %>% - dplyr::semi_join(condition_multimatch, by = "tube label") %>% - dplyr::arrange(`tube label`) %>% - dplyr::mutate(out_string = glue::glue( - "tube: {`tube label`}, name_match: {name}, id string column: {`ID string column`}, id string entry: {`ID string entry`}" - )) %>% - { - paste(.$out_string, collapse = "\n") - } - - stop( - nrow(sample_multimatch), - " tubes matched multiple experimental samples: ", - paste(condition_multimatch$`tube label`, collapse = ", "), - "\nDetails:\n", - multimatch_details - ) +#' Process MzRoll - Validate Peakgroups +#' +#' If validate is True then remove unvalidated compoundNames (based on +#' peakgroup labels) and identify all unvalidated compounds with an +#' "is_unknown" variable. +#' +#' @param peakgroups a table of distinct ions with characteristic m/z and rt +#' @inheritParams process_mzroll +#' +#' @return a table of peakgroups +process_mzroll_validate_peakgroups <- function(peakgroups, validate){ + + checkmate::assertDataFrame(peakgroups) + checkmate::assertLogical(validate, len = 1) + + if (validate) { + peakgroups <- peakgroups %>% + dplyr::mutate( + compoundName = dplyr::case_when( + is.na(compoundName) ~ NA_character_, + searchTableName == "Bookmarks" & + !stringr::str_detect(string = label, pattern = "b") ~ compoundName, + # backwards compatibility + searchTableName == "rumsDB" & + stringr::str_detect(string = label, pattern = "g") ~ compoundName, + searchTableName == "clamDB" & + stringr::str_detect(string = label, pattern = "g") ~ compoundName, + # directly added EICs + searchTableName == "EICs" ~ compoundName, + TRUE ~ NA_character_ + ), + is_unknown = ifelse(is.na(compoundName), TRUE, FALSE) + ) } + + return(peakgroups) + +} - # add sample fields for mathcing ID strings - - matched_augmented_samples <- samples %>% - dplyr::inner_join(sample_ms_id_string_matches %>% - dplyr::select(sampleId, `tube label`) %>% - dplyr::left_join(sample_sheet_list$sample_sheet, by = "tube label"), - by = "sampleId" - ) - - unmatched_samples <- samples %>% - dplyr::anti_join(sample_ms_id_string_matches, by = "sampleId") - - if (nrow(unmatched_samples) != 0) { - warning( - nrow(unmatched_samples), - " experimental samples were not matched to MS ID strings. Their measurements will be discarded.:\n ", - paste(unmatched_samples$name, collapse = "\n ") - ) +#' Process MzRoll - Identify Peakgroups +#' +#' Either remove unidentified peakgroups or name unknowns using m/z and rt. +#' +#' @inheritParams process_mzroll_validate_peakgroups +#' @inheritParams process_mzroll +#' +#' @return a table of peakgroups +process_mzroll_identify_peakgroups <- function(peakgroups, only_identified){ + + checkmate::assertDataFrame(peakgroups) + + if (only_identified) { + peakgroups <- peakgroups %>% + dplyr::filter(!is.na(compoundName)) + } else { + unknown_groups <- peakgroups %>% + dplyr::filter(is.na(compoundName)) %>% + {.$groupId} + + # label unknowns based on m/z and rt + + unknown_names <- peakgroups %>% + dplyr::filter(groupId %in% unknown_groups) %>% + dplyr::mutate(new_compoundName = as.character(glue::glue( + "unk {round(mz, 3)} @ {round(rt,1)}" + ))) %>% + dplyr::select(groupId, new_compoundName) + + peakgroups <- peakgroups %>% + dplyr::left_join(unknown_names, by = "groupId") %>% + dplyr::mutate( + is_unknown = ifelse(is.na(compoundName), TRUE, FALSE), + compoundName = dplyr::case_when( + !is.na(new_compoundName) ~ new_compoundName, + TRUE ~ compoundName + ) + ) %>% + dplyr::select(-new_compoundName) } - - matched_augmented_samples + + return(peakgroups) } #' Process mzroll multi @@ -440,6 +330,7 @@ augment_samples_with_samplesheet <- function(samples, sample_sheet_list) { #' \item{mzroll_db_path: path to a mzrollDB file} #' } #' @inheritParams process_mzroll +#' @inheritParams merge_samples_tbl #' #' @return an mzroll_list containing three tibbles: #' \itemize{ @@ -448,35 +339,59 @@ augment_samples_with_samplesheet <- function(samples, sample_sheet_list) { #' \item{samples: one row per unique sample (defined by a unique sampleId)}, #' \item{peaks: one row per peak (samples x peakgroups)} #' } -#' +#' +#' @examples +#' mzroll_paths <- tibble::tribble( +#' ~method_tag, ~mzroll_db_path, +#' "method1", nplug_mzroll(), +#' "method2", nplug_mzroll() +#' ) +#' +#' process_mzroll_multi(mzroll_paths, nplug_samples, "sample_name") +#' #' @export -process_mzroll_multi <- function(mzroll_paths, - standard_databases, - sample_sheet_list, - only_identified = TRUE, - validate = FALSE) { - checkmate::assertDataFrame(mzroll_paths) +process_mzroll_multi <- function( + mzroll_paths, + samples_tbl, + id_strings, + only_identified = TRUE, + validate = FALSE, + exact = TRUE + ) { + + checkmate::assertDataFrame(mzroll_paths, min.rows = 2) if (!all(colnames(mzroll_paths) == c("method_tag", "mzroll_db_path"))) { stop("mzroll_paths must contain two columns: method_tag & mnzroll_db_path") } stopifnot(length(mzroll_paths$method_tag) == length(unique(mzroll_paths$method_tag))) - checkmate::assertClass(sample_sheet_list, "list") - stopifnot(names(sample_sheet_list) == c("tracking_sheet_id", "sample_sheet")) - - # call process_mzroll once per dataset + checkmate::assertDataFrame(samples_tbl) + checkmate::assertLogical(only_identified, len = 1) + checkmate::assertLogical(validate, len = 1) + mzroll_list_nest <- mzroll_paths %>% - dplyr::mutate(processed_mzroll = purrr::map2( - mzroll_db_path, - method_tag, - process_mzroll, - standard_databases = standard_databases, - sample_sheet_list = sample_sheet_list, - only_identified = only_identified, - validate = validate - )) - - aggregate_mzroll_list <- aggregate_mzroll_nest(mzroll_list_nest) + dplyr::mutate( + # read each dataset + mzroll_list = purrr::map2( + mzroll_db_path, + method_tag, + process_mzroll, + only_identified = only_identified, + validate = validate + ), + # add sample meta-data + mzroll_list = purrr::map( + mzroll_list, + merge_samples_tbl, + samples_tbl = samples_tbl, + id_strings = id_strings, + exact = exact + )) + + aggregate_mzroll_list <- aggregate_mzroll_nest( + mzroll_list_nest, + samples_tbl + ) mzroll_multi_qc(aggregate_mzroll_list) @@ -489,103 +404,123 @@ process_mzroll_multi <- function(mzroll_paths, #' #' @param mzroll_list_nest a nested list of mzroll_lists produced from #' \link{process_mzroll}. +#' @inheritParams merge_samples_tbl #' #' @return a single mzroll_list -aggregate_mzroll_nest <- function(mzroll_list_nest) { - - # identify samples with shared "tube label" +aggregate_mzroll_nest <- function(mzroll_list_nest, samples_tbl) { + + checkmate::assertDataFrame(mzroll_list_nest) + # check that all mzroll lists have the same design + + designs <- purrr::map(mzroll_list_nest$mzroll_list, function(x) {x$design}) + if (length(unique(designs)) > 1) { + stop("mzroll_lists have different designs and cannot be aggregated") + } + + # identify samples with shared "samples_tbl_row" mzroll_list_all_samples <- mzroll_list_nest %>% - dplyr::mutate(samples = purrr::map(processed_mzroll, function(x) { + dplyr::mutate(samples = purrr::map(mzroll_list, function(x) { x$samples })) %>% - dplyr::select(-method_tag, -processed_mzroll) %>% + dplyr::select(-method_tag, -mzroll_list) %>% tidyr::unnest(samples) consensus_samples <- mzroll_list_all_samples %>% - dplyr::mutate(sampleId = `tube label`) %>% - dplyr::select(-method_tag, -mzroll_db_path, -name, -filename) %>% + dplyr::mutate(sampleId = samples_tbl_row) %>% + dplyr::select(sampleId, samples_tbl_row, !!!rlang::syms(colnames(samples_tbl))) %>% dplyr::group_by(sampleId) %>% dplyr::slice(1) %>% - dplyr::ungroup() + dplyr::ungroup() %>% + dplyr::mutate(sampleId = factor(sampleId, levels = sampleId)) # iterate through mzrollDB - max_peakId <- 0 max_groupId <- 0 - mzroll_indecies <- 1:nrow(mzroll_list_nest) # samples defined upfront as the union of samples from the sample sheet - # that are observed "tube label" will be the new sampleId + # that are observed "samples_tbl_row" will be the new sampleId updated_mzroll_list <- list( - peakgroups = NULL, + features = NULL, samples = consensus_samples, - peaks = NULL + measurements = NULL ) # peaks and peakgroups are updated to unique values across each method - # and sampleId is updated to "tube label" + # and sampleId is updated to "samples_tbl_row" for (a_mzroll_index in mzroll_indecies) { - one_mzroll_list <- mzroll_list_nest$processed_mzroll[[a_mzroll_index]] - stopifnot( - all(c("peakgroups", "samples", "peaks") %in% names(one_mzroll_list)) - ) - - # update groupid and peakId to new unique values - - one_mzroll_list$peakgroups$groupId <- - one_mzroll_list$peakgroups$groupId + max_groupId - one_mzroll_list$peaks$peakId <- - one_mzroll_list$peaks$peakId + max_peakId - one_mzroll_list$peaks$groupId <- - one_mzroll_list$peaks$groupId + max_groupId - - # find sample updates sampleId -> tube label + + one_mzroll_list <- mzroll_list_nest$mzroll_list[[a_mzroll_index]] + checkmate::assertClass(one_mzroll_list, "tomic") + checkmate::assertClass(one_mzroll_list, "mzroll") + + # update groupId to new unique values + # temporarily convert from factors -> integers so they can be added + + one_mzroll_list[["features"]]$groupId <- + as.integer(one_mzroll_list[["features"]]$groupId) + max_groupId + one_mzroll_list[["measurements"]]$groupId <- + as.integer(one_mzroll_list[["measurements"]]$groupId) + max_groupId + + # find sample updates sampleId -> samples_tbl_row sampleId_lookup <- one_mzroll_list$samples %>% - dplyr::select(oldSampleId = sampleId, newSampleId = `tube label`) + dplyr::select(oldSampleId = sampleId, newSampleId = samples_tbl_row) - one_mzroll_list$peaks <- one_mzroll_list$peaks %>% + one_mzroll_list$measurements <- one_mzroll_list$measurements %>% dplyr::left_join(sampleId_lookup, by = c("sampleId" = "oldSampleId")) %>% - dplyr::select( - peakId, - groupId, - sampleId = newSampleId, - log2_abundance, - centered_log2_abundance - ) - - updated_mzroll_list$peaks <- dplyr::bind_rows( - updated_mzroll_list$peaks, - one_mzroll_list$peaks + dplyr::select(-sampleId) %>% + dplyr::rename(sampleId = newSampleId) %>% + dplyr::mutate(sampleId = factor( + sampleId, + levels = levels(consensus_samples$sampleId) + )) %>% + dplyr::select(!!!rlang::syms(colnames(one_mzroll_list$measurements))) + + updated_mzroll_list$measurements <- dplyr::bind_rows( + updated_mzroll_list$measurements, + one_mzroll_list$measurements ) - updated_mzroll_list$peakgroups <- dplyr::bind_rows( - updated_mzroll_list$peakgroups, - one_mzroll_list$peakgroups + updated_mzroll_list$features <- dplyr::bind_rows( + updated_mzroll_list$features, + one_mzroll_list$features ) - max_peakId <- max(one_mzroll_list$peaks$peakId) - max_groupId <- max(one_mzroll_list$peakgroups$groupId) + max_groupId <- max(one_mzroll_list[["features"]]$groupId) } - updated_mzroll_list + # overwrite one of the original mzrolls to update the romic schema + + one_mzroll_list$features <- updated_mzroll_list$features %>% + dplyr::mutate(groupId = factor(groupId, levels = groupId)) + one_mzroll_list$measurements <- updated_mzroll_list$measurements %>% + dplyr::mutate( + groupId = factor( + groupId, + levels = levels(one_mzroll_list$features$groupId) + )) + # update values and schema + one_mzroll_list <- romic::update_tomic(one_mzroll_list, updated_mzroll_list$samples) + + return(one_mzroll_list) } mzroll_multi_qc <- function(mzroll_list) { - mzroll_coverage_table <- mzroll_list$peaks %>% + + mzroll_coverage_table <- mzroll_list$measurements %>% dplyr::left_join( - mzroll_list$peakgroups %>% + mzroll_list$features %>% dplyr::select(groupId, method_tag), by = "groupId" ) %>% dplyr::left_join( mzroll_list$samples %>% - dplyr::select(sampleId, `tube label`), + dplyr::select(sampleId, samples_tbl_row), by = "sampleId" ) %>% - dplyr::count(`tube label`, method_tag) %>% + dplyr::count(samples_tbl_row, method_tag) %>% tidyr::spread(method_tag, n) missed_matches <- mzroll_coverage_table %>% @@ -593,10 +528,10 @@ mzroll_multi_qc <- function(mzroll_list) { if (nrow(missed_matches) != 0) { missed_matches_warning <- missed_matches %>% - tidyr::gather(-`tube label`, key = "method_tag", value = "n_entries") %>% + tidyr::gather(-samples_tbl_row, key = "method_tag", value = "n_entries") %>% dplyr::filter(is.na(n_entries)) %>% dplyr::mutate(out = glue::glue( - "tube: {`tube label`}, method_tag: {method_tag}" + "row: {samples_tbl_row}, method_tag: {method_tag}" )) %>% { paste(.$out, collapse = "\n") @@ -604,7 +539,7 @@ mzroll_multi_qc <- function(mzroll_list) { stop( nrow(missed_matches), - " tubes only matched a subset of methods; this will cause downstream problems. + " rows only matched a subset of methods; this will cause downstream problems. Either update your sample sheet or analyze each method separately. Details:\n", missed_matches_warning @@ -613,3 +548,4 @@ mzroll_multi_qc <- function(mzroll_list) { return(invisible(0)) } + diff --git a/R/injections.R b/R/injections.R new file mode 100644 index 0000000..c709ba3 --- /dev/null +++ b/R/injections.R @@ -0,0 +1,216 @@ +#' Collapse Injections +#' +#' @inheritParams test_mzroll_list +#' @param grouping_vars character vector of sample variables to use for +#' grouping +#' @param peak_quant_vars character vector of quantification variables to +#' average +#' @param collapse_fxn function name to use for collapse +#' +#' @return a \code{\link{process_mzroll}} collapsed across grouping_vars +#' +#' @examples +#' collapse_injections( +#' nplug_mzroll_augmented, +#' grouping_vars = "condition", +#' peak_quant_vars = "log2_abundance" +#' ) +#' @export +collapse_injections <- function( + mzroll_list, + grouping_vars, + peak_quant_vars, + collapse_fxn = "mean" +) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + checkmate::assertCharacter(grouping_vars, min.len = 1) + purrr::walk( + grouping_vars, + checkmate::assertChoice, + choices = mzroll_list$design$samples$variable + ) + checkmate::assertCharacter(peak_quant_vars, min.len = 1) + purrr::walk( + peak_quant_vars, + checkmate::assertChoice, + choices = mzroll_list$design$measurements$variable + ) + checkmate::assertString(collapse_fxn) + stopifnot(length(utils::find(collapse_fxn, mode="function")) >= 1) + + # reduce samples to fields which are defined unambiguously with grouping vars + + found_injections <- find_injections(mzroll_list, grouping_vars) + new_samples <- found_injections$new_samples + collapse_dict <- found_injections$collapse_dict + + new_measurements <- mzroll_list$measurements %>% + dplyr::rename(old_sampleId = sampleId) %>% + dplyr::left_join(collapse_dict, by = "old_sampleId") %>% + dplyr::select(-old_sampleId) %>% + dplyr::group_by(groupId, sampleId) %>% + dplyr::summarize_each(funs = collapse_fxn, peak_quant_vars) %>% + dplyr::ungroup() + + # update sample and measurements and the schema + mzroll_list <- romic::update_tomic(mzroll_list, new_measurements) + mzroll_list <- romic::update_tomic(mzroll_list, new_samples) + + return(mzroll_list) +} + +n_unique <- function(x) { + length(unique(x)) +} + +#' Compare Injections +#' +#' Create a scatterplot of injection correlations. +#' +#' @inheritParams collapse_injections +#' @param peak_quant_var variable to plot in comparison +#' +#' @examples +#' plot_compare_injection( +#' nplug_mzroll_augmented, +#' grouping_vars = "condition", +#' peak_quant_var = "log2_abundance" +#' ) +#' +#' @export +plot_compare_injection <- function( + mzroll_list, + grouping_vars = "condition", + peak_quant_var = "log2_abundance" + ) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + checkmate::assertCharacter(grouping_vars, min.len = 1) + purrr::walk( + grouping_vars, + checkmate::assertChoice, + choices = mzroll_list$design$samples$variable + ) + checkmate::assertChoice( + peak_quant_var, + mzroll_list$design$measurements$variable + ) + + found_injections <- find_injections(mzroll_list, grouping_vars) + collapse_dict <- found_injections$collapse_dict + + tidy_mzroll_list_data <- romic::triple_to_tidy(mzroll_list)$data %>% + dplyr::select( + groupId, + old_sampleId = sampleId, + !!rlang::sym(peak_quant_var) + ) %>% + dplyr::left_join(collapse_dict, by = "old_sampleId") %>% + dplyr::mutate(.entry = 1:dplyr::n()) + + quant_comparison <- tidy_mzroll_list_data %>% + # generate all x all comparisons of sampe peakgroup and same unique sample + dplyr::rename(quant_1 = !!rlang::sym(peak_quant_var)) %>% + dplyr::inner_join( + tidy_mzroll_list_data %>% + dplyr::rename(quant_2 = !!rlang::sym(peak_quant_var)), + by = c("groupId", "sampleId")) %>% + # remove self comparisons, and only retain one pair for each {A-B, B-A} + dplyr::filter(.entry.x < .entry.y) + + R_squared <- round(stats::cor( + quant_comparison$quant_1, + quant_comparison$quant_2 + )^2, 3) + + label_dat <- tibble::tibble( + quant_1 = min(quant_comparison$quant_1), + quant_2 = max(quant_comparison$quant_2), + ) + + bivariate_hex <- ggplot(quant_comparison, aes(x = quant_1, y = quant_2)) + + geom_hex(bins = 50) + + geom_abline(intercept = 0, slope = 1, size = 1, color = "gray50") + + geom_label(hjust = 0, vjust = 1, parse = TRUE, + data = label_dat, + label = glue::glue("R^2 ~ \"=\" ~ {R_squared}") + ) + + scale_fill_viridis_c(trans = "log2", option = "plasma") + + coord_equal() + + scale_x_continuous(paste0(stringr::str_replace_all( + peak_quant_var, "[-_]", " " + ), " A"), expand = c(0,0)) + + scale_y_continuous(paste0(stringr::str_replace_all( + peak_quant_var, "[-_]", " " + ), " B"), expand = c(0,0)) + + theme_minimal() + + theme(axis.line = element_line(color = "black")) + + return(bivariate_hex) +} + +find_injections <- function(mzroll_list, grouping_vars) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + checkmate::assertCharacter(grouping_vars, min.len = 1) + purrr::walk( + grouping_vars, + checkmate::assertChoice, + choices = mzroll_list$design$samples$variable + ) + + not_grouping_vars <- setdiff(colnames(mzroll_list$samples), grouping_vars) + + consistent_reduced_fields <- mzroll_list$samples %>% + dplyr::group_by(!!!syms(grouping_vars)) %>% + dplyr::summarize_all(n_unique) %>% + tidyr::gather(field, n_unique_records, !!!(not_grouping_vars)) %>% + dplyr::group_by(field) %>% + dplyr::filter(all(n_unique_records == 1)) %>% + dplyr::distinct(field) %>% + dplyr::ungroup() + + # names aren't consistent but should still be maintained + reduced_names <- mzroll_list$samples %>% + dplyr::select(name, !!!syms(grouping_vars)) %>% + dplyr::group_by(!!!syms(grouping_vars)) %>% + dplyr::slice(1) %>% + dplyr::ungroup() + + new_samples <- mzroll_list$samples %>% + dplyr::select(c(!!!rlang::syms(grouping_vars), consistent_reduced_fields$field)) %>% + dplyr::group_by(!!!syms(grouping_vars)) %>% + dplyr::slice(1) %>% + dplyr::ungroup() %>% + dplyr::mutate( + sampleId = as.character(1:dplyr::n()), + sampleId = factor(sampleId, levels = sampleId) + ) %>% + dplyr::left_join(reduced_names, by = grouping_vars) + + dropped_fields <- setdiff(colnames(mzroll_list$samples), colnames(new_samples)) + if (length(dropped_fields) > 0) { + message(glue::glue( + "{length(dropped_fields)} sample variables will be dropped since they + - vary for the same grouping_vars: + - {paste(dropped_fields, collapse = ', ')}" + )) + } + + collapse_dict <- mzroll_list$samples %>% + dplyr::select(old_sampleId = sampleId, !!!syms(grouping_vars)) %>% + dplyr::left_join(new_samples %>% + dplyr::select(sampleId, grouping_vars), + by = grouping_vars + ) %>% + dplyr::select(old_sampleId, sampleId) + + return(list( + new_samples = new_samples, + collapse_dict = collapse_dict + )) +} diff --git a/R/mutate_mzroll_list.R b/R/mutate_mzroll_list.R index 9ec9896..d953e87 100644 --- a/R/mutate_mzroll_list.R +++ b/R/mutate_mzroll_list.R @@ -9,21 +9,24 @@ #' #' @return \code{\link{process_mzroll}} #' +#' @examples +#' floored_peaks <- floor_peaks(nplug_mzroll_augmented, 12) +#' #' @export floor_peaks <- function(mzroll_list, log2_floor_value = 12) { # summarize peaks associated with each peakgroup missing_peaks <- tidyr::expand_grid( - groupId = mzroll_list$peakgroups$groupId, + groupId = mzroll_list$features$groupId, sampleId = mzroll_list$samples$sampleId ) %>% - dplyr::anti_join(mzroll_list$peaks, by = c("groupId", "sampleId")) %>% + dplyr::anti_join(mzroll_list$measurements, by = c("groupId", "sampleId")) %>% tibble::as_tibble() %>% dplyr::mutate(log2_abundance = log2_floor_value) # combine detected peaks with peaks that were missing for some samples completed_peaks <- dplyr::bind_rows( - mzroll_list$peaks %>% + mzroll_list$measurements %>% dplyr::mutate(log2_abundance = dplyr::case_when( is.na(log2_abundance) ~ log2_floor_value, log2_abundance < log2_floor_value ~ log2_floor_value, @@ -37,103 +40,11 @@ floor_peaks <- function(mzroll_list, log2_floor_value = 12) { ) %>% dplyr::ungroup() - mzroll_list$peaks <- completed_peaks - - mzroll_list -} - -#' Collapse Injections -#' -#' @inheritParams test_mzroll_list -#' @param grouping_vars character vector of sample variables to use for -#' grouping -#' @param peak_quant_vars character vector of quantification variables to -#' average -#' @param collapse_fxn function to use for collapse -#' -#' @return a \code{\link{process_mzroll}} collapsed across grouping_vars -#' -#' @export -collapse_injections <- function(mzroll_list, - grouping_vars, - peak_quant_vars, - collapse_fxn = "mean") { - stopifnot( - all(class(grouping_vars) %in% "character"), - length(grouping_vars) >= 1 - ) - missing_grouping_vars <- setdiff( - grouping_vars, - colnames(mzroll_list$samples) - ) - if (length(missing_grouping_vars) != 0) { - stop( - length(missing_grouping_vars), - " variables were missing from the samples table: ", - paste(missing_grouping_vars, collapse = ", ") - ) - } - - stopifnot( - all(class(peak_quant_vars) %in% "character"), - length(peak_quant_vars) >= 1 - ) - missing_peak_quant_vars <- setdiff( - peak_quant_vars, - colnames(mzroll_list$peaks) - ) - if (length(missing_peak_quant_vars) != 0) { - stop( - length(missing_peak_quant_vars), - " variables were missing from the samples table: ", - paste(missing_peak_quant_vars, collapse = ", ") - ) - } - - stopifnot(class(collapse_fxn) == "character", length(collapse_fxn) == 1) - - # reduce samples to fields which are defined unambiguously with grouping vars - - consistent_reduced_fields <- mzroll_list$samples %>% - dplyr::group_by(!!!syms(grouping_vars)) %>% - dplyr::summarize_all(n_unique) %>% - tidyr::gather(field, n_unique_records, -grouping_vars) %>% - dplyr::group_by(field) %>% - dplyr::filter(all(n_unique_records == 1)) %>% - dplyr::distinct(field) %>% - dplyr::ungroup() - - new_samples <- mzroll_list$samples %>% - dplyr::select(c(grouping_vars, consistent_reduced_fields$field)) %>% - dplyr::group_by(!!!syms(grouping_vars)) %>% - dplyr::slice(1) %>% - dplyr::ungroup() %>% - dplyr::mutate(sampleId = as.character(1:dplyr::n())) - - collapse_dict <- mzroll_list$samples %>% - dplyr::select(old_sampleId = sampleId, !!!syms(grouping_vars)) %>% - dplyr::left_join(new_samples %>% - dplyr::select(sampleId, grouping_vars), - by = grouping_vars - ) %>% - dplyr::select(old_sampleId, sampleId) - - mzroll_list$peaks <- mzroll_list$peaks %>% - dplyr::rename(old_sampleId = sampleId) %>% - dplyr::left_join(collapse_dict, by = "old_sampleId") %>% - dplyr::select(-old_sampleId) %>% - dplyr::group_by(groupId, sampleId) %>% - dplyr::summarize_each(funs = collapse_fxn, peak_quant_vars) %>% - dplyr::ungroup() + mzroll_list$measurements <- completed_peaks - mzroll_list$samples <- new_samples mzroll_list } -n_unique <- function(x) { - length(unique(x)) -} - #' Normalize Peaks #' #' @inheritParams test_mzroll_list @@ -148,46 +59,69 @@ n_unique <- function(x) { #' \code{.loess_fit} as a peaks variable in addition to #' \code{norm_peak_varname})} #' } -#' @param quant_peak_varname variable in peaks to use for abundance -#' @param norm_peak_varname variable in peaks to add for normalized abundance +#' @param quant_peak_varname variable in measurements to use for abundance +#' @param norm_peak_varname variable in measurements to add for normalized +#' abundance #' @param ... additional arguments to pass to normalization methods #' #' @return a \code{mzroll_list} as generated by \code{\link{process_mzroll}} -#' a \code{norm_peak_varname} variable added to peaks +#' a \code{norm_peak_varname} variable added to measuremnets #' #' @examples #' -#' mzroll_list <- readRDS(authutils::get_clamr_assets("X0083-mzroll-list.Rds")) -#' #' normalize_peaks( -#' mzroll_list, +#' nplug_mzroll_augmented, #' "median polish", #' quant_peak_varname = "log2_abundance", -#' norm_peak_varname = "normalized_log2_IC" +#' norm_peak_varname = "normalized_log2_abundance" #' ) +#' #' normalize_peaks( -#' mzroll_list, +#' nplug_mzroll_augmented, #' "center batches", #' quant_peak_varname = "log2_abundance", -#' norm_peak_varname = "normalized_log2_IC", -#' batch_varnames = "sex", +#' norm_peak_varname = "normalized_log2_abundance", +#' batch_varnames = "month", #' centering_fxn = median #' ) +#' +#' normalize_peaks( +#' nplug_mzroll_augmented, +#' normalization_method = "reference condition", +#' quant_peak_varname = "log2_abundance", +#' norm_peak_varname = "normalized_log2_abundance", +#' condition_varname = "condition", +#' reference_varname = "reference" +#' ) +#' +#' normalize_peaks( +#' nplug_mzroll_augmented, +#' normalization_method = "reference sample", +#' quant_peak_varname = "log2_abundance", +#' norm_peak_varname = "normalized_log2_abundance", +#' batch_varnames = c("month", "extraction"), +#' reference_varname = "exp_ref", +#' reference_values = "ref" +#' ) +#' #' @export -normalize_peaks <- function(mzroll_list, - normalization_method, - quant_peak_varname = "log2_abundance", - norm_peak_varname = "normalized_log2_IC", - ...) { +normalize_peaks <- function( + mzroll_list, + normalization_method, + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_IC", + ... + ) { + dots <- list(...) test_mzroll_list(mzroll_list) checkmate::assertString(quant_peak_varname) - if (!(quant_peak_varname %in% colnames(mzroll_list$peaks))) { + if (!(quant_peak_varname %in% colnames(mzroll_list$measurements))) { stop( "\"quant_peak_varname\":", quant_peak_varname, - ", not present in peaks table" + ", not present in measurements table" ) } checkmate::assertString(norm_peak_varname) @@ -214,6 +148,15 @@ normalize_peaks <- function(mzroll_list, names(dots), names(formals(normalization_method_call)) )] + + unused_args <- setdiff(names(dots), names(formals(normalization_method_call))) + if (length(unused_args) > 0) { + warning(glue::glue( + "{length(unused_args)} passed arguments could not be used by + {normalization_method_call}: {paste(unused_args, collapse = ', ')} ") + ) + } + normalization_output <- do.call( normalization_method_call, append( @@ -229,33 +172,38 @@ normalize_peaks <- function(mzroll_list, normalization_output } +#' Normalize Peaks - Median Polish +#' #' Using a robust metabolomics normalization rescale ion counts based on the #' consensus signal of each sample. #' -#' This normalization was reported in Kamphorst et al. 2015, see +#' @details The robust median polish was reported in Kamphorst et al. 2015, see #' \url{https://github.com/shackett/Pancreatic_tumor_metabolomics}. #' #' @inheritParams normalize_peaks #' @inheritParams floor_peaks #' #' @rdname normalize_peaks -normalize_peaks.median_polish <- function(mzroll_list, - quant_peak_varname, - norm_peak_varname, - log2_floor_value = NA) { +normalize_peaks.median_polish <- function( + mzroll_list, + quant_peak_varname, + norm_peak_varname, + log2_floor_value = NA + ) { + stopifnot(length(log2_floor_value) == 1) if (!is.na(log2_floor_value)) { stopifnot(class(log2_floor_value) == "numeric") - normalization_peaks <- mzroll_list$peaks %>% + normalization_peaks <- mzroll_list$measurements %>% dplyr::filter( !!rlang::sym(quant_peak_varname) > log2_floor_value + 0.001 ) } else { - normalization_peaks <- mzroll_list$peaks + normalization_peaks <- mzroll_list$measurements } - if (any(is.na(mzroll_list$peaks[[quant_peak_varname]]))) { + if (any(is.na(mzroll_list$measurements[[quant_peak_varname]]))) { stop( "NAs are not allowed in ", quant_peak_varname, @@ -287,7 +235,7 @@ normalize_peaks.median_polish <- function(mzroll_list, } if (!is.na(log2_floor_value)) { - mzroll_list$peaks <- mzroll_list$peaks %>% + updated_measurements <- mzroll_list$measurements %>% dplyr::filter( !!rlang::sym(quant_peak_varname) > log2_floor_value + 0.001 ) %>% @@ -296,22 +244,24 @@ normalize_peaks.median_polish <- function(mzroll_list, !!rlang::sym(norm_peak_varname) := !!rlang::sym(quant_peak_varname) - scaling_factor ) %>% - # peaks starting at limit of detection are reset to log2_floor_value + # measurements starting at limit of detection are reset to + # log2_floor_value dplyr::bind_rows( - mzroll_list$peaks %>% + mzroll_list$measurements %>% dplyr::filter( !!rlang::sym(quant_peak_varname) <= log2_floor_value + 0.001 ) %>% dplyr::mutate(!!rlang::sym(norm_peak_varname) := log2_floor_value) ) %>% dplyr::select(-scaling_factor) %>% - # peaks pushed below limit of detection are reset to log2_floor_value + # measurements pushed below limit of detection are reset to + # log2_floor_value dplyr::mutate( !!rlang::sym(norm_peak_varname) := pmax(!!rlang::sym(norm_peak_varname), log2_floor_value) ) } else { - mzroll_list$peaks <- mzroll_list$peaks %>% + updated_measurements <- mzroll_list$measurements %>% dplyr::left_join(sample_scaling_factors, by = "sampleId") %>% dplyr::mutate( !!rlang::sym(norm_peak_varname) := @@ -320,9 +270,13 @@ normalize_peaks.median_polish <- function(mzroll_list, dplyr::select(-scaling_factor) } + mzroll_list <- romic::update_tomic(mzroll_list, updated_measurements) + return(mzroll_list) } +#' Normalize Peaks - Batch Center +#' #' @inheritParams normalize_peaks #' @inheritParams floor_peaks #' @param batch_varnames variables to use for grouping when removing batch @@ -330,12 +284,14 @@ normalize_peaks.median_polish <- function(mzroll_list, #' @param centering_fxn function to use when centering (mean, median, ...) #' #' @rdname normalize_peaks -normalize_peaks.batch_center <- function(mzroll_list, - quant_peak_varname, - norm_peak_varname, - batch_varnames, - log2_floor_value = NA, - centering_fxn = mean) { +normalize_peaks.batch_center <- function( + mzroll_list, + quant_peak_varname, + norm_peak_varname, + batch_varnames, + log2_floor_value = NA, + centering_fxn = mean + ) { checkmate::assertChoice(batch_varnames, colnames(mzroll_list$samples)) stopifnot(length(log2_floor_value) == 1) if (!is.na(log2_floor_value)) { @@ -343,7 +299,7 @@ normalize_peaks.batch_center <- function(mzroll_list, } checkmate::assertFunction(centering_fxn) - centered_peaks <- mzroll_list$peaks %>% + centered_peaks <- mzroll_list$measurements %>% dplyr::left_join(mzroll_list$samples %>% dplyr::select(sampleId, !!!rlang::syms(batch_varnames)), by = "sampleId" @@ -362,7 +318,7 @@ normalize_peaks.batch_center <- function(mzroll_list, !!rlang::sym(quant_peak_varname) - (.center - .group_center) ) %>% dplyr::select( - !!!rlang::syms(c(colnames(mzroll_list$peaks), norm_peak_varname)) + !!!rlang::syms(c(colnames(mzroll_list$measurements), norm_peak_varname)) ) centered_refloor <- normalization_refloor( @@ -372,12 +328,14 @@ normalize_peaks.batch_center <- function(mzroll_list, quant_peak_varname ) - mzroll_list$peaks <- centered_refloor - - mzroll_list + mzroll_list <- romic::update_tomic(mzroll_list, centered_refloor) + + return(mzroll_list) } -#' @details compare each sample to the reference samples within the same batch +#' Normalize Peaks - Reference Sample +#' +#' Compare each sample to the reference samples within the same batch #' #' @inheritParams normalize_peaks.batch_center #' @param reference_varname variable specifying which samples are references @@ -385,31 +343,22 @@ normalize_peaks.batch_center <- function(mzroll_list, #' reference samples #' #' @rdname normalize_peaks -normalize_peaks.reference_sample <- function(mzroll_list, - quant_peak_varname, - norm_peak_varname, - batch_varnames, - reference_varname = "sample", - reference_values = "posctl", - log2_floor_value = NA) { - checkmate::assertString(batch_varnames) - if (!(batch_varnames %in% colnames(mzroll_list$samples))) { - stop( - "\"batch_varnames\":", - batch_varnames, - ", not present in samples table" - ) - } - - checkmate::assertString(reference_varname) - if (!(reference_varname %in% colnames(mzroll_list$samples))) { - stop( - "\"reference_varname\":", - reference_varname, - ", not present in samples table" +normalize_peaks.reference_sample <- function( + mzroll_list, + quant_peak_varname, + norm_peak_varname, + batch_varnames, + reference_varname = "sample", + reference_values = "posctl", + log2_floor_value = NA + ) { + + purrr::walk( + batch_varnames, + checkmate::assertChoice, + colnames(mzroll_list$samples) ) - } - + checkmate::assertChoice(reference_varname, colnames(mzroll_list$samples)) checkmate::assertCharacter(reference_values) stopifnot(length(log2_floor_value) == 1) @@ -417,7 +366,7 @@ normalize_peaks.reference_sample <- function(mzroll_list, stopifnot(class(log2_floor_value) == "numeric") } - annotated_peaks <- mzroll_list$peaks %>% + annotated_peaks <- mzroll_list$measurements %>% dplyr::left_join(mzroll_list$samples %>% dplyr::select( sampleId, @@ -431,10 +380,23 @@ normalize_peaks.reference_sample <- function(mzroll_list, !!rlang::quo(!!rlang::sym(reference_varname) %in% reference_values) ) %>% dplyr::group_by(groupId, !!!rlang::syms(batch_varnames)) %>% - dplyr::summarize(.reference = median(!!rlang::sym(quant_peak_varname))) %>% - dplyr::group_by(groupId) %>% - dplyr::mutate(.reference_diff = .reference - mean(.reference)) + dplyr::summarize( + .reference = median(!!rlang::sym(quant_peak_varname)), + .groups = "drop" + ) + if (is.na(log2_floor_value)) { + # use the relative scale + reference_peaks <- reference_peaks %>% + dplyr::mutate(.reference_diff = .reference) + } else { + # retain the original scale + reference_peaks <- reference_peaks %>% + dplyr::group_by(groupId) %>% + dplyr::mutate(.reference_diff = .reference - mean(.reference)) %>% + dplyr::ungroup() + } + normalized_peaks <- annotated_peaks %>% dplyr::left_join(reference_peaks, by = c("groupId", batch_varnames)) %>% dplyr::mutate( @@ -442,7 +404,7 @@ normalize_peaks.reference_sample <- function(mzroll_list, !!rlang::sym(quant_peak_varname) - .reference_diff ) %>% dplyr::select( - !!!rlang::syms(c(colnames(mzroll_list$peaks), norm_peak_varname)) + !!!rlang::syms(c(colnames(mzroll_list$measurements), norm_peak_varname)) ) # ensure that normalized values still respect the limit of detection @@ -453,12 +415,14 @@ normalize_peaks.reference_sample <- function(mzroll_list, quant_peak_varname ) - mzroll_list$peaks <- reference_refloor + mzroll_list <- romic::update_tomic(mzroll_list, reference_refloor) return(mzroll_list) } -#' @details compare each sample to the reference samples within the same batch +#' Normalize Peaks - Reference Condition +#' +#' Compare each sample to the reference samples within the same batch #' #' @inheritParams normalize_peaks.batch_center #' @param condition_varname variable specifying each sample's condition @@ -469,11 +433,14 @@ normalize_peaks.reference_sample <- function(mzroll_list, #' (using the condition values from condition_varname) #' #' @rdname normalize_peaks -normalize_peaks.reference_condition <- function(mzroll_list, - quant_peak_varname, - norm_peak_varname, - condition_varname = "condition #", - reference_varname = "reference condition #") { +normalize_peaks.reference_condition <- function( + mzroll_list, + quant_peak_varname, + norm_peak_varname, + condition_varname = "condition #", + reference_varname = "reference condition #", + log2_floor_value = NA + ) { # check for valid inputs @@ -517,9 +484,14 @@ normalize_peaks.reference_condition <- function(mzroll_list, ) } + stopifnot(length(log2_floor_value) == 1) + if (!is.na(log2_floor_value)) { + stopifnot(class(log2_floor_value) == "numeric") + } + # summarize reference conditions - annotated_peaks <- mzroll_list$peaks %>% + annotated_peaks <- mzroll_list$measurements %>% dplyr::left_join(mzroll_list$samples %>% dplyr::select( sampleId, @@ -529,15 +501,30 @@ normalize_peaks.reference_condition <- function(mzroll_list, ) # estimate log abundance for each reference condition per group - references <- annotated_peaks %>% + reference_peaks <- annotated_peaks %>% dplyr::filter( !!rlang::sym(condition_varname) %in% unique(mzroll_list$samples[[reference_varname]]) ) %>% dplyr::group_by(groupId, !!rlang::sym(condition_varname)) %>% - dplyr::summarize(.reference = mean(!!rlang::sym(quant_peak_varname))) %>% + dplyr::summarize( + .reference = median(!!rlang::sym(quant_peak_varname)), + .groups = "drop" + ) %>% dplyr::ungroup() + if (is.na(log2_floor_value)) { + # use the relative scale + reference_peaks <- reference_peaks %>% + dplyr::mutate(.reference_diff = .reference) + } else { + # retain the original scale + reference_peaks <- reference_peaks %>% + dplyr::group_by(groupId) %>% + dplyr::mutate(.reference_diff = .reference - mean(.reference)) %>% + dplyr::ungroup() + } + join_by <- rlang::set_names( rlang::quo_name(condition_varname), rlang::quo_name(reference_varname) @@ -545,30 +532,40 @@ normalize_peaks.reference_condition <- function(mzroll_list, normalized_peaks <- annotated_peaks %>% # add reference to each peak - dplyr::left_join(references, by = c("groupId", join_by)) %>% + dplyr::left_join(reference_peaks, by = c("groupId", join_by)) %>% dplyr::mutate( !!rlang::sym(norm_peak_varname) := - !!rlang::sym(quant_peak_varname) - .reference + !!rlang::sym(quant_peak_varname) - .reference_diff ) %>% dplyr::select( - !!!rlang::syms(c(colnames(mzroll_list$peaks), norm_peak_varname)) + !!!rlang::syms(c(colnames(mzroll_list$measurements), norm_peak_varname)) ) - mzroll_list$peaks <- normalized_peaks + # ensure that normalized values still respect the limit of detection + reference_refloor <- normalization_refloor( + normalized_peaks, + log2_floor_value, + norm_peak_varname, + quant_peak_varname + ) + + mzroll_list <- romic::update_tomic(mzroll_list, reference_refloor) return(mzroll_list) } #' @inheritParams normalize_peaks.batch_center #' @param weights_tribble a table containing weights and sample variables to -#' match them to +#' match them to. #' #' @rdname normalize_peaks -normalize_peaks.loess <- function(mzroll_list, - quant_peak_varname, - norm_peak_varname, - weights_tribble = NULL, - log2_floor_value = 12) { +normalize_peaks.loess <- function( + mzroll_list, + quant_peak_varname, + norm_peak_varname, + weights_tribble = NULL, + log2_floor_value = 12 + ) { stopifnot("datetime" %in% colnames(mzroll_list$samples)) stopifnot(length(log2_floor_value) == 1) @@ -601,7 +598,7 @@ normalize_peaks.loess <- function(mzroll_list, dplyr::mutate(weights = 1) } - loess_fits <- mzroll_list$peaks %>% + loess_fits <- mzroll_list$measurements %>% dplyr::left_join(sample_weights, by = "sampleId") %>% tidyr::nest(groupData = -groupId) %>% dplyr::mutate(loess_fits = purrr::map( @@ -613,7 +610,7 @@ normalize_peaks.loess <- function(mzroll_list, dplyr::select(-groupData) %>% tidyr::unnest(loess_fits) %>% dplyr::select(!!!rlang::syms(c( - colnames(mzroll_list$peaks), + colnames(mzroll_list$measurements), ".loess_fit", norm_peak_varname ))) @@ -624,8 +621,8 @@ normalize_peaks.loess <- function(mzroll_list, quant_peak_varname ) - mzroll_list$peaks <- loess_floor - + mzroll_list <- romic::update_tomic(mzroll_list, loess_floor) + return(mzroll_list) } @@ -640,7 +637,9 @@ fit_loess <- function(groupData, quant_peak_varname, norm_peak_varname) { # weights should be inversely proportional to variance loess_model <- stats::loess( - formula = stats::as.formula(glue::glue("{quant_peak_varname} ~ hours_elapsed")), + formula = stats::as.formula( + glue::glue("{quant_peak_varname} ~ hours_elapsed") + ), data = loess_data, weights = weights ) @@ -661,10 +660,12 @@ fit_loess <- function(groupData, quant_peak_varname, norm_peak_varname) { return(loess_apply) } -normalization_refloor <- function(normalized_peaks, - log2_floor_value, - norm_peak_varname, - quant_peak_varname) { +normalization_refloor <- function( + normalized_peaks, + log2_floor_value, + norm_peak_varname, + quant_peak_varname + ) { # If a floor value is supplied floor a peak to this value if it reached the # floor either pre- or post-normalization @@ -681,7 +682,6 @@ normalization_refloor <- function(normalized_peaks, log2_floor_value + 0.001 ~ log2_floor_value, !!rlang::sym(norm_peak_varname) <= log2_floor_value + 0.001 ~ log2_floor_value, - log2_floor_value + 0.001 ~ log2_floor_value, TRUE ~ !!rlang::sym(norm_peak_varname) )) } diff --git a/R/pathway_enrichment.R b/R/pathway_enrichment.R index bb60f8e..3674281 100644 --- a/R/pathway_enrichment.R +++ b/R/pathway_enrichment.R @@ -1,9 +1,11 @@ -#' Pathway Enrichments +#' Find Pathway Enrichments #' #' @inheritParams test_mzroll_list -#' @param significance returned by \code{\link{diffex_mzroll}}; a tibble of -#' tests performed. -#' @inheritParams process_mzroll +#' @inheritParams plot_volcano +#' @param pathway_list a named list where names are pathways and entries +#' are groupIds belonging to each pathway +#' @param ranking_measure variable in \code{regression_significance} to use +#' when calculating enrichment. #' @param test_absolute_effects If TRUE then only consider the magnitude of #' test-statistics when calculating rankings for enrichment; if FALSE then #' consider sign as well. @@ -16,68 +18,60 @@ #' significant enrichments for each term} #' } #' +#' @examples +#' +#' library(dplyr) +#' +#' regression_significance <- diffex_mzroll( +#' nplug_mzroll_normalized, +#' "normalized_log2_abundance", +#' "limitation + limitation:DR + 0" +#' ) +#' pathway_nest <- nplug_mzroll_normalized$features %>% +#' dplyr::select(groupId, pathway) %>% +#' tidyr::nest(pathway_members = groupId) +#' +#' pathway_list <- purrr::map( +#' pathway_nest$pathway_members, +#' function(x) { +#' as.character(x$groupId) +#' } +#' ) +#' names(pathway_list) <- pathway_nest$pathway +#' +#' find_pathway_enrichments( +#' mzroll_list = nplug_mzroll_normalized, +#' regression_significance, +#' pathway_list, +#' test_absolute_effects = FALSE +#' ) +#' #' @export -pathway_enrichments <- function(mzroll_list, - significance, - standard_databases, - test_absolute_effects = TRUE) { +find_pathway_enrichments <- function( + mzroll_list, + regression_significance, + pathway_list, + ranking_measure = "statistic", + test_absolute_effects = TRUE + ) { + test_mzroll_list(mzroll_list) - - if (!("data.frame" %in% class(significance))) { - stop( - "\"significance\" must be a table of tests performed - as generated by diffex_mzroll()" - ) - } - - if (!("list" %in% class(standard_databases))) { - stop( - "\"standard_databases\" must be a list as generated - by configure_db_access()" - ) - } - - checkmate::assertLogical(test_absolute_effects, 1) - - # find all compound annotations - systematic_compounds <- query_systematic_compounds( - systematic_compounds_con = standard_databases$systematic_compounds_con - ) - - # format pathways as a list containing the metabolite members - pathway_nest <- mzroll_list$peakgroups %>% - dplyr::filter(!is.na(systematicCompoundId)) %>% - dplyr::inner_join( - systematic_compounds$metabolic_pathways, - by = "systematicCompoundId" - ) %>% - dplyr::select( - groupId, - systematicCompoundId, - compoundName, - databaseId, - pathwayId, - pathwayName - ) %>% - tidyr::nest(pathway_entries = c(-pathwayId, -pathwayName)) %>% - dplyr::filter(purrr::map_int(pathway_entries, nrow) >= 4) - - pathway_list <- purrr::map( - pathway_nest$pathway_entries, - function(x) { - as.character(x$groupId) - } - ) - names(pathway_list) <- pathway_nest$pathwayName - + + checkmate::assertDataFrame(regression_significance) + checkmate::assertList(pathway_list, names = "unique") + checkmate::assertChoice(ranking_measure, colnames(regression_significance)) + checkmate::assertLogical(test_absolute_effects, len = 1) + + regression_significance <- regression_significance %>% + dplyr::mutate(ranking_measure = !!rlang::sym(ranking_measure)) + if (test_absolute_effects) { - significance <- significance %>% - dplyr::mutate(statistic = abs(statistic)) + regression_significance <- regression_significance %>% + dplyr::mutate(ranking_measure = abs(ranking_measure)) } - enrichment_by_term <- mzroll_list$peakgroups %>% - dplyr::filter(!is.na(systematicCompoundId)) %>% - dplyr::inner_join(significance, by = "groupId") %>% + enrichment_by_term <- mzroll_list$features %>% + dplyr::inner_join(regression_significance, by = "groupId") %>% tidyr::nest(term_data = -term) %>% dplyr::mutate(gsea_results = purrr::map( term_data, @@ -99,28 +93,35 @@ pathway_enrichments <- function(mzroll_list, tidyr::unnest(fgsea_results) %>% dplyr::select(-leadingEdge) + pathway_nest <- purrr::map2( + names(pathway_list), + unname(pathway_list), + function(x,y){ + tibble::tibble(pathway = x) %>% + dplyr::mutate(pathway_members = list(tibble::tibble(groupId = y))) + }) %>% + dplyr::bind_rows() + # Issue 569: add back peak group and compound data, removed by fgsea::fgsea() enrichment_table_full <- dplyr::inner_join( enrichment_table, pathway_nest, - by = c("pathway" = "pathwayName") - ) %>% + by = "pathway" + ) %>% dplyr::select( term, - pathwayId, pathway, - pathway_entries, + pathway_members, pval, padj, ES, NES, - nMoreExtreme, size, enrichment_plot ) enrichment_table_full_expanded <- enrichment_table_full %>% - tidyr::unnest(cols = c(pathway_entries)) + tidyr::unnest(cols = c(pathway_members)) pathway_enrichments_output <- list( enrichment_table = enrichment_table_full, @@ -142,42 +143,61 @@ pathway_enrichments <- function(mzroll_list, #' #' @return a list containing a summary plot for the term and tibble #' containing data and plots for all comparisons performed. -calculate_pathway_enrichment <- function(term_data, - pathway_list, - padj_cutoff = 0.2) { - stopifnot( - "numeric" %in% class(padj_cutoff), - length(padj_cutoff) == 1, - padj_cutoff > 0, - padj_cutoff <= 1 - ) - - ranked_coefs <- term_data$statistic +calculate_pathway_enrichment <- function( + term_data, + pathway_list, + padj_cutoff = 0.2 + ) { + + checkmate::assertDataFrame(term_data) + checkmate::assertList(pathway_list, names = "unique") + checkmate::assertNumber(padj_cutoff, lower = 0, upper = 1) + + ranked_coefs <- term_data$ranking_measure names(ranked_coefs) <- as.character(term_data$groupId) - sorted_ranked_coefs <- sort(ranked_coefs, decreasing = TRUE) + # choose an appropriate score type based on whether coefs can be negative + score_type <- ifelse(all(sorted_ranked_coefs >= 0), "pos", "std") + fgsea_results <- fgsea::fgsea( pathway_list, sorted_ranked_coefs, - nperm = 10000 + scoreType = score_type ) + # generate an enrichment plot for each pathway - fgsea_results$enrichment_plot <- purrr::map( - pathway_list, - fgsea::plotEnrichment, - ranked_coefs - ) + enrichment_plots <- tibble::tibble( + pathway = names(pathway_list), + members = unname(pathway_list) + ) %>% + dplyr::mutate(enrichment_plot = purrr::map( + members, + fgsea::plotEnrichment, + ranked_coefs + )) %>% + dplyr::select(-members) + + fgsea_results <- fgsea_results %>% + dplyr::left_join(enrichment_plots, by = "pathway") # generate a plot summarizing the largest pathway changes - gsea_table_grob <- fgsea::plotGseaTable( - pathway_list[fgsea_results$padj < padj_cutoff], - ranked_coefs, - fgsea_results, - gseaParam = 0.5, - render = FALSE - ) - + reported_pathways <- fgsea_results %>% + dplyr::filter(padj < padj_cutoff) %>% + dplyr::arrange(padj) + + if (nrow(reported_pathways) == 0) { + gsea_table_grob <- NULL + } else { + gsea_table_grob <- fgsea::plotGseaTable( + pathway_list[reported_pathways$pathway], + ranked_coefs, + fgsea_results, + gseaParam = 0.5, + render = FALSE + ) + } + list( fgsea_results = fgsea_results, gsea_table_grob = list(plot = gsea_table_grob) diff --git a/R/sample_metadata.R b/R/sample_metadata.R index 384ab7b..f813269 100644 --- a/R/sample_metadata.R +++ b/R/sample_metadata.R @@ -1,198 +1,142 @@ -#' Find Tracking Sheet +#' Merge Samples Table #' -#' Locate the meta data tracking sheet for the project -#' -#' @inheritParams stringr::str_detect -#' -#' @return tracking_sheet_id: if a unique tracking sheet is found, then -#' return the googlesheets id -#' -#' @examples -#' \dontrun{ -#' find_tracking_sheet(pattern = "X0083") -#' } +#' Merge a table of sample metadata with an existing mzroll_list #' +#' @inheritParams test_mzroll_list +#' @param samples_tbl Table of sample metadata +#' @param id_strings one or more variables which will be used to match +#' sample names. +#' @param exact if true, an exact match between mzroll names and id_strings +#' will be found; if false, then substring matches will be used. +#' +#' @examples +#' mzroll_list <- process_mzroll(nplug_mzroll()) +#' merge_samples_tbl(mzroll_list, nplug_samples, "sample_name") +#' #' @export -find_tracking_sheet <- function(pattern) { - - # search in Metabolomics Core > Experiments > Project ID - experiments <- googledrive::drive_ls( - googledrive::as_id("0B6szsSC6az3YN0dhZDZ4bzFLM00") - ) - - experiments_folder <- experiments %>% - dplyr::filter(stringr::str_detect(name, pattern)) - - if (nrow(experiments_folder) > 1) { +merge_samples_tbl <- function( + mzroll_list, + samples_tbl, + id_strings, + exact = TRUE + ) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + checkmate::assertDataFrame(samples_tbl) + + if ("samples_tbl_row" %in% colnames(samples_tbl)) { stop( - nrow(reduced_tracking_sheets), - ' experiments folders matched "pattern": ', - paste(reduced_tracking_sheets$name, collapse = ", ") - ) - } else if (nrow(experiments_folder) == 1) { - # check experiment folder for .gsheet - experiment_tracking_sheet <- googledrive::as_id(experiments_folder$id) %>% - googledrive::drive_ls() %>% - # filter to just googlesheets with relevant worksheets - dplyr::mutate(is_tracking_sheet = purrr::map_lgl( - id, - identify_tracking_sheet - )) %>% - dplyr::filter(is_tracking_sheet) - - if (nrow(experiment_tracking_sheet) > 1) { - # too many sheets found - stop( - nrow(experiment_tracking_sheet), - ' tracking sheets found in folder matching "pattern": ', - paste(reduced_tracking_sheets$name, collapse = ", ") - ) - } else if (nrow(experiment_tracking_sheet) == 1) { - # return ID uniquely found in folder - return(experiment_tracking_sheet$id) - } else { - # continue looking for sheets - message( - "No experiments folder matching pattern in Experiments folder - checking in sample sheets" + "\"samples_tbl_row\" is a reserved variable in samples_tbl, + do not include it" ) } + + checkmate::assertCharacter(id_strings) + purrr::walk(id_strings, checkmate::assertChoice, colnames(samples_tbl)) + checkmate::assertLogical(exact) + + # match samples_tbl to mzroll samples + + samples_tbl <- samples_tbl %>% + dplyr::mutate(samples_tbl_row = 1:dplyr::n()) + + # read all id_strings - substrings used to match sample names in mzroll + + ms_id_strings <- samples_tbl %>% + dplyr::select(!!!rlang::syms(c("samples_tbl_row", id_strings))) %>% + dplyr::mutate_at(dplyr::vars(-samples_tbl_row), as.character) %>% + tidyr::gather(id_string_var, id_string_value, -samples_tbl_row) %>% + dplyr::filter(!is.na(id_string_value)) + + sample_ms_id_string_matches <- mzroll_list$samples %>% + tidyr::crossing(ms_id_strings) + + if (exact) { + sample_ms_id_string_matches <- sample_ms_id_string_matches %>% + dplyr::filter(name == id_string_value) } else { - # continue looking for sheets - message( - "No experiments folder matching pattern in Experiments folder, - checking in sample sheets" - ) + sample_ms_id_string_matches <- sample_ms_id_string_matches %>% + dplyr::filter(stringr::str_detect(name, id_string_value)) } - - # search in Metabolomics Core > Experimental Tracking Spreadsheets - tracking_sheets <- googledrive::drive_ls( - googledrive::as_id("1PSk5g-sYkwT5jMTOQRO9BUozoruAJJGI") - ) - - reduced_tracking_sheets <- tracking_sheets %>% - dplyr::filter(stringr::str_detect(name, pattern)) - - if (nrow(reduced_tracking_sheets) != 1) { - stop( - nrow(reduced_tracking_sheets), - ' tracking sheets matched "pattern :"', - paste(reduced_tracking_sheets$name, collapse = ", ") - ) - } else { - reduced_tracking_sheets$id - } -} - -identify_tracking_sheet <- function(id) { - id_sheets <- try(googlesheets4::sheets_sheets(id), silent = TRUE) - - if (class(id_sheets) == "try-error") { - return(FALSE) - } - - if (all( - c("USER SUBMISSION", "USER SAMPLE LIST", "Metabolomics User Sample List") - %in% id_sheets - )) { - TRUE - } else { - FALSE - } -} - -#' Read Sample List -#' -#' @param tracking_sheet_id output of \link{find_tracking_sheet} -read_sample_list <- function(tracking_sheet_id) { - sample_list <- googlesheets4::read_sheet( - tracking_sheet_id, - sheet = "Metabolomics User Sample List" - ) - - # require a few standard fields in the Metabolomics User Sample List - required_sample_list_vars <- c( - "tube label", - "sample description", - "condition #", - "reference condition #", - "MS ID string", - "MS ID string alternative" - ) - missing_required_vars <- setdiff( - required_sample_list_vars, - colnames(sample_list) - ) - if (length(missing_required_vars) != 0) { - stop( - "\"Metabolomics User Sample List\" is missing required variables: ", - paste(missing_required_vars, collapse = ", ") - ) - } - - # check for tube label uniqueness - - duplicated_tube_labels <- sample_list %>% - dplyr::count(`tube label`) %>% + + # check whether 1 sample matches 2+ IDs + + sample_multimatch <- sample_ms_id_string_matches %>% + dplyr::count(name) %>% dplyr::filter(n > 1) - - if (nrow(duplicated_tube_labels) != 0) { + + if (nrow(sample_multimatch) != 0) { + multimatch_details <- sample_ms_id_string_matches %>% + dplyr::semi_join(sample_multimatch, by = "name") %>% + dplyr::arrange(name) %>% + dplyr::mutate(out_string = glue::glue( + "name: {name} matching row: {samples_tbl_row}, id string column: {id_string_var}, id string entry: {id_string_value}" + )) %>% + { + paste(.$out_string, collapse = "\n") + } + stop( - nrow(duplicated_tube_labels), - " tube labels were not unique, duplicated labels: ", - paste(duplicated_tube_labels$`tube label`, collapse = ", ") + nrow(sample_multimatch), + " experimental samples matched multiple ID strings: ", + paste(sample_multimatch$name, collapse = ", "), + "\nDetails:\n", + multimatch_details ) } - - # check for uniqueness of every MS ID string and alternatives - - duplicated_id_strings <- sample_list %>% - dplyr::select(`MS ID string`, `MS ID string alternative`) %>% - tidyr::gather("ID string column", "ID string entry") %>% - dplyr::filter(!is.na(`ID string entry`)) %>% - dplyr::count(`ID string entry`) %>% + + # check whether 1 ID matches 2+ samples + + condition_multimatch <- sample_ms_id_string_matches %>% + dplyr::count(samples_tbl_row) %>% dplyr::filter(n > 1) - - if (nrow(duplicated_id_strings) != 0) { + + if (nrow(condition_multimatch) != 0) { + multimatch_details <- sample_ms_id_string_matches %>% + dplyr::semi_join(condition_multimatch, by = "samples_tbl_row") %>% + dplyr::arrange(samples_tbl_row) %>% + dplyr::mutate(out_string = glue::glue( + "row: {samples_tbl_row} matching name_match: {name} with id string column: {id_string_var}, id string entry: {id_string_value}" + )) %>% + { + paste(.$out_string, collapse = "\n") + } + stop( - nrow(duplicated_id_strings), - " MS ID strings were not unique: ", - paste(duplicated_id_strings$`ID string entry`, collapse = ", ") + nrow(condition_multimatch), + " rows in sample_tbl matched multiple experimental samples: ", + paste(condition_multimatch$samples_tbl_row, collapse = ", "), + "\nDetails:\n", + multimatch_details ) } - - # check that all reference conditions exist - - if ( - !all(sample_list$`reference condition #` %in% sample_list$`condition #`) - ) { - stop("some reference condition #s not found as condition #s") + + matched_augmented_samples <- mzroll_list$samples %>% + dplyr::inner_join( + sample_ms_id_string_matches %>% + dplyr::select(sampleId, samples_tbl_row) %>% + dplyr::left_join(samples_tbl, by = "samples_tbl_row"), + by = "sampleId" + ) + + unmatched_samples <- mzroll_list$samples %>% + dplyr::anti_join(sample_ms_id_string_matches, by = "sampleId") + + if (nrow(unmatched_samples) != 0) { + warning( + nrow(unmatched_samples), + " experimental samples were not matched to ID strings. Their measurements will be discarded.:\n ", + paste(unmatched_samples$name, collapse = "\n ") + ) } - - sample_list -} - -#' Import Sample List -#' -#' Find, read and process a sample meta sheet from googlesheets -#' -#' @inheritParams find_tracking_sheet -#' -#' @examples -#' \dontrun{ -#' import_sample_sheet(pattern = "X0083") -#' } -#' -#' @export -import_sample_sheet <- function(pattern) { - tracking_sheet_id <- find_tracking_sheet(pattern = pattern) - - sample_sheet <- read_sample_list(tracking_sheet_id) - - list( - tracking_sheet_id = tracking_sheet_id, - sample_sheet = sample_sheet - ) + + mzroll_list <- romic::update_tomic( + mzroll_list, + matched_augmented_samples + ) + + return(mzroll_list) } #' Remove Constant Name @@ -267,3 +211,246 @@ tibble_name <- function(name) { r_position = dplyr::n():1 ) } + +#' Generate relevant lipid components from compound name +#' +#' Produce data frame with lipid components from name +#' +#' @param compound_name vector of compound names +#' +#' @return compound_name vector expanded to large table, with structural +#' components described as their own columns. +#' +#' @export +lipid_components <- function(compound_name) { + compound_components <- tibble::tibble("compoundName" = compound_name) %>% + # general compound information + dplyr::mutate( + lipidClass = stringr::str_extract(compoundName, "^.*(?=\\()") + ) %>% + dplyr::mutate( + compound_name_adduct = stringr::str_extract(compoundName, "(?<=\\) ).*$") + ) %>% + dplyr::mutate( + plasmalogen_type = stringr::str_extract(compoundName, "[op]-(?=[0-9]+:)") + ) %>% + dplyr::mutate(lipidClass_o_p = ifelse( + is.na(plasmalogen_type), + lipidClass, + paste0(plasmalogen_type, lipidClass) + )) %>% + dplyr::mutate(lipidClass_plas = ifelse( + is.na(plasmalogen_type), + lipidClass, + paste0("plas-", lipidClass) + )) %>% + # Parse all chains. Also retrieves number of hydroxyl groups on each + # FA (m=1, d=2, t=3), if available + dplyr::mutate(sn_chains = stringr::str_extract_all( + compoundName, + "(?<=[\\(/_])[A-Za-z]?-?[0-9]+:[0-9]+;?O?[0-9]?(?=[\\)/_])" + )) %>% + dplyr::mutate(num_sn_chains = sapply(sn_chains, function(x) { + length(x) + })) %>% + dplyr::mutate(num_unique_sn_chains = sapply(sn_chains, function(x) { + x %>% + unique() %>% + length() + })) %>% + # sn1 + dplyr::mutate(sn1 = sapply(sn_chains, function(x) { + x[1] + })) %>% + dplyr::mutate(FA1 = stringr::str_extract(sn1, "[0-9]+:[0-9]+")) %>% + dplyr::mutate(single_1 = as.numeric( + stringr::str_extract(FA1, ".*(?=:)") + )) %>% + dplyr::mutate(single_1 = ifelse(is.na(single_1), 0, single_1)) %>% + dplyr::mutate(double_1 = as.numeric( + stringr::str_extract(FA1, "(?<=:).*") + )) %>% + dplyr::mutate(double_1 = ifelse(is.na(double_1), 0, double_1)) %>% + dplyr::mutate(double_1 = ifelse( + (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), + double_1 + 1, + double_1 + )) %>% + dplyr::mutate(OH_1 = dplyr::case_when( + grepl("^m", sn1) ~ 1, + grepl("^d", sn1) ~ 2, + grepl("^t", sn1) ~ 3, + grepl(";O4", sn1) ~ 4, + grepl(";O3", sn1) ~ 3, + grepl(";O2", sn1) ~ 2, + grepl(";O1", sn1) ~ 1, + grepl(";O", sn1) ~ 1, + TRUE ~ 0 + )) %>% + # sn2 + dplyr::mutate(sn2 = sapply(sn_chains, function(x) { + x[2] + })) %>% + dplyr::mutate(FA2 = stringr::str_extract(sn2, "[0-9]+:[0-9]+")) %>% + dplyr::mutate(single_2 = as.numeric( + stringr::str_extract(FA2, ".*(?=:)") + )) %>% + dplyr::mutate(single_2 = ifelse(is.na(single_2), 0, single_2)) %>% + dplyr::mutate(double_2 = as.numeric( + stringr::str_extract(FA2, "(?<=:).*") + )) %>% + dplyr::mutate(double_2 = ifelse(is.na(double_2), 0, double_2)) %>% + dplyr::mutate(OH_2 = dplyr::case_when( + grepl("^m", sn2) ~ 1, + grepl("^d", sn2) ~ 2, + grepl("^t", sn2) ~ 3, + grepl(";O4", sn2) ~ 4, + grepl(";O3", sn2) ~ 3, + grepl(";O2", sn2) ~ 2, + grepl(";O1", sn2) ~ 1, + grepl(";O", sn2) ~ 1, + TRUE ~ 0 + )) %>% + # sn3 (for TGs) + dplyr::mutate(sn3 = sapply(sn_chains, function(x) { + x[3] + })) %>% + dplyr::mutate(FA3 = stringr::str_extract(sn3, "[0-9]+:[0-9]+")) %>% + dplyr::mutate(single_3 = as.numeric( + stringr::str_extract(FA3, ".*(?=:)") + )) %>% + dplyr::mutate(single_3 = ifelse(is.na(single_3), 0, single_3)) %>% + dplyr::mutate(double_3 = as.numeric( + stringr::str_extract(FA3, "(?<=:).*") + )) %>% + dplyr::mutate(double_3 = ifelse(is.na(double_3), 0, double_3)) %>% + dplyr::mutate(OH_3 = dplyr::case_when( + grepl("^m", sn3) ~ 1, + grepl("^d", sn3) ~ 2, + grepl("^t", sn3) ~ 3, + grepl(";O4", sn3) ~ 4, + grepl(";O3", sn3) ~ 3, + grepl(";O2", sn3) ~ 2, + grepl(";O1", sn3) ~ 1, + grepl(";O", sn3) ~ 1, + TRUE ~ 0 + )) %>% + # sn4 (for Cardiolipins CLs) + dplyr::mutate(sn4 = sapply(sn_chains, function(x) { + x[4] + })) %>% + dplyr::mutate(FA4 = stringr::str_extract(sn4, "[0-9]+:[0-9]+")) %>% + dplyr::mutate(single_4 = as.numeric( + stringr::str_extract(FA4, ".*(?=:)") + )) %>% + dplyr::mutate(single_4 = ifelse(is.na(single_4), 0, single_4)) %>% + dplyr::mutate(double_4 = as.numeric( + stringr::str_extract(FA4, "(?<=:).*") + )) %>% + dplyr::mutate(double_4 = ifelse(is.na(double_4), 0, double_4)) %>% + dplyr::mutate(OH_4 = dplyr::case_when( + grepl("^m", sn4) ~ 1, + grepl("^d", sn4) ~ 2, + grepl("^t", sn4) ~ 3, + grepl(";O4", sn4) ~ 4, + grepl(";O3", sn4) ~ 3, + grepl(";O2", sn4) ~ 2, + grepl(";O1", sn4) ~ 1, + grepl(";O", sn4) ~ 1, + TRUE ~ 0 + )) %>% + # finally, put together all info from chains to describe summed + # composition. sometimes, the compound is already reported as a summed + # composition. In that case, return the original name. + dplyr::mutate(total_single = ifelse( + num_sn_chains == 0, + as.numeric(stringr::str_extract( + compoundName, + "(?<=\\()[A-Za-z]?-?[0-9]+(?=:[0-9]+,)" + )), + (single_1 + single_2 + single_3 + single_4) + )) %>% + dplyr::mutate(total_single = ifelse( + is.na(total_single), + 0, + total_single + )) %>% + dplyr::mutate(total_double = ifelse( + num_sn_chains == 0, + as.numeric(stringr::str_extract(compoundName, "(?<=:)[0-9]+(?=,)")), + (double_1 + double_2 + double_3 + double_4) + )) %>% + dplyr::mutate(total_double = ifelse( + is.na(total_double), + 0, + total_double + )) %>% + dplyr::mutate(total_OH = ifelse( + num_sn_chains == 0, + as.numeric(stringr::str_extract(compoundName, "(?<=,)[0-9]+(?=-)")), + OH_1 + OH_2 + OH_3 + OH_4 + )) %>% + dplyr::mutate(total_OH = ifelse(is.na(total_OH), 0, total_OH)) %>% + dplyr::mutate(sumComposition = dplyr::case_when( + num_sn_chains > 0 & total_OH == 0 ~ + glue::glue("{class}({plas}{single}:{double})", + class = lipidClass, + plas = ifelse(is.na(plasmalogen_type), "", plasmalogen_type), + single = total_single, + double = ifelse( + (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), + (total_double - 1), + total_double + ) + ), + num_sn_chains > 0 & total_OH == 1 ~ + glue::glue( + "{class}({plas}{single}:{double};O)", + class = lipidClass, + plas = ifelse(is.na(plasmalogen_type), "", plasmalogen_type), + single = total_single, + double = ifelse( + (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), + (total_double - 1), + total_double + ) + ), + num_sn_chains > 0 & total_OH > 1 ~ + glue::glue( + "{class}({plas}{single}:{double};O{num_OH})", + class = lipidClass, + plas = ifelse(is.na(plasmalogen_type), "", plasmalogen_type), + single = total_single, + double = ifelse( + (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), + (total_double - 1), + total_double + ), + num_OH = total_OH + ), + num_sn_chains == 0 ~ compoundName + )) %>% + dplyr::mutate(etherPlasmalogenCompoundName = ifelse( + is.na(plasmalogen_type), + compoundName, + glue::glue( + "{class}({single}:{double}e/{sn2_chain})", + class = lipidClass, + single = single_1, + double = double_1, + sn2_chain = FA2 + ) + )) %>% + dplyr::mutate(etherPlasmalogenSumComposition = ifelse( + is.na(plasmalogen_type), + sumComposition, + glue::glue( + "{class}({single}:{double}e)", + class = lipidClass, + single = total_single, + double = total_double + ) + )) + + compound_components +} diff --git a/R/systematic_db_query.R b/R/systematic_db_query.R deleted file mode 100644 index 3d73e66..0000000 --- a/R/systematic_db_query.R +++ /dev/null @@ -1,137 +0,0 @@ -#' Query Systematic Compounds -#' -#' @param systematicCompoundIds NULL to query all compounds or a numeric -#' vector of systematic compound IDs -#' @param systematic_compounds_con A connection to an SQL database which -#' stores the structure and systematic IDs of well-studied compounds. -#' -#' @return a list containing: -#' \itemize{ -#' \item{distinct_compounds: unique database entries} -#' \item{database_ids: systematic IDs of compounds in KEGG, HMDB, ...} -#' \item{compound_aliases: alternative names for a compound} -#' \item{metabolic_pathways: KEGG pathways associated with a metabolite} -#' } -#' -#' @export -#' -#' @examples -#' \dontrun{ -#' systematic_compounds_con <- configure_db_access()$systematic_compounds_con -#' systematic_compounds <- query_systematic_compounds( -#' systematic_compounds_con = systematic_compounds_con -#' ) -#' } -query_systematic_compounds <- function(systematicCompoundIds = NULL, - systematic_compounds_con) { - stopifnot(all( - class(systematicCompoundIds) %in% c("NULL", "numeric", "integer") - )) - - distinct_compounds <- dplyr::tbl( - systematic_compounds_con, - "distinct_compounds" - ) %>% - dplyr::collect() - database_ids <- dplyr::tbl( - systematic_compounds_con, - "database_ids" - ) %>% - dplyr::collect() - compound_aliases <- dplyr::tbl( - systematic_compounds_con, - "compound_aliases" - ) %>% - dplyr::collect() - metabolic_pathways <- dplyr::tbl( - systematic_compounds_con, - "metabolic_pathways" - ) %>% - dplyr::collect() - - if (is.null(systematicCompoundIds[1])) { - systematicCompoundIds <- distinct_compounds$compoundId - } - - list( - distinct_compounds = distinct_compounds, - database_ids = database_ids, - compound_aliases = compound_aliases, - metabolic_pathways = metabolic_pathways - ) %>% - purrr::map( - .f = function(a_tbl, ids) { - a_tbl %>% - dplyr::rename(systematicCompoundId = compoundId) %>% - dplyr::filter(systematicCompoundId %in% ids) - }, - ids = systematicCompoundIds - ) -} - -#' Summarize compound pathways -#' -#' Reduce compound pathway annotations to a set of core "focus" pathways and -#' to well-represented (>= \code{min_pw_size} examples) general pathways. -#' -#' @param systematic_compounds The output of -#' \code{\link{query_systematic_compounds}}. -#' @param min_pw_size Minimum number of examples of metabolites from a pathway -#' for the pathway not to be filtered. -#' @param focus_pathways A character vector of names of KEGG pathways to be -#' included as \code{focus_pathway} annotations. -#' -#' @return a tibble of systematicCompoundIds and pathway annotations -#' -#' @export -summarize_compound_pathways <- function(systematic_compounds, - min_pw_size = 5L, - focus_pathways = c( - "Glycolysis / Gluconeogenesis", - "Citrate cycle (TCA cycle)", - "Pentose phosphate pathway", - "Biosynthesis of amino acids", - "Purine metabolism", - "Pyrimidine metabolism" - )) { - checkmate::assertNumber(min_pw_size, lower = 0) - checkmate::assertCharacter(focus_pathways) - - represented_pathways <- systematic_compounds$metabolic_pathways %>% - dplyr::count(pathwayName) %>% - dplyr::mutate(is_focus_pathway = ifelse( - pathwayName %in% focus_pathways, - TRUE, - FALSE - )) %>% - dplyr::filter(n >= min_pw_size | is_focus_pathway) - - # label compounds in focus pathway with most specific annotation - - focus_annotations <- systematic_compounds$metabolic_pathways %>% - dplyr::inner_join(represented_pathways %>% - dplyr::filter(is_focus_pathway), - by = "pathwayName" - ) %>% - dplyr::group_by(systematicCompoundId) %>% - dplyr::arrange(n) %>% - dplyr::slice(1) %>% - dplyr::select(systematicCompoundId, focus_pathway = pathwayName) %>% - dplyr::ungroup() - - general_annotations <- systematic_compounds$metabolic_pathways %>% - dplyr::inner_join(represented_pathways, by = "pathwayName") %>% - dplyr::group_by(systematicCompoundId) %>% - dplyr::arrange(n) %>% - dplyr::slice(1) %>% - dplyr::select(systematicCompoundId, pathway = pathwayName) %>% - dplyr::ungroup() - - general_annotations %>% - dplyr::left_join(focus_annotations, by = "systematicCompoundId") %>% - dplyr::mutate(focus_pathway = ifelse( - is.na(focus_pathway), - "Other", - focus_pathway - )) -} diff --git a/R/utils.R b/R/utils.R index 28563b0..b53265c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,599 +1,139 @@ -#' \code{calicomics} package -#' -#' Calicomics can read .mzrollDB files created using MAVEN or Quahog into an -#' mzroll_list. These lists can then be modified through normalization, -#' signal flooring, and filtering. Differential abundance analysis of -#' metabolites can then be performed along with a variety of visualizations. -#' -#' @docType package -#' @name calicomics -#' @importFrom dplyr %>% -#' @import ggplot2 -NULL - -## quiets concerns of R CMD check re: the .'s that appear in pipelines -utils::globalVariables(c( - ".", - ":=", - ".center", - ".group_center", - ".loess_fit", - ".loess_shift", - ".reference", - ".reference_diff", - "abund_mean", - "abund_se", - "abundance", - "anova_signif", - "centered_log2_abundance", - "char", - "compoundId", - "compoundName", - "databaseId", - "datetime", - "diff_to_median", - "displayName", - "double_1", - "double_2", - "double_3", - "double_4", - "enrichment_plot", - "ES", - "FA1", - "FA2", - "FA3", - "FA4", - "fdr_summary", - "field", - "filename", - "focus_pathway", - "fgsea_results", - "groupData", - "groupId", - "gsea_results", - "gsea_table_grob", - "id", - "ID string entry", - "lm_fits", - "is_discovery", - "is_focus_pathway", - "is_tracking_sheet", - "is_unknown", - "label", - "leadingEdge", - "lipidClass", - "log2_abundance", - "median", - "median_abund", - "method_tag", - "MS ID string", - "MS ID string alternative", - "mz_delta", - "mz_median", - "mzmax", - "mzmax_adj", - "mzmin", - "mzmin_adj", - "mzroll_db_path", - "n", - "n_entries", - "n_unique_records", - "name", - "name_tibble", - "NES", - "new_compoundName", - "newSampleId", - "nMoreExtreme", - "num_sn_chains", - "OH_1", - "OH_2", - "OH_3", - "OH_4", - "old_sampleId", - "ordered_groupId", - "ordered_sampleId", - "one_peak_data", - "p.value", - "p.value.trans", - "padj", - "pathway", - "pathway_entries", - "pathwayId", - "pathwayName", - "peak_label", - "peakAreaTop", - "peakMz", - "peakId", - "plasmalogen_type", - "position", - "processed_mzroll", - "pval", - "qvalue", - "r_position", - "rt", - "rt_delta", - "rt_median", - "rtmax", - "rtmax_adj", - "rtmin", - "rtmin_adj", - "sampleId", - "sample description", - "samples", - "scaling_factor", - "single_1", - "single_2", - "single_3", - "single_4", - "size", - "sn1", - "sn2", - "sn3", - "sn4", - "sn_chains", - "statistic", - "sumComposition", - "systematicCompoundId", - "term", - "term_data", - "total_double", - "total_OH", - "total_single", - "tube", - "tube label", - "variable", - "weights" -)) - -#' Configure Database Access -#' -#' Setup and test connection to Calico databases -#' -#' @param standards_db type of standards database to connect to: -#' \itemize{ -#' \item{standard: just standards} -#' \item{extended: standards, external libraries and predicted spectra} -#' } -#' -#' @return a list containing connections to the standards and systematic -#' compounds database -#' -#' @export -configure_db_access <- function(standards_db = "standard") { - - # load metabolite MySQL db password - db_password <- Sys.getenv("metabolite_db") - if (db_password == "") { - warning("metabolite_db password missing from .Renviron - please add it. i.e., metabolite_db=xxxxx") - } - - standard_dbs <- tibble::tribble( - ~name, ~db_name, - "standard", "mass_spec_standards", - "extended", "mass_spec_standards_extended" - ) - - if (any(class(standards_db) != "character") || length(standards_db) != 1) { - stop("\"standards_db\" must be a length 1 character vector - valid entries are: ", paste(standard_dbs$name, collapse = ", ")) - } - - if (!(standards_db %in% standard_dbs$name)) { - stop( - standards_db, - " is not a valid entry for \"standards_db\" valid entries are: ", - paste(standard_dbs$name, collapse = ", ") - ) - } - standard_db_name <- standard_dbs$db_name[standard_dbs$name == standards_db] - - # connect to metabolite MySQL databases - # (this is behind the firewall) - mass_spec_standards_con <- DBI::dbConnect( - RMySQL::MySQL(), - user = "admin", - password = db_password, - dbname = standard_db_name, - host = "104.196.252.153" - ) - - systematic_compounds_con <- DBI::dbConnect( - RMySQL::MySQL(), - user = "admin", - password = db_password, - dbname = "systematic_compounds", - host = "104.196.252.153" - ) - - # present connections - list( - mass_spec_standards_con = mass_spec_standards_con, - systematic_compounds_con = systematic_compounds_con - ) -} - -#' Test Mzroll List +#' Test MzRoll List #' #' @param mzroll_list output of \link{process_mzroll} or #' \link{process_mzroll_multi} #' #' \itemize{ -#' \item{peakgroups: one row per unique analyte (defined by a +#' \item{features: one row per unique analyte (defined by a #' unique groupId)}, #' \item{samples: one row per unique sample (defined by a unique sampleId)}, -#' \item{peaks: one row per peak (samples x peakgroups)} +#' \item{measurements: one row per peak (samples x peakgroups)} #' } #' -test_mzroll_list <- function(mzroll_list) { - if (!("list" %in% class(mzroll_list))) { - stop("\"mzroll_list\" must be a list") - } - - required_tables <- c("peakgroups", "samples", "peaks") - provided_tables <- names(mzroll_list) - - missing_required_tables <- setdiff(required_tables, provided_tables) - - if (length(missing_required_tables) != 0) { - stop( - "missing required tables: ", - paste(missing_required_tables, collapse = ", "), - "; generate mzroll_list with \"process_mzroll\"" - ) - } - extra_provided_tables <- setdiff(provided_tables, required_tables) - if (length(missing_required_tables) != 0) { - warning( - "extra tables present in mzroll_list: ", - paste(extra_provided_tables, collapse = ", ") +#' @inheritParams romic:::check_triple_omic +#' +test_mzroll_list <- function(mzroll_list, fast_check = TRUE) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + + # check that mzroll_list is a valid tomic + + romic::check_tomic(mzroll_list, fast_check) + + # check for claman-specific conventions + + if (mzroll_list$design$feature_pk != "groupId") { + stop(glue::glue( + "The mzroll feature primary key was {mzroll_list$design$feature_pk} + this value must be \"groupId\"")) + } + + if (mzroll_list$design$sample_pk != "sampleId") { + stop(glue::glue( + "The mzroll feature primary key was {mzroll_list$design$sample_pk} + this value must be \"sampleId\"")) + } + + # check for required variables + + check_required_variables( + mzroll_list, + "features", + c( + "groupId", + "compoundName", + "smiles", + "adductName", + "tagString", + "mz", + "rt", + "compoundDB", + "searchTableName", + "label" + ) ) - } - - # this overlaps with some functions in clamr/clamdb - might want to pull - # them out as general utils - required_fields <- tibble::tribble( - ~tbl, ~variable, - "peaks", "groupId", - "peaks", "sampleId", - "peakgroups", "groupId", - "samples", "sampleId" + + check_required_variables( + mzroll_list, + "measurements", + c("groupId", "sampleId", "log2_abundance") ) - - included_variables <- tibble::tibble(tbl = names(mzroll_list)) %>% - dplyr::mutate(variable = purrr::map(mzroll_list, colnames)) %>% - tidyr::unnest(variable) - - absent_required_fields <- required_fields %>% - dplyr::anti_join(included_variables, by = c("tbl", "variable")) %>% - dplyr::mutate(message = glue::glue( - "required variable missing from {tbl}: {variable}" - )) - - if (nrow(absent_required_fields) != 0) { - stop(paste(absent_required_fields$message, collapse = "\n")) - } + + # the only required field is sampleId, other field will likely + # be discarded as samples are merged during normalization + check_required_variables(mzroll_list, "samples", c("sampleId", "name")) + + # check for invalid variables + + checkmate::assertClass(mzroll_list[["features"]]$groupId, "factor") + checkmate::assertClass(mzroll_list[["samples"]]$sampleId, "factor") + + unnamed_samples <- mzroll_list$samples %>% dplyr::filter(is.na(name)) + if (nrow(unnamed_samples) > 0){ + stop(glue::glue( + "{nrow(unnamed_samples)} samples were unnamed. All samples must be named" + )) + } + + duplicated_names <- mzroll_list$samples %>% + dplyr::group_by(name) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::distinct(name) + + if (nrow(duplicated_names) > 0){ + stop(glue::glue("{nrow(duplicated_names)} sample names were duplicated")) + } + + return(invisible(0)) + } -#' Paths for X0106 examples -#' -#' methods and paths to X0106 example mzroll datasets. -#' -#' @examples -#' \dontrun{ -#' X0106_paths <- tibble::tribble( -#' ~method_tag, ~mzroll_db_path, -#' "metabolites (neg)", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M001-peakdetector.mzrollDB", -#' "metabolites (pos) - 1", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M002-peakdetector.mzrollDB", -#' "metabolites (pos) - 2", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M002-peakdetector2.mzrollDB", -#' "lipids (neg) - 1", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-1.mzrollDB", -#' "lipids (neg) - 2", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-2.mzrollDB", -#' "lipids (neg) - 3", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-2b-AA-BDP-CL.mzrollDB", -#' "lipids (neg) - 4", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-2b-AA-BDP-CL+Stds.mzrollDB", -#' "lipids (pos) - 1", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M005A-peakdetector.mzrollDB", -#' "lipids (pos) - 2", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M005A-peakdetectorMGDGTGStds.mzrollDB" -#' ) -#' -#' usethis::use_data(X0106_paths, overwrite = TRUE) -#' } -"X0106_paths" - -#' Flatten mzroll list to single table -#' -#' Flatten mzroll list to single table using groupId and sampleId -#' -#' @param mzroll_list output of process_mzrollDB functions -#' \itemize{ -#' \item{peakgroups: one row per unique analyte (defined by a unique -#' groupId)}, -#' \item{samples: one row per unique sample (defined by a unique sampleId)}, -#' \item{peaks: one row per peak (samples x peakgroups)} -#' } -#' -#' @return mzroll_list (process_mzrollDB functions output) reformatted as a -#' single table -#' -#' @export -mzroll_table <- function(mzroll_list) { - - # guards - if (class(mzroll_list) != "list") { - stop("input \"mzroll_list\" must be a list.") - } - if (length(mzroll_list) != 3) { - stop("input \"mzroll_list\" must contain exactly 3 entries.") - } - if (names(mzroll_list[1]) != "peakgroups") { - stop("input \"mzroll_list\" first element must be named \"peakgroups\"") - } - if (names(mzroll_list[2]) != "samples") { - stop("input \"mzroll_list\" second element must be named \"samples\"") - } - if (names(mzroll_list[3]) != "peaks") { - stop("input \"mzroll_list\" third element must be named \"peaks\"") - } - if (!"groupId" %in% colnames(mzroll_list[[1]])) { - stop("peakgroups table missing required column \"groupId\"") - } - if (!"groupId" %in% colnames(mzroll_list[[3]])) { - stop("peaks table missing required column \"groupId\"") - } - if (!"sampleId" %in% colnames(mzroll_list[[2]])) { - stop("samples table missing required column \"sampleId\"") - } - if (!"sampleId" %in% colnames(mzroll_list[[3]])) { - stop("peaks table missing required column \"sampleId\"") - } - - mzroll_table <- dplyr::inner_join( - mzroll_list$peakgroups, - mzroll_list$peaks, - by = c("groupId") - ) %>% - dplyr::inner_join(mzroll_list$samples, by = c("sampleId")) +check_required_variables <- function(mzroll_list, table, required_variables) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + checkmate::assertChoice(table, c("features", "samples", "measurements")) + checkmate::assertCharacter(required_variables) - mzroll_table + missing_measurements <- setdiff( + required_variables, + mzroll_list$design[[table]]$variable + ) + + if (length(missing_measurements) != 0) { + stop(glue::glue( + "required variable(s) {paste(missing_measurements, collapse = ', ')} + missing from {table} + ")) + } + + return (invisible(0)) } -#' Generate relevant lipid components from compound name -#' -#' Produce data frame with lipid components from name -#' -#' @param compound_name vector of compound names -#' -#' @return compound_name vector expanded to large table, with structural -#' components described as their own columns. -#' -#' @export -lipid_components <- function(compound_name) { - compound_components <- tibble::tibble("compoundName" = compound_name) %>% - # general compound information - dplyr::mutate( - lipidClass = stringr::str_extract(compoundName, "^.*(?=\\()") - ) %>% - dplyr::mutate( - compound_name_adduct = stringr::str_extract(compoundName, "(?<=\\) ).*$") - ) %>% - dplyr::mutate( - plasmalogen_type = stringr::str_extract(compoundName, "[op]-(?=[0-9]+:)") - ) %>% - dplyr::mutate(lipidClass_o_p = ifelse( - is.na(plasmalogen_type), - lipidClass, - paste0(plasmalogen_type, lipidClass) - )) %>% - dplyr::mutate(lipidClass_plas = ifelse( - is.na(plasmalogen_type), - lipidClass, - paste0("plas-", lipidClass) - )) %>% - # Parse all chains. Also retrieves number of hydroxyl groups on each - # FA (m=1, d=2, t=3), if available - dplyr::mutate(sn_chains = stringr::str_extract_all( - compoundName, - "(?<=[\\(/_])[A-Za-z]?-?[0-9]+:[0-9]+;?O?[0-9]?(?=[\\)/_])" - )) %>% - dplyr::mutate(num_sn_chains = sapply(sn_chains, function(x) { - length(x) - })) %>% - dplyr::mutate(num_unique_sn_chains = sapply(sn_chains, function(x) { - x %>% - unique() %>% - length() - })) %>% - # sn1 - dplyr::mutate(sn1 = sapply(sn_chains, function(x) { - x[1] - })) %>% - dplyr::mutate(FA1 = stringr::str_extract(sn1, "[0-9]+:[0-9]+")) %>% - dplyr::mutate(single_1 = as.numeric( - stringr::str_extract(FA1, ".*(?=:)") - )) %>% - dplyr::mutate(single_1 = ifelse(is.na(single_1), 0, single_1)) %>% - dplyr::mutate(double_1 = as.numeric( - stringr::str_extract(FA1, "(?<=:).*") - )) %>% - dplyr::mutate(double_1 = ifelse(is.na(double_1), 0, double_1)) %>% - dplyr::mutate(double_1 = ifelse( - (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), - double_1 + 1, - double_1 - )) %>% - dplyr::mutate(OH_1 = dplyr::case_when( - grepl("^m", sn1) ~ 1, - grepl("^d", sn1) ~ 2, - grepl("^t", sn1) ~ 3, - grepl(";O4", sn1) ~ 4, - grepl(";O3", sn1) ~ 3, - grepl(";O2", sn1) ~ 2, - grepl(";O1", sn1) ~ 1, - grepl(";O", sn1) ~ 1, - TRUE ~ 0 - )) %>% - # sn2 - dplyr::mutate(sn2 = sapply(sn_chains, function(x) { - x[2] - })) %>% - dplyr::mutate(FA2 = stringr::str_extract(sn2, "[0-9]+:[0-9]+")) %>% - dplyr::mutate(single_2 = as.numeric( - stringr::str_extract(FA2, ".*(?=:)") - )) %>% - dplyr::mutate(single_2 = ifelse(is.na(single_2), 0, single_2)) %>% - dplyr::mutate(double_2 = as.numeric( - stringr::str_extract(FA2, "(?<=:).*") - )) %>% - dplyr::mutate(double_2 = ifelse(is.na(double_2), 0, double_2)) %>% - dplyr::mutate(OH_2 = dplyr::case_when( - grepl("^m", sn2) ~ 1, - grepl("^d", sn2) ~ 2, - grepl("^t", sn2) ~ 3, - grepl(";O4", sn2) ~ 4, - grepl(";O3", sn2) ~ 3, - grepl(";O2", sn2) ~ 2, - grepl(";O1", sn2) ~ 1, - grepl(";O", sn2) ~ 1, - TRUE ~ 0 - )) %>% - # sn3 (for TGs) - dplyr::mutate(sn3 = sapply(sn_chains, function(x) { - x[3] - })) %>% - dplyr::mutate(FA3 = stringr::str_extract(sn3, "[0-9]+:[0-9]+")) %>% - dplyr::mutate(single_3 = as.numeric( - stringr::str_extract(FA3, ".*(?=:)") - )) %>% - dplyr::mutate(single_3 = ifelse(is.na(single_3), 0, single_3)) %>% - dplyr::mutate(double_3 = as.numeric( - stringr::str_extract(FA3, "(?<=:).*") - )) %>% - dplyr::mutate(double_3 = ifelse(is.na(double_3), 0, double_3)) %>% - dplyr::mutate(OH_3 = dplyr::case_when( - grepl("^m", sn3) ~ 1, - grepl("^d", sn3) ~ 2, - grepl("^t", sn3) ~ 3, - grepl(";O4", sn3) ~ 4, - grepl(";O3", sn3) ~ 3, - grepl(";O2", sn3) ~ 2, - grepl(";O1", sn3) ~ 1, - grepl(";O", sn3) ~ 1, - TRUE ~ 0 - )) %>% - # sn4 (for Cardiolipins CLs) - dplyr::mutate(sn4 = sapply(sn_chains, function(x) { - x[4] - })) %>% - dplyr::mutate(FA4 = stringr::str_extract(sn4, "[0-9]+:[0-9]+")) %>% - dplyr::mutate(single_4 = as.numeric( - stringr::str_extract(FA4, ".*(?=:)") - )) %>% - dplyr::mutate(single_4 = ifelse(is.na(single_4), 0, single_4)) %>% - dplyr::mutate(double_4 = as.numeric( - stringr::str_extract(FA4, "(?<=:).*") - )) %>% - dplyr::mutate(double_4 = ifelse(is.na(double_4), 0, double_4)) %>% - dplyr::mutate(OH_4 = dplyr::case_when( - grepl("^m", sn4) ~ 1, - grepl("^d", sn4) ~ 2, - grepl("^t", sn4) ~ 3, - grepl(";O4", sn4) ~ 4, - grepl(";O3", sn4) ~ 3, - grepl(";O2", sn4) ~ 2, - grepl(";O1", sn4) ~ 1, - grepl(";O", sn4) ~ 1, - TRUE ~ 0 - )) %>% - # finally, put together all info from chains to describe summed - # composition. sometimes, the compound is already reported as a summed - # composition. In that case, return the original name. - dplyr::mutate(total_single = ifelse( - num_sn_chains == 0, - as.numeric(stringr::str_extract( - compoundName, - "(?<=\\()[A-Za-z]?-?[0-9]+(?=:[0-9]+,)" - )), - (single_1 + single_2 + single_3 + single_4) - )) %>% - dplyr::mutate(total_single = ifelse( - is.na(total_single), - 0, - total_single - )) %>% - dplyr::mutate(total_double = ifelse( - num_sn_chains == 0, - as.numeric(stringr::str_extract(compoundName, "(?<=:)[0-9]+(?=,)")), - (double_1 + double_2 + double_3 + double_4) - )) %>% - dplyr::mutate(total_double = ifelse( - is.na(total_double), - 0, - total_double - )) %>% - dplyr::mutate(total_OH = ifelse( - num_sn_chains == 0, - as.numeric(stringr::str_extract(compoundName, "(?<=,)[0-9]+(?=-)")), - OH_1 + OH_2 + OH_3 + OH_4 - )) %>% - dplyr::mutate(total_OH = ifelse(is.na(total_OH), 0, total_OH)) %>% - dplyr::mutate(sumComposition = dplyr::case_when( - num_sn_chains > 0 & total_OH == 0 ~ - glue::glue("{class}({plas}{single}:{double})", - class = lipidClass, - plas = ifelse(is.na(plasmalogen_type), "", plasmalogen_type), - single = total_single, - double = ifelse( - (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), - (total_double - 1), - total_double - ) - ), - num_sn_chains > 0 & total_OH == 1 ~ - glue::glue( - "{class}({plas}{single}:{double};O)", - class = lipidClass, - plas = ifelse(is.na(plasmalogen_type), "", plasmalogen_type), - single = total_single, - double = ifelse( - (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), - (total_double - 1), - total_double - ) - ), - num_sn_chains > 0 & total_OH > 1 ~ - glue::glue( - "{class}({plas}{single}:{double};O{num_OH})", - class = lipidClass, - plas = ifelse(is.na(plasmalogen_type), "", plasmalogen_type), - single = total_single, - double = ifelse( - (!is.na(plasmalogen_type) & plasmalogen_type == "p-"), - (total_double - 1), - total_double - ), - num_OH = total_OH - ), - num_sn_chains == 0 ~ compoundName - )) %>% - dplyr::mutate(etherPlasmalogenCompoundName = ifelse( - is.na(plasmalogen_type), - compoundName, - glue::glue( - "{class}({single}:{double}e/{sn2_chain})", - class = lipidClass, - single = single_1, - double = double_1, - sn2_chain = FA2 - ) - )) %>% - dplyr::mutate(etherPlasmalogenSumComposition = ifelse( - is.na(plasmalogen_type), - sumComposition, - glue::glue( - "{class}({single}:{double}e)", - class = lipidClass, - single = total_single, - double = total_double - ) - )) - compound_components +#' Util - Pretty Knitr Head +#' +#' @param tbl a data.frame or tibble +#' @param nrows the max number of rows to show +#' @inheritParams knitr::kable +#' +#' @return an html knitr table +#' +#' @export +#' +#' @examples +#' util_pretty_khead(mtcars, nrows = 5, caption = "cars!") +util_pretty_khead <- function(tbl, nrows = 10, caption = NULL) { + + checkmate::assertDataFrame(tbl) + checkmate::assertNumber(nrows, lower = 1) + + tbl %>% + dplyr::slice(1:nrows) %>% + knitr::kable(caption = caption) %>% + kableExtra::kable_styling( + position = "left", + bootstrap_options = "striped" + ) } diff --git a/R/visualizations.R b/R/visualizations.R index 0cef54b..23c8e3b 100644 --- a/R/visualizations.R +++ b/R/visualizations.R @@ -1,371 +1,95 @@ -#' Omics Heatmap -#' -#' Generate a basic heatmap of all of your metabolomic and lipidomic data +#' Plot Barplots #' #' @inheritParams test_mzroll_list -#' @param feature.var variable from "peakgroups" to use as a unique feature -#' label. -#' @param sample.var variable from "samples" to use as a unique sample label. -#' @param value.var which variable in "peaks" to use for quantification. -#' @param cluster_dim dimensions to cluster (row, column, or both) -#' @param change_threshold values with a more extreme absolute change will be -#' thresholded to this value. -#' @param plot_type plotly (for interactivity) or grob (for a static ggplot) -#' -#' @examples -#' library(dplyr) -#' mzroll_list <- authutils::get_clamr_assets("X0083-M001A.mzrollDB") %>% -#' process_mzroll( -#' standard_databases = NULL, -#' method_tag = "M001A", -#' only_identified = TRUE -#' ) %>% -#' floor_peaks(12) -#' -#' plot_heatmap( -#' mzroll_list, -#' sample.var = "sampleId", -#' feature.var = "peak_label", -#' change_threshold = 5, -#' cluster_dim = "rows", -#' plot_type = "grob" -#' ) -#' \dontrun{ -#' plot_heatmap( -#' mzroll_list, -#' sample.var = "sampleId", -#' feature.var = "peak_label", -#' change_threshold = 5, -#' cluster_dim = "rows" -#' ) -#' } -#' -#' @export -plot_heatmap <- function(mzroll_list, - feature.var = "groupId", - sample.var = "name", - value.var = "centered_log2_abundance", - cluster_dim = "both", - change_threshold = 3, - plot_type = "plotly") { - test_mzroll_list(mzroll_list) - - checkmate::assertString(feature.var) - checkmate::assertString(sample.var) - checkmate::assertString(value.var) - - stopifnot(feature.var %in% colnames(mzroll_list$peakgroups)) - stopifnot(sample.var %in% colnames(mzroll_list$samples)) - stopifnot(value.var %in% colnames(mzroll_list$peaks)) - - stopifnot( - class(cluster_dim) == "character", - cluster_dim %in% c("rows", "columns", "both") - ) - stopifnot( - class(change_threshold) %in% c("numeric", "integer"), - length(change_threshold) == 1, - change_threshold > 0 - ) - stopifnot( - class(plot_type) == "character", - length(plot_type) == 1, - plot_type %in% c("plotly", "grob") - ) - - # add additional labels - - augmented_peaks <- mzroll_list$peaks %>% - # add labels if needed - dplyr::left_join(mzroll_list$samples %>% - dplyr::select(!!quo(c("sampleId", sample.var))), - by = "sampleId" - ) %>% - dplyr::left_join(mzroll_list$peakgroups %>% - dplyr::select(!!quo(c("groupId", feature.var))), - by = "groupId" - ) - - # format list to matrix - - # convert groupId and sampleId to factors so they are ordered appropriately - - cluster_orders <- hclust_order( - mzroll_list$peaks, "groupId", "sampleId", - value.var, cluster_dim - ) - - # save classes of sampleId and groupId so appropriate class coercion occurs - class(cluster_orders$rows) <- class(mzroll_list$peakgroups$groupId) - class(cluster_orders$columns) <- class(mzroll_list$samples$sampleId) - - # order rows and columns - - distinct_features <- augmented_peaks %>% - dplyr::distinct(groupId, !!rlang::sym(feature.var)) - - if (cluster_dim == "columns") { - - # order by factor or alpha-numerically - - if (class(distinct_features[[feature.var]]) %in% c("factor", "ordered")) { - # retain previous ordering - - ordered_distinct_features <- distinct_features %>% - dplyr::arrange(!!rlang::sym(feature.var)) - } else { - ordered_distinct_features <- distinct_features %>% - dplyr::arrange(!!rlang::sym(feature.var)) - } - - ordered_distinct_features <- ordered_distinct_features %>% - dplyr::mutate(ordered_groupId = factor(groupId, levels = groupId)) - } else { - - # order with hclust - - ordered_distinct_features <- distinct_features %>% - dplyr::left_join(tibble::tibble(groupId = cluster_orders$rows) %>% - dplyr::mutate(order = 1:dplyr::n()), - by = "groupId" - ) %>% - dplyr::arrange(order) %>% - dplyr::mutate(ordered_groupId = factor(groupId, levels = groupId)) - } - - distinct_samples <- augmented_peaks %>% - dplyr::distinct(sampleId, !!rlang::sym(sample.var)) - - if (cluster_dim == "rows") { - - # order by factor or alpha-numerically - - if (class(distinct_samples[[sample.var]]) %in% c("factor", "ordered")) { - # retain previous ordering - - ordered_distinct_samples <- distinct_samples %>% - dplyr::arrange(!!rlang::sym(sample.var)) - } else { - ordered_distinct_samples <- distinct_samples %>% - dplyr::arrange(!!rlang::sym(sample.var)) - } - - ordered_distinct_samples <- ordered_distinct_samples %>% - dplyr::mutate(ordered_sampleId = factor(sampleId, levels = sampleId)) - } else { - - # order with hclust - - ordered_distinct_samples <- distinct_samples %>% - dplyr::left_join(tibble::tibble(sampleId = cluster_orders$columns) %>% - dplyr::mutate(order = 1:dplyr::n()), - by = "sampleId" - ) %>% - dplyr::arrange(order) %>% - dplyr::mutate(ordered_sampleId = factor(sampleId, levels = sampleId)) - } - - # update labels - ordered_distinct_features <- ordered_distinct_features %>% - dplyr::mutate(label = stringr::str_wrap( - as.character(!!rlang::sym(feature.var)), - width = 40, - exdent = 2 - )) - - ordered_distinct_samples <- ordered_distinct_samples %>% - dplyr::mutate(label = stringr::str_wrap( - as.character(!!rlang::sym(sample.var)), - width = 40, - exdent = 2 - )) - - # setup abundance values - ordered_augmented_peaks <- augmented_peaks %>% - # order all rows and columns - dplyr::left_join(ordered_distinct_features %>% - dplyr::select(groupId, ordered_groupId), - by = "groupId" - ) %>% - dplyr::left_join(ordered_distinct_samples %>% - dplyr::select(sampleId, ordered_sampleId), - by = "sampleId" - ) %>% - # threshold max - dplyr::mutate(abundance = pmax( - pmin( - !!rlang::sym(value.var), - change_threshold - ), - -1 * change_threshold - )) - - heatmap_theme <- theme_minimal() + - theme( - text = element_text(size = 16, color = "black"), - title = element_text(size = 20, color = "black"), - axis.text.x = element_text(angle = 60, hjust = 1), - axis.text.y = element_text(size = 4), - axis.title.y = element_blank(), - strip.text = element_text(size = 18), - legend.position = "top", - strip.background = element_rect(fill = "gray80") - ) - - heatmap_plot <- ggplot( - ordered_augmented_peaks, - aes(x = ordered_sampleId, y = ordered_groupId, fill = abundance) - ) + - geom_tile() + - scale_fill_gradient2( - expression(log[2] ~ abundance), - low = "steelblue1", - mid = "black", - high = "yellow", - midpoint = 0 - ) + - scale_x_discrete(sample.var, - breaks = ordered_distinct_samples$ordered_sampleId, - labels = ordered_distinct_samples$label - ) + - scale_y_discrete( - breaks = ordered_distinct_features$ordered_groupId, - labels = ordered_distinct_features$label, - position = "right" - ) + - expand_limits(fill = c(-1 * change_threshold, change_threshold)) + - heatmap_theme - - if (plot_type == "grob") { - return(heatmap_plot) - } else if (plot_type == "plotly") { - suppressWarnings( - plotly::ggplotly(heatmap_plot) %>% - plotly::layout(margin = 0) - ) - } else { - stop("undefined plotting type logic") - } -} - -#' Hierarchical clustering order -#' -#' Format and hierarchically cluster a data.frame. If hclust could not -#' normally be produced (usually because no samples are in common for a -#' feature) pad the matrix with zeros and still calculate the distance -#' -#' @param df data.frame to cluster -#' @param feature_pk variable uniquely defining a row -#' @param sample_pk variable uniquely definining a sample -#' @param measurement_var measurement variables -#' @param cluster_dim rows, columns, or both +#' @param groupIds groupIds for compounds to plot +#' @param grouping_vars variables to use for ordering samples and calculating +#' standard errors +#' @inheritParams diffex_mzroll +#' @param fill_var variable to use for filling bars #' -#' @return a list containing a hierarchically clustered set of rows -#' and/or columns +#' @examples +#' +#' plot_barplot( +#' mzroll_list = nplug_mzroll_normalized, +#' groupIds = 1:4, +#' grouping_vars = c("limitation", "DR"), +#' value_var = "normalized_log2_abundance", +#' fill_var = "limitation" +#' ) #' #' @export -hclust_order <- function(df, - feature_pk, - sample_pk, - measurement_var, - cluster_dim) { - stopifnot("data.frame" %in% class(df)) - stopifnot(length(feature_pk) == 1, feature_pk %in% colnames(df)) - stopifnot(length(sample_pk) == 1, sample_pk %in% colnames(df)) - stopifnot(length(measurement_var) == 1, measurement_var %in% colnames(df)) - stopifnot( - length(cluster_dim) == 1, - cluster_dim %in% c("rows", "columns", "both") - ) - - quant_matrix <- df %>% - dplyr::select_(.dots = stats::setNames( - list(feature_pk, sample_pk, measurement_var), - c("feature_id", "sample_id", "quant") - )) %>% - reshape2::acast(feature_id ~ sample_id, value.var = "quant") - - output <- list() - - if (cluster_dim %in% c("rows", "both")) { - cluster_rows <- try( - stats::hclust(stats::dist(quant_matrix), method = "ward.D2"), - silent = TRUE - ) - - # if distance cannot be computed (because of missing values) pad with - # zeros and recalculate - if (class(cluster_rows) == "try-error") { - pad_matrix <- matrix(0, ncol = 2, nrow = nrow(quant_matrix)) - colnames(pad_matrix) <- c("pad1", "pad2") - quant_matrix_pad <- cbind(quant_matrix, pad_matrix) - - cluster_rows <- stats::hclust( - stats::dist(quant_matrix_pad), - method = "ward.D2" - ) - } - - output$rows <- rownames(quant_matrix)[cluster_rows$order] - } - - if (cluster_dim %in% c("columns", "both")) { - cluster_cols <- try( - stats::hclust(stats::dist(t(quant_matrix)), method = "ward.D2"), - silent = TRUE +plot_barplot <- function( + mzroll_list, + groupIds, + grouping_vars, + value_var, + fill_var = NULL + ) { + + checkmate::assertClass(mzroll_list, "tomic") + checkmate::assertClass(mzroll_list, "mzroll") + stopifnot(class(groupIds) %in% c("numeric", "integer", "factor", "ordered")) + checkmate::assertCharacter(grouping_vars) + purrr::walk( + grouping_vars, + checkmate::assertChoice, + mzroll_list$design$samples$variable ) - - # if distance cannot be computed (because of missing values) pad with - # zeros and recalculate - if (class(cluster_cols) == "try-error") { - pad_matrix <- matrix(0, ncol = 2, nrow = ncol(quant_matrix)) - colnames(pad_matrix) <- c("pad1", "pad2") - quant_matrix_pad <- cbind(t(quant_matrix), pad_matrix) - - cluster_cols <- stats::hclust( - stats::dist(quant_matrix_pad), - method = "ward.D2" - ) - } - - output$columns <- colnames(quant_matrix)[cluster_cols$order] + checkmate::assertChoice(value_var, mzroll_list$design$measurements$variable) + checkmate::assertClass(mzroll_list$measurements[[value_var]], "numeric") + if (!is.null(fill_var)) { + checkmate::assertChoice(fill_var, mzroll_list$design$samples$variable) } - - output -} - - -#' Plot Barplots -#' -#' @inheritParams test_mzroll_list -#' @param groupIds groupIds for compounds to plot -#' -#' @export -plot_barplot <- function(mzroll_list, groupIds) { - stopifnot(class(groupIds) %in% c("numeric", "integer")) - - reduced_groups <- mzroll_list$peakgroups %>% + + reduced_groups <- mzroll_list$features %>% dplyr::filter(groupId %in% groupIds) - reduced_peaks <- mzroll_list$peaks %>% + ordered_samples <- mzroll_list$samples %>% + dplyr::arrange(!!!rlang::syms(grouping_vars)) %>% + dplyr::mutate(sample_label = paste(!!!rlang::syms(grouping_vars)), + sample_label = factor( + sample_label, + levels = unique(sample_label) + )) + + reduced_peaks <- mzroll_list$measurements %>% dplyr::inner_join(reduced_groups, by = "groupId") %>% - dplyr::left_join(mzroll_list$samples, by = "sampleId") - + dplyr::left_join(ordered_samples, by = "sampleId") + standard_errors <- reduced_peaks %>% - dplyr::group_by(groupId, peak_label, `sample description`) %>% + dplyr::group_by(groupId, peak_label, sample_label) %>% dplyr::summarize( abund_mean = mean(log2_abundance), - abund_se = stats::sd(log2_abundance) / sqrt(dplyr::n()) - ) %>% - dplyr::ungroup() + abund_se = stats::sd(log2_abundance) / sqrt(dplyr::n()), + .groups = "drop" + ) + + if (!is.null(fill_var)) { + standard_errors <- standard_errors %>% + dplyr::left_join( + ordered_samples %>% + dplyr::select(sample_label, !!rlang::sym(fill_var)), + by = "sample_label" + ) + } - ggplot(standard_errors, aes(x = `sample description`)) + + if (!is.null(fill_var)) { + grob <- ggplot(standard_errors, aes( + x = sample_label, + fill = !!rlang::sym(fill_var) + )) + } else { + grob <- ggplot(standard_errors, aes(x = sample_label)) + } + + grob + geom_col(aes(y = 2^abund_mean)) + - geom_errorbar(aes( - ymin = 2^(abund_mean - abund_se), - ymax = 2^(abund_mean + abund_se) - )) + - facet_wrap(~peak_label, scales = "free_y") + + geom_errorbar( + data = standard_errors %>% + dplyr::filter(!is.na(abund_se)), + aes(ymin = 2^(abund_mean - abund_se), ymax = 2^(abund_mean + abund_se)) + ) + + facet_wrap(~ peak_label, scales = "free_y") + scale_x_discrete("Sample description") + scale_y_continuous(expression(IC %+-% "1SE")) + theme_bw() + diff --git a/README.md b/README.md index be97137..4ecfa96 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ After installing this package, the package functions can be accessed by just loa library(claman) ``` -Once calicomics is installed, run package vignette(s): +Once claman is installed, run package vignette(s): ```r vignette(package = "claman", topic = "metabolomics_example") diff --git a/assets/data/boer2010.Rds b/assets/data/boer2010.Rds new file mode 100644 index 0000000..70b360d Binary files /dev/null and b/assets/data/boer2010.Rds differ diff --git a/data-raw/nplug.R b/data-raw/nplug.R new file mode 100644 index 0000000..3ce849e --- /dev/null +++ b/data-raw/nplug.R @@ -0,0 +1,247 @@ +## code to prepare `NPLUG` dataset goes here + +library(dplyr) +library(claman) + +# save the raw data file to the top-level data directory to future-proof +# against journal URL changes. + +raw_data_url <- "https://www.molbiolcell.org/doi/suppl/10.1091/mbc.e09-07-0597/suppl_file/data_set_2.txt" +local_save <- file.path("assets", "data", "boer2010.Rds") + +if (file.exists(local_save)) { + nplug_raw <- readRDS(local_save) +} else { + nplug_raw <- readr::read_tsv(raw_data_url) %>% + dplyr::rename(sample_name = X1) + + saveRDS(nplug_raw, local_save) +} + +# sample meta-data + +nplug_samples <- nplug_raw %>% + dplyr::select( + sample_name, + month = Month, + replicate = Which, + DR = Gr, + limitation = Nutr, + exp_ref = `Exp/Ref`, + extraction = Method + ) %>% + dplyr::mutate( + month = month.abb[month], + replicate = toupper(replicate) + ) %>% + dplyr::mutate(limitation = ifelse( + limitation %in% c("R", "R2", "R3"), + "PO4", + limitation)) + +# use DR when finding distinct experimental conditions +distinct_exp_conditions <- nplug_samples %>% + filter(exp_ref == "exp") %>% + distinct(month, limitation, DR, extraction) %>% + mutate(condition = 1:n()) + +# ignore DR when finding distinct reference conditions +distinct_ref_conditions <- nplug_samples %>% + filter(exp_ref == "ref") %>% + distinct(month, extraction) %>% + mutate(condition = max(distinct_exp_conditions$condition) + 1:n()) + +nplug_samples <- bind_rows( + nplug_samples %>% + filter(exp_ref == "exp") %>% + left_join( + distinct_exp_conditions, + by = c("month", "limitation", "DR", "extraction") + ), + nplug_samples %>% + filter(exp_ref == "ref") %>% + mutate(DR = 0.05) %>% + left_join( + distinct_ref_conditions, + by = c("month", "extraction") + ) +) %>% + left_join( + distinct_ref_conditions %>% + rename(reference = condition), + by = c("month", "extraction") + ) + +usethis::use_data(nplug_samples, overwrite = TRUE) + +# create mzroll file + +nplug_raw <- nplug_raw %>% + dplyr::mutate(sampleId = 1:n()) + +# setup measurements +abundances <- nplug_raw %>% + dplyr::select(-c(sample_name:Method)) %>% + tidyr::gather(compoundName, peakAreaTop, -sampleId) + +peakgroups <- abundances %>% + dplyr::distinct(compoundName) %>% + dplyr::mutate( + groupId = 1:n(), + # indicate that the peakgroup has been validated + label = "g" + ) + +peaks <- abundances %>% + dplyr::inner_join( + peakgroups %>% + dplyr::select(groupId, compoundName), + by = "compoundName" + ) %>% + dplyr::select(-compoundName) %>% + dplyr::mutate(peakId = 1:dplyr::n()) + +mzroll_data <- list( + samples = nplug_raw %>% dplyr::select(sampleId, name = sample_name), + peakgroups = peakgroups, + peaks = peaks +) %>% + clamr::expand_with_mzroll_defaults() + +mzroll_db_path <- file.path("inst", "extdata", "nplug.mzrollDB") +if (file.exists(mzroll_db_path)) { + file.remove(mzroll_db_path) +} + +mzroll_db_con <- DBI::dbConnect(RSQLite::SQLite(), mzroll_db_path) + +purrr::walk2( + names(mzroll_data), + mzroll_data, + DBI::dbCreateTable, + conn = mzroll_db_con +) + +purrr::walk2( + names(mzroll_data), + mzroll_data, + DBI::dbAppendTable, + conn = mzroll_db_con +) + +DBI::dbDisconnect(mzroll_db_con) + +# save compound metadata + +pathways <- list( + "Pentose Phosphate Pathway" = c("6-phospho-D-gluconate", "Ribose-P", + "D-erythrose-4-phosphate", "D-gluconate", + "D-glucono-delta-lactone-6-phosphate", + "D-sedoheptulose-7-phosphate", + "pyridoxine", "riboflavin"), + "Glycolysis" = c("1,3-diphopshateglycerate", "3-phosphoglycerate", + "D-glyceraldehdye-3-phosphate", "dihydroxy-acetone-phosphate", + "fructose-1,6-bisphosphate", "glycerate", "hexose-phosphate", + "pyruvate", "trehalose/sucrose", "trehalose-6-phosphate", + "Lactate"), + "Purine Synthesis" = c("adenosine", "cyclic-AMP", "dATP", "deoxyadenosine", + "deoxyguanosine", "guanine", "guanosine", + "hypoxanthine", "inosine", "thiamine", "xanthine", + "dGDP"), + "Pyrimidines" = c("CTP", "CDP", "UTP", "UDP", "OMP", "cytidine", "cytosine", + "dCTP", "dihydroorotate", "dTTP", "N-carbamoyl-L-aspartate", + "orotate", "UDP-D-glucose", "UDP-N-acetyl-glucosamine", + "uridine"), + "Amino Acids" = c("alanine", "arginine", "asparagine", "aspartate", "glutamate", + "glutamine", "glycine", "histidine", "leucine/isoleucine", + "lysine", "methionine", "phenylalanine", "proline", + "serine", "threonine", "tryptophan", "tyrosine", "valine"), + "Amino Acid Synthesis" = c("citrulline", "cystathionine", "histidinol", + "ornithine", "phenylpyruvate", "Arginino-succinate", + "Carbamyl phosphate", "Homoserine", "Hydroxyphenylpyruvate", + "N-acetyl-glutamate", "Prephenic acid", "PRPP"), + "Amino Acid Derived" = c("glutathione", "glutathione disulfide", + "N-acetyl-glucosamine-1-phosphate", "N-acetyl-glutamine", + "nicotinate", "pantothenate", "quinolinate", + "S-adenosyl-L-methionine", "Dimethylglycine"), + "TCA cycle" = c("acetyl-CoA", "a-ketoglutarate", "aconitate", "citrate", + "citrate/isocitrate", "fumarate", "isocitrate", + "malate", "succinate"), + "Energetics" = c("ATP", "ADP", "AMP", "GTP", "GDP", "GMP", "FAD", "FMN", + "NAD+", "NADH", "NADP+"), + "Lipid metabolism" = c("choline", "3-hydroxy-3-methylglutaryl-CoA", + "trans, trans-farnesyl diphosphate")) + +nplug_compounds <- purrr::map(names(pathways), + function(x){tibble::tibble(pathway = x, compoundName = pathways[[x]])}) %>% + dplyr::bind_rows() %>% + dplyr::select(compoundName, pathway) + +usethis::use_data(nplug_compounds, overwrite = TRUE) + +# create an annotated intermediate mzroll_list + +nplug_mzroll <- claman::process_mzroll(claman::nplug_mzroll()) +nplug_mzroll_augmented <- claman::merge_samples_tbl(nplug_mzroll, nplug_samples, "sample_name") +nplug_mzroll_augmented <- claman::merge_compounds_tbl(nplug_mzroll_augmented, nplug_compounds) + +usethis::use_data(nplug_mzroll_augmented, overwrite = TRUE) + +# mzroll processed to account for batch effects and summarized based on distinct biological conditions + +mzroll_list_distinct_conditions <- collapse_injections( + nplug_mzroll_augmented, + grouping_vars = "condition", + peak_quant_vars = c("log2_abundance", "centered_log2_abundance"), + collapse_fxn = "mean" +) + +mzroll_list_normalized <- normalize_peaks( + mzroll_list_distinct_conditions, + normalization_method = "reference sample", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = c("month", "extraction"), + reference_varname = "exp_ref", + reference_values = "ref" +) %>% + # having normalized by the common reference, we can re-center the data + # since slow-phosphate limited growth is not a biological reference. + romic::center_tomic(measurement_vars = "normalized_log2_abundance") + +final_processed_data <- mzroll_list_normalized %>% + # retain just experimental samples + romic::filter_tomic( + filter_type = "category", + filter_table = "samples", + filter_variable = "exp_ref", + filter_value = "exp" + ) %>% + # retain only filter extraction + romic::filter_tomic( + filter_type = "category", + filter_table = "samples", + filter_variable = "extraction", + filter_value = "filter" + ) + +# clean-up sample data +renamed_samples <- final_processed_data$samples %>% + select(sampleId, limitation, DR) %>% + mutate(name = glue::glue( + "{stringr::str_sub(limitation, 1, 1)}{round(DR,2)}" + )) %>% + group_by(name) %>% + mutate( + name = case_when(n() == 1 ~ name, + TRUE ~ paste0(name, "-", 1:n())) + ) %>% + ungroup() + +nplug_mzroll_normalized <- romic::update_tomic( + final_processed_data, + renamed_samples +) + +usethis::use_data(nplug_mzroll_normalized, overwrite = TRUE) + diff --git a/data/X0106_paths.rda b/data/X0106_paths.rda deleted file mode 100644 index 395b43b..0000000 Binary files a/data/X0106_paths.rda and /dev/null differ diff --git a/data/nplug_compounds.rda b/data/nplug_compounds.rda new file mode 100644 index 0000000..1b7dfd2 Binary files /dev/null and b/data/nplug_compounds.rda differ diff --git a/data/nplug_mzroll_augmented.rda b/data/nplug_mzroll_augmented.rda new file mode 100644 index 0000000..63c49e4 Binary files /dev/null and b/data/nplug_mzroll_augmented.rda differ diff --git a/data/nplug_mzroll_normalized.rda b/data/nplug_mzroll_normalized.rda new file mode 100644 index 0000000..3f30102 Binary files /dev/null and b/data/nplug_mzroll_normalized.rda differ diff --git a/data/nplug_samples.rda b/data/nplug_samples.rda new file mode 100644 index 0000000..6355757 Binary files /dev/null and b/data/nplug_samples.rda differ diff --git a/inst/extdata/nplug.mzrollDB b/inst/extdata/nplug.mzrollDB new file mode 100644 index 0000000..49cd5ad Binary files /dev/null and b/inst/extdata/nplug.mzrollDB differ diff --git a/man/X0106_paths.Rd b/man/X0106_paths.Rd deleted file mode 100644 index 84fab1e..0000000 --- a/man/X0106_paths.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\docType{data} -\name{X0106_paths} -\alias{X0106_paths} -\title{Paths for X0106 examples} -\format{ -An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 9 rows and 2 columns. -} -\usage{ -X0106_paths -} -\description{ -methods and paths to X0106 example mzroll datasets. -} -\examples{ -\dontrun{ -X0106_paths <- tibble::tribble( - ~method_tag, ~mzroll_db_path, - "metabolites (neg)", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M001-peakdetector.mzrollDB", - "metabolites (pos) - 1", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M002-peakdetector.mzrollDB", - "metabolites (pos) - 2", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M002-peakdetector2.mzrollDB", - "lipids (neg) - 1", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-1.mzrollDB", - "lipids (neg) - 2", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-2.mzrollDB", - "lipids (neg) - 3", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-2b-AA-BDP-CL.mzrollDB", - "lipids (neg) - 4", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M004A-2b-AA-BDP-CL+Stds.mzrollDB", - "lipids (pos) - 1", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M005A-peakdetector.mzrollDB", - "lipids (pos) - 2", "/tmp/mzkit_assets/X0106_mzrollDBs/X0106-M005A-peakdetectorMGDGTGStds.mzrollDB" -) - -usethis::use_data(X0106_paths, overwrite = TRUE) -} -} -\keyword{datasets} diff --git a/man/aggregate_mzroll_nest.Rd b/man/aggregate_mzroll_nest.Rd index e0012c7..353c4cf 100644 --- a/man/aggregate_mzroll_nest.Rd +++ b/man/aggregate_mzroll_nest.Rd @@ -4,11 +4,13 @@ \alias{aggregate_mzroll_nest} \title{Aggregate mzroll lists} \usage{ -aggregate_mzroll_nest(mzroll_list_nest) +aggregate_mzroll_nest(mzroll_list_nest, samples_tbl) } \arguments{ \item{mzroll_list_nest}{a nested list of mzroll_lists produced from \link{process_mzroll}.} + +\item{samples_tbl}{Table of sample metadata} } \value{ a single mzroll_list diff --git a/man/augment_samples_with_samplesheet.Rd b/man/augment_samples_with_samplesheet.Rd deleted file mode 100644 index d96ab21..0000000 --- a/man/augment_samples_with_samplesheet.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/import_mzroll.R -\name{augment_samples_with_samplesheet} -\alias{augment_samples_with_samplesheet} -\title{Augment Samples with Samplesheet} -\usage{ -augment_samples_with_samplesheet(samples, sample_sheet_list) -} -\arguments{ -\item{samples}{samples table generated within \link{process_mzroll}.} - -\item{sample_sheet_list}{list of googlesheets information describing -experimental design - generated with \link{import_sample_sheet}.} -} -\value{ -samples with added metadata from sample_sheet_list -Issue 297: This block was adjusted, and ultimately reverted - Make sure there are only 1-1 substring matches between samples im - dataset and sample sheet - -e. g. if samples are named "C_200_B.mzML" and "C_2.mzML", "C_2" - as an MS ID string will match to both samples. -To avoid this problem in this case, the MS ID string could be - "C_2." or "C_2.mzML" instead of "C_2". -} -\description{ -Augment Samples with Samplesheet -} diff --git a/man/calicomics.Rd b/man/calicomics.Rd index efecce4..229a27a 100644 --- a/man/calicomics.Rd +++ b/man/calicomics.Rd @@ -1,12 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/claman.R \docType{package} \name{calicomics} \alias{calicomics} -\title{\code{calicomics} package} +\title{\code{claman} package} \description{ -Calicomics can read .mzrollDB files created using MAVEN or Quahog into an -mzroll_list. These lists can then be modified through normalization, -signal flooring, and filtering. Differential abundance analysis of -metabolites can then be performed along with a variety of visualizations. +Claman (Calico Lipidomics And Metabolomics ANalysis) can read .mzrollDB +files created using MAVEN or Quahog into an mzroll_list. These lists +can then be modified through normalization, signal flooring, and filtering. +Differential abundance analysis of metabolites can then be performed along +with a variety of visualizations. } diff --git a/man/collapse_injections.Rd b/man/collapse_injections.Rd index 6c5eb66..a7b0b31 100644 --- a/man/collapse_injections.Rd +++ b/man/collapse_injections.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mutate_mzroll_list.R +% Please edit documentation in R/injections.R \name{collapse_injections} \alias{collapse_injections} \title{Collapse Injections} @@ -16,10 +16,10 @@ collapse_injections( \link{process_mzroll_multi} \itemize{ - \item{peakgroups: one row per unique analyte (defined by a + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} }} \item{grouping_vars}{character vector of sample variables to use for @@ -28,7 +28,7 @@ grouping} \item{peak_quant_vars}{character vector of quantification variables to average} -\item{collapse_fxn}{function to use for collapse} +\item{collapse_fxn}{function name to use for collapse} } \value{ a \code{\link{process_mzroll}} collapsed across grouping_vars @@ -36,3 +36,10 @@ a \code{\link{process_mzroll}} collapsed across grouping_vars \description{ Collapse Injections } +\examples{ +collapse_injections( + nplug_mzroll_augmented, + grouping_vars = "condition", + peak_quant_vars = "log2_abundance" + ) +} diff --git a/man/configure_db_access.Rd b/man/configure_db_access.Rd deleted file mode 100644 index ecf03be..0000000 --- a/man/configure_db_access.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{configure_db_access} -\alias{configure_db_access} -\title{Configure Database Access} -\usage{ -configure_db_access(standards_db = "standard") -} -\arguments{ -\item{standards_db}{type of standards database to connect to: -\itemize{ - \item{standard: just standards} - \item{extended: standards, external libraries and predicted spectra} - }} -} -\value{ -a list containing connections to the standards and systematic - compounds database -} -\description{ -Setup and test connection to Calico databases -} diff --git a/man/create_sqlite_con.Rd b/man/create_sqlite_con.Rd new file mode 100644 index 0000000..2fea12f --- /dev/null +++ b/man/create_sqlite_con.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/import_mzroll.R +\name{create_sqlite_con} +\alias{create_sqlite_con} +\title{Create SQLite Connection} +\usage{ +create_sqlite_con(sqlite_path) +} +\arguments{ +\item{sqlite_path}{path to sqlite file} +} +\value{ +sqlite connection +} +\description{ +Open a connection to an SQLite database +} diff --git a/man/diffex_mzroll.Rd b/man/diffex_mzroll.Rd index 0535fc4..53b12af 100644 --- a/man/diffex_mzroll.Rd +++ b/man/diffex_mzroll.Rd @@ -4,25 +4,20 @@ \alias{diffex_mzroll} \title{Differential Expression Analysis on Mzroll List} \usage{ -diffex_mzroll( - mzroll_list, - value.var = "centered_log2_abundance", - test_model, - null_model = NULL -) +diffex_mzroll(mzroll_list, value_var, test_model, null_model = NULL) } \arguments{ \item{mzroll_list}{output of \link{process_mzroll} or \link{process_mzroll_multi} \itemize{ - \item{peakgroups: one row per unique analyte (defined by a + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} }} -\item{value.var}{which variable in "peaks" to use for quantification.} +\item{value_var}{measurement variable} \item{test_model}{a RHS formula for regression} @@ -36,12 +31,16 @@ a tibble of significance tests for each feature Differential Expression Analysis on Mzroll List } \examples{ -library(dplyr) - -mzroll_list <- readRDS( - authutils::get_clamr_assets("X0083-mzroll-list.Rds") -) \%>\% - floor_peaks(12) - -diffex_mzroll(mzroll_list, test_model = "mutation + sex") +diffex_mzroll( + nplug_mzroll_normalized, + "normalized_log2_abundance", + "limitation + limitation:DR + 0" + ) + +diffex_mzroll( + nplug_mzroll_normalized, + "normalized_log2_abundance", + "limitation + 0", + "0" + ) } diff --git a/man/filter_groupIds.Rd b/man/filter_groupIds.Rd index 60657e1..b92ee0a 100644 --- a/man/filter_groupIds.Rd +++ b/man/filter_groupIds.Rd @@ -11,10 +11,10 @@ filter_groupIds(mzroll_list, groupIds, invert = FALSE) \link{process_mzroll_multi} \itemize{ - \item{peakgroups: one row per unique analyte (defined by a + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} }} \item{groupIds}{groupIds to retain} @@ -25,3 +25,7 @@ provided groupIds} \description{ Remove all data besides those from provided groupIds } +\examples{ +filter_groupIds(nplug_mzroll_normalized, 1:10) + +} diff --git a/man/filter_peakgroups_quo.Rd b/man/filter_peakgroups_quo.Rd deleted file mode 100644 index b598080..0000000 --- a/man/filter_peakgroups_quo.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter_mzroll_list.R -\name{filter_peakgroups_quo} -\alias{filter_peakgroups_quo} -\title{Filter Calicomics Peakgroups} -\usage{ -filter_peakgroups_quo(mzroll_list, filter_quosure) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} - -\item{filter_quosure}{quosure to use to filter peakgroups} -} -\value{ -an \code{mzroll_list} with peakgroups (and corresponding peaks) - removed -} -\description{ -Remove peakgroups based on peakgroups table variables -} -\examples{ -mzroll_list <- readRDS(authutils::get_clamr_assets("X0083-mzroll-list.Rds")) -filter_quosure <- rlang::quo(focus_pathway \%in\% "Pyrimidine metabolism") -filter_peakgroups_quo(mzroll_list, filter_quosure) -} diff --git a/man/filter_samples_quo.Rd b/man/filter_samples_quo.Rd deleted file mode 100644 index 368d862..0000000 --- a/man/filter_samples_quo.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter_mzroll_list.R -\name{filter_samples_quo} -\alias{filter_samples_quo} -\title{Filter Calicomics Samples} -\usage{ -filter_samples_quo(mzroll_list, filter_quosure) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} - -\item{filter_quosure}{quosure to use to filter samples} -} -\value{ -an \code{mzroll_list} with samples (and corresponding peaks) removed -} -\description{ -Remove samples based on sample table variables -} -\examples{ -mzroll_list <- readRDS(authutils::get_clamr_assets("X0083-mzroll-list.Rds")) -filter_quosure <- rlang::quo(sex == "Male") -filter_samples_quo(mzroll_list, filter_quosure) -} diff --git a/man/find_pathway_enrichments.Rd b/man/find_pathway_enrichments.Rd new file mode 100644 index 0000000..e4c4731 --- /dev/null +++ b/man/find_pathway_enrichments.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pathway_enrichment.R +\name{find_pathway_enrichments} +\alias{find_pathway_enrichments} +\title{Find Pathway Enrichments} +\usage{ +find_pathway_enrichments( + mzroll_list, + regression_significance, + pathway_list, + ranking_measure = "statistic", + test_absolute_effects = TRUE +) +} +\arguments{ +\item{mzroll_list}{output of \link{process_mzroll} or + \link{process_mzroll_multi} + +\itemize{ + \item{features: one row per unique analyte (defined by a + unique groupId)}, + \item{samples: one row per unique sample (defined by a unique sampleId)}, + \item{measurements: one row per peak (samples x peakgroups)} + }} + +\item{regression_significance}{returned by \code{\link{diffex_mzroll}}; +a tibble of tests performed.} + +\item{pathway_list}{a named list where names are pathways and entries +are groupIds belonging to each pathway} + +\item{ranking_measure}{variable in \code{regression_significance} to use +when calculating enrichment.} + +\item{test_absolute_effects}{If TRUE then only consider the magnitude of +test-statistics when calculating rankings for enrichment; if FALSE then +consider sign as well.} +} +\value{ +a list containing: +\itemize{ + \item{enrichment_table: a tibble containing each term's enrichment for + every pathway and an enrichment_plot} + \item{enrichment_plots: pre-generated plots containing the most + significant enrichments for each term} + } +} +\description{ +Find Pathway Enrichments +} +\examples{ + +library(dplyr) + +regression_significance <- diffex_mzroll( + nplug_mzroll_normalized, + "normalized_log2_abundance", + "limitation + limitation:DR + 0" + ) +pathway_nest <- nplug_mzroll_normalized$features \%>\% + dplyr::select(groupId, pathway) \%>\% + tidyr::nest(pathway_members = groupId) + +pathway_list <- purrr::map( + pathway_nest$pathway_members, + function(x) { + as.character(x$groupId) + } + ) +names(pathway_list) <- pathway_nest$pathway + +find_pathway_enrichments( + mzroll_list = nplug_mzroll_normalized, + regression_significance, + pathway_list, + test_absolute_effects = FALSE + ) + +} diff --git a/man/find_tracking_sheet.Rd b/man/find_tracking_sheet.Rd deleted file mode 100644 index aeeb5e3..0000000 --- a/man/find_tracking_sheet.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sample_metadata.R -\name{find_tracking_sheet} -\alias{find_tracking_sheet} -\title{Find Tracking Sheet} -\usage{ -find_tracking_sheet(pattern) -} -\arguments{ -\item{pattern}{Pattern to look for. - -The default interpretation is a regular expression, as described -in \link[stringi:about_search_regex]{stringi::stringi-search-regex}. Control options with -\code{\link[stringr:modifiers]{regex()}}. - -Match a fixed string (i.e. by comparing only bytes), using -\code{\link[stringr:modifiers]{fixed()}}. This is fast, but approximate. Generally, -for matching human text, you'll want \code{\link[stringr:modifiers]{coll()}} which -respects character matching rules for the specified locale. - -Match character, word, line and sentence boundaries with -\code{\link[stringr:modifiers]{boundary()}}. An empty pattern, "", is equivalent to -\code{boundary("character")}.} -} -\value{ -tracking_sheet_id: if a unique tracking sheet is found, then - return the googlesheets id -} -\description{ -Locate the meta data tracking sheet for the project -} -\examples{ -\dontrun{ -find_tracking_sheet(pattern = "X0083") -} - -} diff --git a/man/floor_peaks.Rd b/man/floor_peaks.Rd index 900023d..66e4ae3 100644 --- a/man/floor_peaks.Rd +++ b/man/floor_peaks.Rd @@ -11,10 +11,10 @@ floor_peaks(mzroll_list, log2_floor_value = 12) \link{process_mzroll_multi} \itemize{ - \item{peakgroups: one row per unique analyte (defined by a + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} }} \item{log2_floor_value}{minimum value to set for low abundance or @@ -27,3 +27,7 @@ missing peaks} Set a minimum peak abundance of floor_value for low abundance and undetected peaks. } +\examples{ +floored_peaks <- floor_peaks(nplug_mzroll_augmented, 12) + +} diff --git a/man/hclust_order.Rd b/man/hclust_order.Rd deleted file mode 100644 index 8275717..0000000 --- a/man/hclust_order.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/visualizations.R -\name{hclust_order} -\alias{hclust_order} -\title{Hierarchical clustering order} -\usage{ -hclust_order(df, feature_pk, sample_pk, measurement_var, cluster_dim) -} -\arguments{ -\item{df}{data.frame to cluster} - -\item{feature_pk}{variable uniquely defining a row} - -\item{sample_pk}{variable uniquely definining a sample} - -\item{measurement_var}{measurement variables} - -\item{cluster_dim}{rows, columns, or both} -} -\value{ -a list containing a hierarchically clustered set of rows - and/or columns -} -\description{ -Format and hierarchically cluster a data.frame. If hclust could not - normally be produced (usually because no samples are in common for a - feature) pad the matrix with zeros and still calculate the distance -} diff --git a/man/import_sample_sheet.Rd b/man/import_sample_sheet.Rd deleted file mode 100644 index 63de314..0000000 --- a/man/import_sample_sheet.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sample_metadata.R -\name{import_sample_sheet} -\alias{import_sample_sheet} -\title{Import Sample List} -\usage{ -import_sample_sheet(pattern) -} -\arguments{ -\item{pattern}{Pattern to look for. - -The default interpretation is a regular expression, as described -in \link[stringi:about_search_regex]{stringi::stringi-search-regex}. Control options with -\code{\link[stringr:modifiers]{regex()}}. - -Match a fixed string (i.e. by comparing only bytes), using -\code{\link[stringr:modifiers]{fixed()}}. This is fast, but approximate. Generally, -for matching human text, you'll want \code{\link[stringr:modifiers]{coll()}} which -respects character matching rules for the specified locale. - -Match character, word, line and sentence boundaries with -\code{\link[stringr:modifiers]{boundary()}}. An empty pattern, "", is equivalent to -\code{boundary("character")}.} -} -\description{ -Find, read and process a sample meta sheet from googlesheets -} -\examples{ -\dontrun{ -import_sample_sheet(pattern = "X0083") -} - -} diff --git a/man/lipid_components.Rd b/man/lipid_components.Rd index d6dea5c..9300b73 100644 --- a/man/lipid_components.Rd +++ b/man/lipid_components.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R +% Please edit documentation in R/sample_metadata.R \name{lipid_components} \alias{lipid_components} \title{Generate relevant lipid components from compound name} diff --git a/man/merge_compounds_tbl.Rd b/man/merge_compounds_tbl.Rd new file mode 100644 index 0000000..bfebceb --- /dev/null +++ b/man/merge_compounds_tbl.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compounds_metadata.R +\name{merge_compounds_tbl} +\alias{merge_compounds_tbl} +\title{Merge Compounds Table} +\usage{ +merge_compounds_tbl(mzroll_list, compounds_tbl, join_by = "compoundName") +} +\arguments{ +\item{mzroll_list}{output of \link{process_mzroll} or + \link{process_mzroll_multi} + +\itemize{ + \item{features: one row per unique analyte (defined by a + unique groupId)}, + \item{samples: one row per unique sample (defined by a unique sampleId)}, + \item{measurements: one row per peak (samples x peakgroups)} + }} + +\item{compounds_tbl}{Table of compound metadata} + +\item{join_by}{variable shared by mzroll peakgroups and compounds_tbl +to merge results by.} +} +\description{ +Merge a table of compound information with an existing mzroll_list +} +\examples{ +mzroll_list <- process_mzroll(nplug_mzroll()) +compounds_tbl <- nplug_compounds +merge_compounds_tbl(mzroll_list, compounds_tbl) + +} diff --git a/man/merge_samples_tbl.Rd b/man/merge_samples_tbl.Rd new file mode 100644 index 0000000..b045948 --- /dev/null +++ b/man/merge_samples_tbl.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_metadata.R +\name{merge_samples_tbl} +\alias{merge_samples_tbl} +\title{Merge Samples Table} +\usage{ +merge_samples_tbl(mzroll_list, samples_tbl, id_strings, exact = TRUE) +} +\arguments{ +\item{mzroll_list}{output of \link{process_mzroll} or + \link{process_mzroll_multi} + +\itemize{ + \item{features: one row per unique analyte (defined by a + unique groupId)}, + \item{samples: one row per unique sample (defined by a unique sampleId)}, + \item{measurements: one row per peak (samples x peakgroups)} + }} + +\item{samples_tbl}{Table of sample metadata} + +\item{id_strings}{one or more variables which will be used to match +sample names.} + +\item{exact}{if true, an exact match between mzroll names and id_strings +will be found; if false, then substring matches will be used.} +} +\description{ +Merge a table of sample metadata with an existing mzroll_list +} +\examples{ +mzroll_list <- process_mzroll(nplug_mzroll()) +merge_samples_tbl(mzroll_list, nplug_samples, "sample_name") + +} diff --git a/man/mzroll_table.Rd b/man/mzroll_table.Rd deleted file mode 100644 index 0b1ca6d..0000000 --- a/man/mzroll_table.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{mzroll_table} -\alias{mzroll_table} -\title{Flatten mzroll list to single table} -\usage{ -mzroll_table(mzroll_list) -} -\arguments{ -\item{mzroll_list}{output of process_mzrollDB functions -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a unique - groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} -} -\value{ -mzroll_list (process_mzrollDB functions output) reformatted as a - single table -} -\description{ -Flatten mzroll list to single table using groupId and sampleId -} diff --git a/man/normalize_peaks.Rd b/man/normalize_peaks.Rd index 947561c..337f438 100644 --- a/man/normalize_peaks.Rd +++ b/man/normalize_peaks.Rd @@ -48,7 +48,8 @@ normalize_peaks.reference_condition( quant_peak_varname, norm_peak_varname, condition_varname = "condition #", - reference_varname = "reference condition #" + reference_varname = "reference condition #", + log2_floor_value = NA ) normalize_peaks.loess( @@ -64,10 +65,10 @@ normalize_peaks.loess( \link{process_mzroll_multi} \itemize{ - \item{peakgroups: one row per unique analyte (defined by a + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} }} \item{normalization_method}{Normalization method to apply @@ -82,9 +83,10 @@ normalize_peaks.loess( \code{norm_peak_varname})} }} -\item{quant_peak_varname}{variable in peaks to use for abundance} +\item{quant_peak_varname}{variable in measurements to use for abundance} -\item{norm_peak_varname}{variable in peaks to add for normalized abundance} +\item{norm_peak_varname}{variable in measurements to add for normalized +abundance} \item{...}{additional arguments to pass to normalization methods} @@ -104,40 +106,62 @@ reference samples} \item{condition_varname}{variable specifying each sample's condition} \item{weights_tribble}{a table containing weights and sample variables to -match them to} +match them to.} } \value{ a \code{mzroll_list} as generated by \code{\link{process_mzroll}} - a \code{norm_peak_varname} variable added to peaks + a \code{norm_peak_varname} variable added to measuremnets } \description{ -This normalization was reported in Kamphorst et al. 2015, see - \url{https://github.com/shackett/Pancreatic_tumor_metabolomics}. +Using a robust metabolomics normalization rescale ion counts based on the + consensus signal of each sample. + +Compare each sample to the reference samples within the same batch + +Compare each sample to the reference samples within the same batch } \details{ -compare each sample to the reference samples within the same batch - -compare each sample to the reference samples within the same batch +The robust median polish was reported in Kamphorst et al. 2015, see + \url{https://github.com/shackett/Pancreatic_tumor_metabolomics}. reference_varname specifies which condition to contrast a given sample to (using the condition values from condition_varname) } \examples{ -mzroll_list <- readRDS(authutils::get_clamr_assets("X0083-mzroll-list.Rds")) - normalize_peaks( - mzroll_list, + nplug_mzroll_augmented, "median polish", quant_peak_varname = "log2_abundance", - norm_peak_varname = "normalized_log2_IC" + norm_peak_varname = "normalized_log2_abundance" ) + normalize_peaks( - mzroll_list, + nplug_mzroll_augmented, "center batches", quant_peak_varname = "log2_abundance", - norm_peak_varname = "normalized_log2_IC", - batch_varnames = "sex", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = "month", centering_fxn = median ) + +normalize_peaks( + nplug_mzroll_augmented, + normalization_method = "reference condition", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + condition_varname = "condition", + reference_varname = "reference" +) + +normalize_peaks( + nplug_mzroll_augmented, + normalization_method = "reference sample", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = c("month", "extraction"), + reference_varname = "exp_ref", + reference_values = "ref" +) + } diff --git a/man/nplug_compounds.Rd b/man/nplug_compounds.Rd new file mode 100644 index 0000000..b1e7434 --- /dev/null +++ b/man/nplug_compounds.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{nplug_compounds} +\alias{nplug_compounds} +\title{NPLUG samples} +\format{ +An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 108 rows and 2 columns. +} +\usage{ +nplug_compounds +} +\description{ +A table of meta-data for all NPLUG features with variables: +\describe{ + \item{compoundName}{compound name corresponding to the compoundName + variable in the nplug_mzroll's features table.} + \item{pathway}{A (rough) categorization of compounds into metabolic + pathways.} + } +} +\seealso{ +Other nplug: +\code{\link{nplug_mzroll_augmented}}, +\code{\link{nplug_mzroll_normalized}}, +\code{\link{nplug_mzroll}()}, +\code{\link{nplug_samples}} +} +\concept{nplug} +\keyword{datasets} diff --git a/man/nplug_mzroll.Rd b/man/nplug_mzroll.Rd new file mode 100644 index 0000000..fb796d2 --- /dev/null +++ b/man/nplug_mzroll.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\name{nplug_mzroll} +\alias{nplug_mzroll} +\title{Growth-Limiting Metabolites} +\usage{ +nplug_mzroll() +} +\value{ +path to the NPLUG .mzrollDB dataset +} +\description{ +The NPLUG experiment has a full factorial design exploring relationship + between nutrient limitation and growth rate in yeast. +} +\details{ +The experiment contains 25 primary experimental conditions: +\itemize{ + \item{5 limiting nutrients: nitrogen, phosphorous, leucine, uracil and + glucose (i.e., NPLUG!)} + \item{5 dilution rates (growth rate / ln(2)): from 0.05-0.3 1/hrs} +} + +Metabolomics of + limited yeast cultures growing at different rates. +} +\seealso{ +Other nplug: +\code{\link{nplug_compounds}}, +\code{\link{nplug_mzroll_augmented}}, +\code{\link{nplug_mzroll_normalized}}, +\code{\link{nplug_samples}} +} +\concept{nplug} diff --git a/man/nplug_mzroll_augmented.Rd b/man/nplug_mzroll_augmented.Rd new file mode 100644 index 0000000..ab63202 --- /dev/null +++ b/man/nplug_mzroll_augmented.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{nplug_mzroll_augmented} +\alias{nplug_mzroll_augmented} +\title{NPLUG MzRoll Augmented} +\format{ +An object of class \code{triple_omic} (inherits from \code{tomic}, \code{mzroll}) of length 4. +} +\usage{ +nplug_mzroll_augmented +} +\description{ +\link{nplug_mzroll} formatted with \link{process_mzroll} with sample + (\link{nplug_samples}) and compound (\link{nplug_samples}) merged. +} +\seealso{ +Other nplug: +\code{\link{nplug_compounds}}, +\code{\link{nplug_mzroll_normalized}}, +\code{\link{nplug_mzroll}()}, +\code{\link{nplug_samples}} +} +\concept{nplug} +\keyword{datasets} diff --git a/man/nplug_mzroll_normalized.Rd b/man/nplug_mzroll_normalized.Rd new file mode 100644 index 0000000..c3f2d6e --- /dev/null +++ b/man/nplug_mzroll_normalized.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{nplug_mzroll_normalized} +\alias{nplug_mzroll_normalized} +\title{NPLUG MzRoll Normalized} +\format{ +An object of class \code{triple_omic} (inherits from \code{tomic}, \code{mzroll}) of length 4. +} +\usage{ +nplug_mzroll_normalized +} +\description{ +\link{nplug_mzroll_augmented} with injections collapsed using + \link{collapse_injections}, followed by reference-sample normalization + using \link{normalize_peaks}. Finally, reference samples and samples + extracting using the pellet method were removed and sample names + were cleaned up. These steps are described in the NPLUG vignette. +} +\seealso{ +Other nplug: +\code{\link{nplug_compounds}}, +\code{\link{nplug_mzroll_augmented}}, +\code{\link{nplug_mzroll}()}, +\code{\link{nplug_samples}} +} +\concept{nplug} +\keyword{datasets} diff --git a/man/nplug_samples.Rd b/man/nplug_samples.Rd new file mode 100644 index 0000000..620afc6 --- /dev/null +++ b/man/nplug_samples.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{nplug_samples} +\alias{nplug_samples} +\title{NPLUG samples} +\format{ +An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 136 rows and 9 columns. +} +\usage{ +nplug_samples +} +\description{ +A table of meta-data for all NPLUG samples with variables: +\describe{ + \item{sample_name}{unique descriptor of each sample} + \item{month}{month of sample generation} + \item{replicate}{culture replicate} + \item{DR}{dilution rate of culture - this is proportional to cells' + growth rate} + \item{limitation}{nutrient limiting growth + \itemize{ + \item{GLU - carbon} + \item{LEU - leucine (in a Leu4 auxotroph)} + \item{NH4 - nitrogen} + \item{PO4 - phosphorous} + \item{URA - uracil (in a Ura2 auxotroph)} + } + } + \item{exp_ref}{experimental or reference condition} + \item{extraction}{filter- or pellet-based extraction} + \item{condition}{Integer value for each unique condition. Here, that is + a unique values of (month, limitation, DR, and extraction) for + experimental samples, and unique values of (month and extraction) for + reference samples.} + \item{reference}{Condition number that this sample should be compared to.} + } +} +\seealso{ +Other nplug: +\code{\link{nplug_compounds}}, +\code{\link{nplug_mzroll_augmented}}, +\code{\link{nplug_mzroll_normalized}}, +\code{\link{nplug_mzroll}()} +} +\concept{nplug} +\keyword{datasets} diff --git a/man/pathway_enrichments.Rd b/man/pathway_enrichments.Rd deleted file mode 100644 index 523ca4e..0000000 --- a/man/pathway_enrichments.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pathway_enrichment.R -\name{pathway_enrichments} -\alias{pathway_enrichments} -\title{Pathway Enrichments} -\usage{ -pathway_enrichments( - mzroll_list, - significance, - standard_databases, - test_absolute_effects = TRUE -) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} - -\item{significance}{returned by \code{\link{diffex_mzroll}}; a tibble of -tests performed.} - -\item{standard_databases}{connections to Calico standards and systematic -compound databases - generated with \link{configure_db_access}.} - -\item{test_absolute_effects}{If TRUE then only consider the magnitude of -test-statistics when calculating rankings for enrichment; if FALSE then -consider sign as well.} -} -\value{ -a list containing: -\itemize{ - \item{enrichment_table: a tibble containing each term's enrichment for - every pathway and an enrichment_plot} - \item{enrichment_plots: pre-generated plots containing the most - significant enrichments for each term} - } -} -\description{ -Pathway Enrichments -} diff --git a/man/plot_barplot.Rd b/man/plot_barplot.Rd index c3152a3..92539e4 100644 --- a/man/plot_barplot.Rd +++ b/man/plot_barplot.Rd @@ -4,21 +4,39 @@ \alias{plot_barplot} \title{Plot Barplots} \usage{ -plot_barplot(mzroll_list, groupIds) +plot_barplot(mzroll_list, groupIds, grouping_vars, value_var, fill_var = NULL) } \arguments{ \item{mzroll_list}{output of \link{process_mzroll} or \link{process_mzroll_multi} \itemize{ - \item{peakgroups: one row per unique analyte (defined by a + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} }} \item{groupIds}{groupIds for compounds to plot} + +\item{grouping_vars}{variables to use for ordering samples and calculating +standard errors} + +\item{value_var}{measurement variable} + +\item{fill_var}{variable to use for filling bars} } \description{ Plot Barplots } +\examples{ + +plot_barplot( + mzroll_list = nplug_mzroll_normalized, + groupIds = 1:4, + grouping_vars = c("limitation", "DR"), + value_var = "normalized_log2_abundance", + fill_var = "limitation" + ) + +} diff --git a/man/plot_compare_injection.Rd b/man/plot_compare_injection.Rd new file mode 100644 index 0000000..05d104c --- /dev/null +++ b/man/plot_compare_injection.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/injections.R +\name{plot_compare_injection} +\alias{plot_compare_injection} +\title{Compare Injections} +\usage{ +plot_compare_injection( + mzroll_list, + grouping_vars = "condition", + peak_quant_var = "log2_abundance" +) +} +\arguments{ +\item{mzroll_list}{output of \link{process_mzroll} or + \link{process_mzroll_multi} + +\itemize{ + \item{features: one row per unique analyte (defined by a + unique groupId)}, + \item{samples: one row per unique sample (defined by a unique sampleId)}, + \item{measurements: one row per peak (samples x peakgroups)} + }} + +\item{grouping_vars}{character vector of sample variables to use for +grouping} + +\item{peak_quant_var}{variable to plot in comparison} +} +\description{ +Create a scatterplot of injection correlations. +} +\examples{ +plot_compare_injection( + nplug_mzroll_augmented, + grouping_vars = "condition", + peak_quant_var = "log2_abundance" + ) + +} diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd deleted file mode 100644 index 22eb8cb..0000000 --- a/man/plot_heatmap.Rd +++ /dev/null @@ -1,73 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/visualizations.R -\name{plot_heatmap} -\alias{plot_heatmap} -\title{Omics Heatmap} -\usage{ -plot_heatmap( - mzroll_list, - feature.var = "groupId", - sample.var = "name", - value.var = "centered_log2_abundance", - cluster_dim = "both", - change_threshold = 3, - plot_type = "plotly" -) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} - -\item{feature.var}{variable from "peakgroups" to use as a unique feature -label.} - -\item{sample.var}{variable from "samples" to use as a unique sample label.} - -\item{value.var}{which variable in "peaks" to use for quantification.} - -\item{cluster_dim}{dimensions to cluster (row, column, or both)} - -\item{change_threshold}{values with a more extreme absolute change will be -thresholded to this value.} - -\item{plot_type}{plotly (for interactivity) or grob (for a static ggplot)} -} -\description{ -Generate a basic heatmap of all of your metabolomic and lipidomic data -} -\examples{ -library(dplyr) -mzroll_list <- authutils::get_clamr_assets("X0083-M001A.mzrollDB") \%>\% - process_mzroll( - standard_databases = NULL, - method_tag = "M001A", - only_identified = TRUE - ) \%>\% - floor_peaks(12) - -plot_heatmap( - mzroll_list, - sample.var = "sampleId", - feature.var = "peak_label", - change_threshold = 5, - cluster_dim = "rows", - plot_type = "grob" -) -\dontrun{ -plot_heatmap( - mzroll_list, - sample.var = "sampleId", - feature.var = "peak_label", - change_threshold = 5, - cluster_dim = "rows" -) -} - -} diff --git a/man/plot_pvalues.Rd b/man/plot_pvalues.Rd new file mode 100644 index 0000000..c834013 --- /dev/null +++ b/man/plot_pvalues.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{plot_pvalues} +\alias{plot_pvalues} +\title{P-value Histogram} +\usage{ +plot_pvalues(regression_significance) +} +\arguments{ +\item{regression_significance}{returned by \code{\link{diffex_mzroll}}; +a tibble of tests performed.} +} +\description{ +P-value Histogram +} +\examples{ +regression_significance <- diffex_mzroll( + nplug_mzroll_normalized, + "normalized_log2_abundance", + "limitation + limitation:DR + 0" + ) + +plot_pvalues(regression_significance) + +} diff --git a/man/plot_volcano.Rd b/man/plot_volcano.Rd new file mode 100644 index 0000000..916de8a --- /dev/null +++ b/man/plot_volcano.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/differential_expression.R +\name{plot_volcano} +\alias{plot_volcano} +\title{Volcano plot} +\usage{ +plot_volcano(regression_significance, max_p_trans = 10, FDR_cutoff = 0.1) +} +\arguments{ +\item{regression_significance}{returned by \code{\link{diffex_mzroll}}; +a tibble of tests performed.} + +\item{max_p_trans}{maximum value of -log10 pvalues to plot} + +\item{FDR_cutoff}{FDR cutoff to label for significance} +} +\value{ +a grob +} +\description{ +Volcano plot +} +\examples{ +regression_significance <- diffex_mzroll( + nplug_mzroll_normalized, + "normalized_log2_abundance", + "limitation + limitation:DR + 0" + ) + +plot_volcano(regression_significance, 10, 0.1) + +} diff --git a/man/process_mzroll.Rd b/man/process_mzroll.Rd index 1dd5947..327007b 100644 --- a/man/process_mzroll.Rd +++ b/man/process_mzroll.Rd @@ -6,51 +6,36 @@ \usage{ process_mzroll( mzroll_db_path, - standard_databases = NULL, - sample_sheet_list = NULL, - method_tag = "", only_identified = TRUE, - validate = FALSE + validate = FALSE, + method_tag = "" ) } \arguments{ \item{mzroll_db_path}{path to mzroll DB file} -\item{standard_databases}{connections to Calico standards and systematic -compound databases - generated with \link{configure_db_access}.} - -\item{sample_sheet_list}{list of googlesheets information describing -experimental design - generated with \link{import_sample_sheet}.} - -\item{method_tag}{what method was run (for the purpose of matching -meta-data and aggregating).} - \item{only_identified}{TRUE/FALSE, filter to only features which were identified.} \item{validate}{TRUE/FALSE, use meta-data to only name the subset of features which were manually validated.} + +\item{method_tag}{what method was run (for the purpose of matching +meta-data and aggregating).} } \value{ -an mzroll_list containing three tibbles: +a **triple_omic** from **romic** containing three tibbles: \itemize{ - \item{peakgroups: one row per unique analyte (defined by a unique + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} } + And, a design list which tracks the variables in each table. } \description{ Process mzRoll } \examples{ -library(dplyr) - -authutils::get_clamr_assets("X0083-M001A.mzrollDB") \%>\% - process_mzroll( - standard_databases = NULL, - method_tag = "M001A", - only_identified = TRUE, - validate = FALSE - ) +process_mzroll(nplug_mzroll()) } diff --git a/man/process_mzroll_identify_peakgroups.Rd b/man/process_mzroll_identify_peakgroups.Rd new file mode 100644 index 0000000..9f08f52 --- /dev/null +++ b/man/process_mzroll_identify_peakgroups.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/import_mzroll.R +\name{process_mzroll_identify_peakgroups} +\alias{process_mzroll_identify_peakgroups} +\title{Process MzRoll - Identify Peakgroups} +\usage{ +process_mzroll_identify_peakgroups(peakgroups, only_identified) +} +\arguments{ +\item{peakgroups}{a table of distinct ions with characteristic m/z and rt} + +\item{only_identified}{TRUE/FALSE, filter to only features which were +identified.} +} +\value{ +a table of peakgroups +} +\description{ +Either remove unidentified peakgroups or name unknowns using m/z and rt. +} diff --git a/man/process_mzroll_load_peakgroups.Rd b/man/process_mzroll_load_peakgroups.Rd new file mode 100644 index 0000000..e7fa851 --- /dev/null +++ b/man/process_mzroll_load_peakgroups.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/import_mzroll.R +\name{process_mzroll_load_peakgroups} +\alias{process_mzroll_load_peakgroups} +\title{Process MzRoll - Load Peakgroups} +\usage{ +process_mzroll_load_peakgroups(mzroll_db_con) +} +\arguments{ +\item{mzroll_db_con}{an SQLite connection to an mzrollDB database} +} +\value{ +a table of peakgroups +} +\description{ +Load peakgroups table and add mz and rt variables to peakgroups based on + peak-level data. +} diff --git a/man/process_mzroll_multi.Rd b/man/process_mzroll_multi.Rd index 91f5800..f6107ac 100644 --- a/man/process_mzroll_multi.Rd +++ b/man/process_mzroll_multi.Rd @@ -6,10 +6,11 @@ \usage{ process_mzroll_multi( mzroll_paths, - standard_databases, - sample_sheet_list, + samples_tbl, + id_strings, only_identified = TRUE, - validate = FALSE + validate = FALSE, + exact = TRUE ) } \arguments{ @@ -19,17 +20,19 @@ process_mzroll_multi( \item{mzroll_db_path: path to a mzrollDB file} }} -\item{standard_databases}{connections to Calico standards and systematic -compound databases - generated with \link{configure_db_access}.} +\item{samples_tbl}{Table of sample metadata} -\item{sample_sheet_list}{list of googlesheets information describing -experimental design - generated with \link{import_sample_sheet}.} +\item{id_strings}{one or more variables which will be used to match +sample names.} \item{only_identified}{TRUE/FALSE, filter to only features which were identified.} \item{validate}{TRUE/FALSE, use meta-data to only name the subset of features which were manually validated.} + +\item{exact}{if true, an exact match between mzroll names and id_strings +will be found; if false, then substring matches will be used.} } \value{ an mzroll_list containing three tibbles: @@ -43,3 +46,13 @@ an mzroll_list containing three tibbles: \description{ Aggregate multiple mzrollDB datasets which possess the same sample metadata } +\examples{ +mzroll_paths <- tibble::tribble( + ~method_tag, ~mzroll_db_path, + "method1", nplug_mzroll(), + "method2", nplug_mzroll() + ) + +process_mzroll_multi(mzroll_paths, nplug_samples, "sample_name") + +} diff --git a/man/process_mzroll_validate_peakgroups.Rd b/man/process_mzroll_validate_peakgroups.Rd new file mode 100644 index 0000000..d0fb636 --- /dev/null +++ b/man/process_mzroll_validate_peakgroups.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/import_mzroll.R +\name{process_mzroll_validate_peakgroups} +\alias{process_mzroll_validate_peakgroups} +\title{Process MzRoll - Validate Peakgroups} +\usage{ +process_mzroll_validate_peakgroups(peakgroups, validate) +} +\arguments{ +\item{peakgroups}{a table of distinct ions with characteristic m/z and rt} + +\item{validate}{TRUE/FALSE, use meta-data to only name the subset of +features which were manually validated.} +} +\value{ +a table of peakgroups +} +\description{ +If validate is True then remove unvalidated compoundNames (based on + peakgroup labels) and identify all unvalidated compounds with an + "is_unknown" variable. +} diff --git a/man/pvalue_histograms.Rd b/man/pvalue_histograms.Rd deleted file mode 100644 index a3abdb6..0000000 --- a/man/pvalue_histograms.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/differential_expression.R -\name{pvalue_histograms} -\alias{pvalue_histograms} -\title{P-value Histogram} -\usage{ -pvalue_histograms(regression_significance) -} -\arguments{ -\item{regression_significance}{output of diffex_mzroll} -} -\description{ -P-value Histogram -} diff --git a/man/query_systematic_compounds.Rd b/man/query_systematic_compounds.Rd deleted file mode 100644 index 3046642..0000000 --- a/man/query_systematic_compounds.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/systematic_db_query.R -\name{query_systematic_compounds} -\alias{query_systematic_compounds} -\title{Query Systematic Compounds} -\usage{ -query_systematic_compounds( - systematicCompoundIds = NULL, - systematic_compounds_con -) -} -\arguments{ -\item{systematicCompoundIds}{NULL to query all compounds or a numeric -vector of systematic compound IDs} - -\item{systematic_compounds_con}{A connection to an SQL database which -stores the structure and systematic IDs of well-studied compounds.} -} -\value{ -a list containing: -\itemize{ - \item{distinct_compounds: unique database entries} - \item{database_ids: systematic IDs of compounds in KEGG, HMDB, ...} - \item{compound_aliases: alternative names for a compound} - \item{metabolic_pathways: KEGG pathways associated with a metabolite} - } -} -\description{ -Query Systematic Compounds -} -\examples{ -\dontrun{ -systematic_compounds_con <- configure_db_access()$systematic_compounds_con -systematic_compounds <- query_systematic_compounds( - systematic_compounds_con = systematic_compounds_con -) -} -} diff --git a/man/read_sample_list.Rd b/man/read_sample_list.Rd deleted file mode 100644 index cec83fe..0000000 --- a/man/read_sample_list.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sample_metadata.R -\name{read_sample_list} -\alias{read_sample_list} -\title{Read Sample List} -\usage{ -read_sample_list(tracking_sheet_id) -} -\arguments{ -\item{tracking_sheet_id}{output of \link{find_tracking_sheet}} -} -\description{ -Read Sample List -} diff --git a/man/reconcile_mzroll_list.Rd b/man/reconcile_mzroll_list.Rd deleted file mode 100644 index 39e7387..0000000 --- a/man/reconcile_mzroll_list.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter_mzroll_list.R -\name{reconcile_mzroll_list} -\alias{reconcile_mzroll_list} -\title{Reconcile Mzroll List} -\usage{ -reconcile_mzroll_list(mzroll_list) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} -} -\description{ -If some samples, peakgroups or peaks have been dropped; peaks and peakgroups - which may now be irrelevant -} diff --git a/man/summarize_compound_pathways.Rd b/man/summarize_compound_pathways.Rd deleted file mode 100644 index 3519743..0000000 --- a/man/summarize_compound_pathways.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/systematic_db_query.R -\name{summarize_compound_pathways} -\alias{summarize_compound_pathways} -\title{Summarize compound pathways} -\usage{ -summarize_compound_pathways( - systematic_compounds, - min_pw_size = 5L, - focus_pathways = c("Glycolysis / Gluconeogenesis", "Citrate cycle (TCA cycle)", - "Pentose phosphate pathway", "Biosynthesis of amino acids", "Purine metabolism", - "Pyrimidine metabolism") -) -} -\arguments{ -\item{systematic_compounds}{The output of -\code{\link{query_systematic_compounds}}.} - -\item{min_pw_size}{Minimum number of examples of metabolites from a pathway -for the pathway not to be filtered.} - -\item{focus_pathways}{A character vector of names of KEGG pathways to be -included as \code{focus_pathway} annotations.} -} -\value{ -a tibble of systematicCompoundIds and pathway annotations -} -\description{ -Reduce compound pathway annotations to a set of core "focus" pathways and - to well-represented (>= \code{min_pw_size} examples) general pathways. -} diff --git a/man/test_mzroll_list.Rd b/man/test_mzroll_list.Rd index f5ba7ae..bbd1ebb 100644 --- a/man/test_mzroll_list.Rd +++ b/man/test_mzroll_list.Rd @@ -2,21 +2,24 @@ % Please edit documentation in R/utils.R \name{test_mzroll_list} \alias{test_mzroll_list} -\title{Test Mzroll List} +\title{Test MzRoll List} \usage{ -test_mzroll_list(mzroll_list) +test_mzroll_list(mzroll_list, fast_check = TRUE) } \arguments{ \item{mzroll_list}{output of \link{process_mzroll} or \link{process_mzroll_multi} \itemize{ - \item{peakgroups: one row per unique analyte (defined by a + \item{features: one row per unique analyte (defined by a unique groupId)}, \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} + \item{measurements: one row per peak (samples x peakgroups)} }} + +\item{fast_check}{if TRUE then skip some checks which are slow and that are +generally only needed when a \code{tomic} object is first created.} } \description{ -Test Mzroll List +Test MzRoll List } diff --git a/man/util_pretty_khead.Rd b/man/util_pretty_khead.Rd new file mode 100644 index 0000000..fa0feca --- /dev/null +++ b/man/util_pretty_khead.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{util_pretty_khead} +\alias{util_pretty_khead} +\title{Util - Pretty Knitr Head} +\usage{ +util_pretty_khead(tbl, nrows = 10, caption = NULL) +} +\arguments{ +\item{tbl}{a data.frame or tibble} + +\item{nrows}{the max number of rows to show} + +\item{caption}{The table caption.} +} +\value{ +an html knitr table +} +\description{ +Util - Pretty Knitr Head +} +\examples{ +util_pretty_khead(mtcars, nrows = 5, caption = "cars!") +} diff --git a/man/volcano_plot.Rd b/man/volcano_plot.Rd deleted file mode 100644 index 10a741c..0000000 --- a/man/volcano_plot.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/differential_expression.R -\name{volcano_plot} -\alias{volcano_plot} -\title{Volcano plot} -\usage{ -volcano_plot(regression_significance, max_p_trans = 10, FDR_cutoff = 0.1) -} -\arguments{ -\item{regression_significance}{output of diffex_mzroll} - -\item{max_p_trans}{maximum value of -log10 pvalues to plot} - -\item{FDR_cutoff}{FDR cutoff to label for significance} -} -\description{ -Volcano plot -} diff --git a/man/write_tidy_list_augmented.Rd b/man/write_tidy_list_augmented.Rd deleted file mode 100644 index ee006c4..0000000 --- a/man/write_tidy_list_augmented.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/export.R -\name{write_tidy_list_augmented} -\alias{write_tidy_list_augmented} -\title{Write flat tidy list} -\usage{ -write_tidy_list_augmented(mzroll_list, dir_path, name_preamble) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} - -\item{dir_path}{path to save outputs} - -\item{name_preamble}{start of output file name} -} -\value{ -Export one table which is one row per peak, which includes all feature and - sample attributes. -} -\description{ -Write flat tidy list -} diff --git a/man/write_tidy_list_triple.Rd b/man/write_tidy_list_triple.Rd deleted file mode 100644 index 5e196e0..0000000 --- a/man/write_tidy_list_triple.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/export.R -\name{write_tidy_list_triple} -\alias{write_tidy_list_triple} -\title{Write flat tidy list} -\usage{ -write_tidy_list_triple(mzroll_list, dir_path, name_preamble) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} - -\item{dir_path}{path to save outputs} - -\item{name_preamble}{start of output file name} -} -\value{ -Export three tables: -\itemize{ - \item{features: one row per features measured (i.e., a metabolite)} - \item{sample: one row per sample} - \item{measurements: one row per measurement (i.e., one metabolite in - one sample)} -} -} -\description{ -Write flat tidy list -} diff --git a/man/write_tidy_list_wide.Rd b/man/write_tidy_list_wide.Rd deleted file mode 100644 index e6c0c73..0000000 --- a/man/write_tidy_list_wide.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/export.R -\name{write_tidy_list_wide} -\alias{write_tidy_list_wide} -\title{Write wide output} -\usage{ -write_tidy_list_wide( - mzroll_list, - dir_path, - name_preamble, - value.var = "log2_abundance", - transpose = FALSE -) -} -\arguments{ -\item{mzroll_list}{output of \link{process_mzroll} or - \link{process_mzroll_multi} - -\itemize{ - \item{peakgroups: one row per unique analyte (defined by a - unique groupId)}, - \item{samples: one row per unique sample (defined by a unique sampleId)}, - \item{peaks: one row per peak (samples x peakgroups)} - }} - -\item{dir_path}{path to save outputs} - -\item{name_preamble}{start of output file name} - -\item{value.var}{which variable in "peaks" to use for quantification.} - -\item{transpose}{if TRUE then samples will be stored as rows} -} -\value{ -Export one table which contains metabolites as rows and samples as columns. -} -\description{ -abundances form a matrix with metabolites as rows and samples as columns. - Use transpose to treat samples as rows -} diff --git a/tests/testthat.R b/tests/testthat.R index 3fac22a..eaed82f 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,4 @@ library(testthat) -library(calicomics) +library(claman) -test_check("calicomics") +test_check("claman") diff --git a/tests/testthat/test-filters.R b/tests/testthat/test-filters.R new file mode 100644 index 0000000..e976788 --- /dev/null +++ b/tests/testthat/test-filters.R @@ -0,0 +1,18 @@ +test_that("Filtering by groupId works", { + + filtered_mzroll <- filter_groupIds( + claman::nplug_mzroll_normalized, + groupIds = 1:10, + invert = FALSE + ) + + expect_equal(as.numeric(filtered_mzroll$features$groupId), 1:10) + + filtered_mzroll <- filter_groupIds( + claman::nplug_mzroll_normalized, + groupIds = 1:10, + invert = TRUE + ) + + expect_equal(sum(c(1:11) %in% filtered_mzroll$features$groupId), 1) +}) diff --git a/tests/testthat/test_mutates.R b/tests/testthat/test_mutates.R index ae07114..a348db7 100644 --- a/tests/testthat/test_mutates.R +++ b/tests/testthat/test_mutates.R @@ -4,7 +4,184 @@ test_that("Remove constant name", { name_set <- c("aaaxyzbbb", "aaaklbbb", "aaaxyzbbb") truncated_name_set <- c("xyz", "kl", "xyz") - constant_names <- calicomics::remove_constant_name(name_set) + constant_names <- claman::remove_constant_name(name_set) expect_equal(constant_names, truncated_name_set) }) + +test_that("Test batch centering", { + + centered_batches <- normalize_peaks( + claman::nplug_mzroll_augmented, + "center batches", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = "month", + centering_fxn = median + ) %>% + romic::triple_to_tidy() + + differences <- centered_batches$data %>% + dplyr::group_by(groupId, month) %>% + dplyr::summarize( + group_median = median(normalized_log2_abundance, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::group_by(groupId) %>% + dplyr::summarize(diff = diff(range(group_median)), .groups = "drop") %>% + {.$diff} + + expect_equal(differences, rep(0, length(differences))) +}) + +test_that("Test reference samples and reference conditions", { + + reference_condition_normalized <- claman::normalize_peaks( + claman::nplug_mzroll_augmented, + normalization_method = "reference condition", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + condition_varname = "condition", + reference_varname = "reference" + ) + + reference_sample_normalized <- claman::normalize_peaks( + claman::nplug_mzroll_augmented, + normalization_method = "reference sample", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = c("month", "extraction"), + reference_varname = "exp_ref", + reference_values = "ref" + ) + + # reference condition and reference samples should just be off by + # an intercept + + compare_normalizations <- reference_condition_normalized$measurements %>% + dplyr::select( + groupId, + sampleId, + condition_normalized = normalized_log2_abundance + ) %>% + dplyr::left_join( + reference_sample_normalized$measurements %>% + dplyr::select( + groupId, + sampleId, + sample_normalized = normalized_log2_abundance, + ), + by = c("groupId", "sampleId") + ) %>% + dplyr::mutate(diff = sample_normalized - condition_normalized) + + expect_equal( + compare_normalizations$diff, + rep(0, nrow(compare_normalizations)) + ) + }) + +test_that("Flooring works and is maintained", { + + floored_peaks <- claman::floor_peaks(nplug_mzroll_augmented, 12) + + expect_equal( + floored_peaks$measurements$log2_abundance >= 12, + rep(TRUE, nrow(floored_peaks$measurements)) + ) + + median_polished <- claman::normalize_peaks( + claman::nplug_mzroll_augmented, + "median polish", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + log2_floor_value = 12 + ) + + expect_equal( + median_polished$measurements$normalized_log2_abundance >= 12, + rep(TRUE, nrow(median_polished$measurements)) + ) + + reference_condition_normalized <- claman::normalize_peaks( + claman::nplug_mzroll_augmented, + normalization_method = "reference condition", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + condition_varname = "condition", + reference_varname = "reference", + log2_floor_value = 12 + ) + + # is normalization actually doing something + expect_false(isTRUE(all.equal( + reference_condition_normalized$measurements$log2_abundance, + reference_condition_normalized$measurements$normalized_log2_abundance + ))) + + # is flooring maintained + expect_equal( + reference_condition_normalized$measurements$normalized_log2_abundance >= 12, + rep(TRUE, nrow(reference_condition_normalized$measurements)) + ) + + reference_sample_normalized <- claman::normalize_peaks( + claman::nplug_mzroll_augmented, + normalization_method = "reference sample", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = c("month", "extraction"), + reference_varname = "exp_ref", + reference_values = "ref", + log2_floor_value = 12 + ) + + # is normalization doing something + expect_false(isTRUE(all.equal( + reference_condition_normalized$measurements$log2_abundance, + reference_condition_normalized$measurements$normalized_log2_abundance + ))) + + # is flooring maintained + expect_equal( + reference_sample_normalized$measurements$normalized_log2_abundance >= 12, + rep(TRUE, nrow(reference_sample_normalized$measurements)) + ) + + reference_sample_nofloor <- claman::normalize_peaks( + claman::nplug_mzroll_augmented, + normalization_method = "reference sample", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = c("month", "extraction"), + reference_varname = "exp_ref", + reference_values = "ref" + ) + # this should be identical reference_sample_normalized aside from + # group-level intercept + + inconsistencies <- reference_sample_normalized$measurements %>% + dplyr::select( + groupId, + sampleId, + abund_unfloored = normalized_log2_abundance + ) %>% + dplyr::left_join( + reference_sample_nofloor$measurements %>% + dplyr::select( + groupId, + sampleId, + abund_floored = normalized_log2_abundance + ), + by = c("groupId", "sampleId") + ) %>% + dplyr::mutate(diff = abund_unfloored - abund_floored) %>% + dplyr::group_by(groupId) %>% + dplyr::filter(all(abund_unfloored >= 12.5)) %>% + dplyr::summarize(val = diff(range(diff))) + + expect_equal( + inconsistencies$val, + rep(0, nrow(inconsistencies)) + ) +}) \ No newline at end of file diff --git a/vignettes/NPLUG.Rmd b/vignettes/NPLUG.Rmd new file mode 100644 index 0000000..8822117 --- /dev/null +++ b/vignettes/NPLUG.Rmd @@ -0,0 +1,428 @@ +--- +title: "NPLUG" +output: + html_document: + theme: simplex + toc: true + toc_depth: 2 + toc_float: true +vignette: > + %\VignetteIndexEntry{NPLUG} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.width = 10, + fig.height = 10 +) +``` + +The NPLUG (**N**itrogen, **P**hosphorous, **L**eucine, **U**racil, or **G**lucose) experiment of [Boer et al. 2010](https://pubmed.ncbi.nlm.nih.gov/19889834/) explored the impacts of nutrient limitation and growth rate on the yeast' metabolome. + +Growth conditions in chemostat culture (indefinite exponential growth) followed a full factorial experimental design, such that the 25 primary experimental conditions contained all pairs of: + +- 5 limiting nutrients: nitrogen, phosphorous, and glucose, as well as leucine and uracil in auxotrophs. +- 5 dilution rates (growth rate / ln(2)): from 0.05-0.3 hrs^-1^. + +In total, 136 biological samples were collected. Two extraction methods were used on each culture (pellet- or filter-based) and each extraction method had two technical replicates. In addition, each batch of experimental cultures was paired with a phosphate-limited slow growth (DR = 0.05 hrs^-1^) reference culture to correct for month-to-month variability. + +This study demonstrates the nature of nutrient limitation massively impacts the metabolome, with some growth-limiting metabolites becoming greatly depleted and other metabolites appearing to overflow. This vignette reproduces some of the major findings of this study using the standard functionality built into **claman**. + +*** + +# Data Import + +claman is meant to nicely interface with .mzrollDB SQLite databases produced by [MAVEN](https://github.com/eugenemel/maven), the Metabolomics Analysis and Visualization ENgine. Accordingly, we can start with a .mzrollDB file containing a stripped-down version of the NPLUG data, which bas been bundled with the claman package. + +```{r data_loading, message = FALSE} +library(claman) + +library(dplyr) +library(ggplot2) + +mzroll_db_path <- nplug_mzroll() +``` + +This database contains tables of peakgroups (ions with characteristic m/z and rt), samples, and peaks (measurements of peakgroups in each sample). It does not include meta-data about metabolites (such as metabolites' pathways), nor samples (such as the experimental design), so this information will be added later. + +## Read .mzrollDB + +```{r read_mzroll} +mzroll_list <- process_mzroll(mzroll_db_path) +``` + +```{r read_mzrolls_tables, echo = FALSE} +util_pretty_khead(mzroll_list$features, caption = "Features table") +util_pretty_khead(mzroll_list$samples, caption = "Samples table") +util_pretty_khead(mzroll_list$measurements, caption = "Measurements table") +``` + +## Add sample meta-data + +```{r read_sample_metadata} +mzroll_list_augmented <- merge_samples_tbl(mzroll_list, nplug_samples, "sample_name") +``` + +```{r read_sample_metadata_tables, echo = FALSE} +util_pretty_khead(mzroll_list_augmented$samples, 5, caption = "updated mzroll samples table") +``` + +## Add compounds meta-data + +```{r read_compounds_metadata} +mzroll_list_augmented <- merge_compounds_tbl(mzroll_list_augmented, nplug_compounds) +``` + +```{r read_compounds_metadata_tables, echo = FALSE} +util_pretty_khead(mzroll_list_augmented$features, 5, caption = "updated mzroll features table") +``` + + +# Preliminary Analysis + +Having loaded the dataset, we can take a quick look at the data using heatmaps. + +## Limiting Nutrients and Dilution Rate + +The biological variables in the experimental design are limiting nutrient and dilution rate. To look at their influence in shaping the metabolome, we can separate limiting nutrients using facets and order samples based on dilution rate (this will happen naturally if we order alphabetically by sample name). + +To make plots like this we can take advantage of the fact that mzroll_lists build on top of the [**romic**](http://www.github.com/calico/romic) *triple_omic* classes. + +```{r bio_heatmap} +romic::plot_heatmap( + mzroll_list_augmented, + feature_var = "compoundName", + sample_var = "name", + value_var = "centered_log2_abundance", + change_threshold = 5 + ) + facet_grid(~ exp_ref + limitation, scales = "free_x", space = "free_x") + + ggtitle("Metabolites separated by limiting nutrients") +``` + +From this plot, its clear that limiting nutrients have a strong role in shaping the metabolome, while dilution rate creates relatively smooth transitions as each limitation progresses from slow growth (left) to fast growth (right). + +## Extraction Method and Batch + +The technical variables in the experimental design are extraction method, month (which is likely associated with batch effects), and replication. Like the heatmap of limitation above, we can look at how extraction method and month shape the metabolome with heatmaps. + +Since limiting nutrient has a major effect, we'll want to account for it when looking extraction method's impact on metabolome variability. + +```{r extraction_heatmap} +romic::plot_heatmap( + mzroll_list_augmented, + feature_var = "compoundName", + sample_var = "name", + value_var = "centered_log2_abundance", + change_threshold = 5 + ) + facet_grid(~ limitation + extraction, scales = "free_x", space = "free_x") + + ggtitle("Metabolites separated by extraction") + + theme(strip.text = element_text(size = 10)) +``` + +From this plot, there are clear, but often relatively minor effects of extraction method on metabolite abundances. Large differences in extraction efficiency are generally seen for nucleotides, where using a filter-based extraction results in greater signal than using pellets. + +To look at month-effects, we can take advantage of the fact that in each month two biological replicate reference cultures were generated. These two cultures can be distinguished in the sample meta-data most easily by subtle differences in their dilution rate. + +```{r month_heatmap} +mzroll_list_augmented %>% + # only look at reference samples + romic::filter_tomic( + filter_type = "category", + filter_table = "samples", + filter_variable = "exp_ref", + filter_value = "ref" + ) %>% + romic::plot_heatmap( + feature_var = "compoundName", + sample_var = "name", + value_var = "centered_log2_abundance", + change_threshold = 5 + ) + facet_grid(~ month + DR, scales = "free_x", space = "free_x") + + ggtitle("References separated by month and distinct cultures (by DR)") + + theme(strip.text = element_text(size = 10)) +``` +From this plot, the biological replicate reference cultures are nearly identical (that's one reason why chemostats are awesome!) while there are subtle differences between months. + +# Normalization + +From the preliminary analysis, when normalizing this dataset to remove sources of technical variability (and thereby clarify the biological signal), we'll want to account of month-to-month variability, extraction methodology, and technical replication. + +To tackle these obstacles we will: + +1. compare and then collapse technical replicates. +2. contrast experimental conditions with reference conditions to account for month-to-month variability. +3. determine whether one extraction method is superior or whether both can be used for downstream analysis with appropriate normalization. + +## Collapsing Technical Replicates + +```{r technical_replicates} +plot_compare_injection( + mzroll_list_augmented, + grouping_vars = "condition", + peak_quant_var = "centered_log2_abundance" + ) + ggtitle("Comparison of injection technical replicates") + +mzroll_list_distinct_conditions <- collapse_injections( + mzroll_list_augmented, + grouping_vars = "condition", + peak_quant_vars = c("log2_abundance", "centered_log2_abundance"), + collapse_fxn = "mean" + ) +``` + +```{r technical_replicates_table, echo = FALSE} +util_pretty_khead( + mzroll_list_distinct_conditions$samples, + caption = "samples aggregated over injections" +) +``` + +## Contrasting to a Reference Condition + +Since experimental cultures of the same limiting nutrient were generated simultaneously, limitation and the month-of-generation are confounded. Without correcting this confounding, some of the month effects would be interpreted as limitation-specific signals. To correct for this month-to-month variability in instrument performance, each experimental sample can be compared to common reference conditions. Since these cultures possess very similar biological signals, but should share the performance biases, contrasting each biological culture with a month-matched reference should remove this shared bias. In a similar fashion we can evaluate whether differences due to extraction methodology are resolved when we are comparing an experimental condition to its extraction-matched, month-matched reference. + +```{r normalization} +mzroll_list_normalized <- normalize_peaks( + mzroll_list_distinct_conditions, + normalization_method = "reference sample", + quant_peak_varname = "log2_abundance", + norm_peak_varname = "normalized_log2_abundance", + batch_varnames = c("month", "extraction"), + reference_varname = "exp_ref", + reference_values = "ref" + ) %>% + # having normalized by the common reference, we can re-center the data + # since slow-phosphate limited growth is not a biological reference. + romic::center_tomic(measurement_vars = "normalized_log2_abundance") +``` + +Have normalized for month and extraction method, we can look at the results as another heatmap. + +```{r post_normalized_heatmap} +romic::plot_heatmap( + mzroll_list_normalized, + feature_var = "compoundName", + sample_var = "name", + value_var = "normalized_log2_abundance", + change_threshold = 5, + cluster_dim = "rows" + ) + facet_grid(~ exp_ref + limitation, scales = "free_x", space = "free_x") + + ggtitle("Post-normalization metabolite abundances") + + theme(strip.text = element_text(size = 10)) +``` + +From this visualization, we can note that the reference chemostats are uniform for a given peakgroup (these were set to zero during normalization and then offset during re-centering). There does seem to be an issue with the attempted extraction normalization though; having compared each experimental sample to its extraction method matched reference, meaningful differences relative abundance associated with extraction remain. + +## Accounting for Extraction Method + +While we could try to make use of data from both extraction methods, as they did in the original paper, due to the inconsistent extraction efficiency, I think its better to just retain the higher-signal filter-extraction samples. At this point we can also discard the reference samples since they're no longer needed. + +```{r extraction_correction} +final_processed_data <- mzroll_list_normalized %>% + # retain just experimental samples + romic::filter_tomic( + filter_type = "category", + filter_table = "samples", + filter_variable = "exp_ref", + filter_value = "exp" + ) %>% + # retain only filter extraction + romic::filter_tomic( + filter_type = "category", + filter_table = "samples", + filter_variable = "extraction", + filter_value = "filter" + ) + +# clean-up sample data +renamed_samples <- final_processed_data$samples %>% + select(sampleId, limitation, DR) %>% + mutate(name = glue::glue( + "{stringr::str_sub(limitation, 1, 1)}{round(DR,2)}" + )) %>% + group_by(name) %>% + mutate( + name = case_when(n() == 1 ~ name, + TRUE ~ paste0(name, "-", 1:n())) + ) %>% + ungroup() + +final_processed_data <- romic::update_tomic( + final_processed_data, + renamed_samples + ) + +romic::plot_heatmap( + final_processed_data, + feature_var = "compoundName", + sample_var = "name", + value_var = "normalized_log2_abundance", + change_threshold = 5, + cluster_dim = "rows" + ) + facet_grid(~ limitation, scales = "free_x", space = "free_x") + + ggtitle("Final processed metabolite abundances across 25 growth conditions") + + theme(strip.text = element_text(size = 10)) +``` + +# Differential Abundance Testing + +Having generated a processed dataset, we can already see some patterns clearly by looking at the heatmap, such as bottlenecking of pyrimidine biosynthesis during uracil limitation, amino acid accumulation during leucine limitation, and TCA up-regulation during ammonium limitation. Its also apparent that many of the strongest metabolomic alterations are observed at slow growth rates, while these there are relatively smooth transitions back to a normalish-metabolome as cell growth more quickly. To quantitatively summarize trends like these, its helpful to carryout differential abundance testing. But, before we proceed we should determine the type of model we need to fit by interpreting the major sources of variation in our dataset in light of the experimental design. + +## Exploratory Data Analysis (EDA) + +Our analysis thus far has already corrected for major sources of variation in this original dataset, such as month-to-month variation and differences in extraction methodology. If we hadn't controlled for those factors by this point, we would want to create visualizations which appropriately identify major unaccounted for sources of variation. Similarly, this will help us to determine how we should represent our experimental design as an appropriate hypothesis test. + +Two visualizations that are particularly useful for EDA are heatmaps (I'm sure you've seen enough of those), and principal components plots which overlay elements of the experimental design on the leading principal components. + +```{r static_eda} +samples_with_pcs <- final_processed_data %>% + romic::add_pca_loadings(value_var = "normalized_log2_abundance", npcs = 5) + +romic::plot_bivariate( + samples_with_pcs$samples, + "PC1", + "PC2", + color_var = "limitation" + ) + ggtitle("Top principal components driving metabolomic variation") +``` + +One plot doesn't quite cut it here. Luckily, romic has some powerful methods for shiny-based interactive analysis that we can leverage here to quickly make plots like these. + +```{r eda, eval = FALSE} +romic::app_flow(samples_with_pcs) + +romic::app_heatmap(final_processed_data %>% romic::triple_to_tidy()) +``` + +## Hypothesis Testing + +From the results thus far, we've seen that limitation greatly impacts the metabolome, as does growth rate which tends to have a limitation-specific trend. Whatever model we fit should account for these features of the data. The actual model we fit will depend on the question we want to ask of the data. For example, we could be interested in just finding metabolites impacted by nutrient, or those that are perturbed by a specific nutrient. To keep things manageable, I'll explore how nitrogen-limitation alters the metatbolome. + +To ask this, we can formulate a linear regression of the form: + +$$ +y^{\text{normalized}} \sim \text{lim} + \text{lim} \cdot \text{DR} +$$ + +```{r hypothesis_testing} +regression_significance <- diffex_mzroll( + final_processed_data, + "normalized_log2_abundance", + "limitation + limitation:DR + 0" + ) + +plot_pvalues(regression_significance) + +plot_volcano(regression_significance) +``` + +## Plotting Individual Metabolites + +```{r met_examples, fig.width = 6, fig.height = 6} +n_lim_signif <- regression_significance %>% + filter(term == "limitationNH4") %>% + arrange(qvalue) %>% + left_join( + final_processed_data$features %>% + select(groupId, peak_label), + by = "groupId") +``` + +```{r met_examples_table, echo = FALSE} +util_pretty_khead( + n_lim_signif, + caption = "Metabolites that most perturbed by nitrogen limitation" +) +``` + +```{r met_examples_barplot} +plot_barplot( + mzroll_list = final_processed_data, + groupIds = n_lim_signif$groupId[1:4], + grouping_vars = c("limitation", "DR"), + value_var = "normalized_log2_abundance", + fill_var = "limitation" + ) + scale_fill_brewer(palette = "Set2") +``` + +## Pathway Analysis + +```{r pathway_enrichment} +pathway_nest <- final_processed_data$features %>% + dplyr::select(groupId, pathway) %>% + tidyr::nest(pathway_members = groupId) + +pathway_list <- purrr::map( + pathway_nest$pathway_members, + function(x) { + as.character(x$groupId) + } + ) +names(pathway_list) <- pathway_nest$pathway + +enrichments <- find_pathway_enrichments( + final_processed_data, + regression_significance, + pathway_list, + test_absolute_effects = FALSE + ) +``` + +### Generate tables and plots of top enrichments + +```{r fgsea_table, echo=FALSE} +enrichments$enrichment_table %>% + arrange(padj) %>% + select(-pathway_members, -enrichment_plot) %>% + util_pretty_khead(caption = "Top pathway enrichments across all regression terms") +``` + +```{r top_enrichments} +plot(enrichments$enrichment_plots[["limitationNH4"]]) + +# see specific pathway enrichments +ranked_nitrogen_enrichments <- enrichments$enrichment_table %>% + arrange(padj) %>% + filter(term == "limitationNH4") + +ranked_nitrogen_enrichments$enrichment_plot[[1]] + + ggtitle( + "Amino acids are depleted during nitrogen limitation", + "Note that the x-axis is ranks, and negative values have higher ranks" + ) +``` + +# Export results + +While self-contained reproducible notebooks like this vignette are an ideal way to share results with collaborators, normalized data and analysis artifacts can easily be exported as well. This is done using romic functions: *export_tomic_as_tidy*, *export_tomic_as_triple*, and *export_tomic_as_wide*. These three exports write the same comprehensive summaries of an mzroll to disk just in different formats. Before exporting, we may also want to include additional feature and sample attributes that have not been directly incorporated into the mzroll_list such as summaries of differential expression. + +```{r add_diffex} +wide_stats <- regression_significance %>% + select(groupId, term, diffex_label) %>% + tidyr::spread(term, diffex_label) + +final_processed_data <- romic::update_tomic( + final_processed_data, + final_processed_data$features %>% + left_join(wide_stats, by = "groupId") +) +``` + +```{r add_diffex_table, echo = FALSE} +util_pretty_khead( + final_processed_data$features, + caption = "Peakgroups with differential abundance statistics added" + ) +``` + +```{r export_results, eval = FALSE} +romic::export_tomic_as_tidy( + final_processed_data, + dir_path = "/tmp", + name_preamble = "nplug" + ) +``` diff --git a/vignettes/metabolomics_example.Rmd b/vignettes/metabolomics_example.Rmd deleted file mode 100644 index 6ac8bde..0000000 --- a/vignettes/metabolomics_example.Rmd +++ /dev/null @@ -1,176 +0,0 @@ ---- -title: "Metabolomics Example" -author: "Sean Hackett" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Metabolomics Example} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - width = 80 -) - -knitr::opts_chunk$set(eval = FALSE) -``` - -## One time configuration - -- run configure_google_access() interactively -- add db_password and GITHUB_PAT to .Renviron - -## Pre-processing - -```{r imports, message = FALSE, warning = FALSE} -library(authutils) -library(calicomics) -library(dplyr) - -# this will configure access to googledrive and googlesheets (initially run interactively, the browser may open so you can generate a code to put in the Rstudio console) -authutils::configure_google_access() -# connect to standards and systematic databases -standard_databases <- configure_db_access() -``` - -# Workflows - -## Simple - one method, no sample sheet - -```{r inputs_single, message = FALSE, fig.height = 15, fig.width = 10} -mzroll_db_path <- authutils::get_clamr_assets("X0083-M001A.mzrollDB") -# For your data, directly add the path, like this: -# mzroll_db_path <- "/peakDetector.mzrollDB" - -# read and format data -mzroll_list <- process_mzroll(mzroll_db_path, - standard_databases = standard_databases, - method_tag = "M001A", - only_identified = TRUE) - -mzroll_list$samples <- mzroll_list$samples %>% - dplyr::mutate(sampleName = calicomics::remove_constant_name(name)) - -plot_heatmap(mzroll_list, sample.var = "sampleName", feature.var = "peak_label", change_threshold = 5) -``` - -## Sinister - multiple methods, Googlesheets sample sheet & multiple outputs - -### Setup mzroll_list - -```{r sample_sheets, message = FALSE} -# read sample meta-data -sample_sheet_list <- import_sample_sheet(pattern = "X0106") -``` - -```{r inputs_multi, message = FALSE, warning = FALSE} -# download example mzrolls from Google Drive -mzroll_db_dir_path <- authutils::get_clamr_assets("X0106_mzrollDBs.zip") -# paths to /tmp folder are saved in calicomics -mzroll_paths <- calicomics::X0106_paths - -mzroll_list <- process_mzroll_multi(mzroll_paths, - standard_databases = standard_databases, - sample_sheet_list, - only_identified = TRUE) -``` - -### Processing - -- filter to peakgroups of interest - -```{r filter} -mzroll_list <- mzroll_list %>% - calicomics::filter_peakgroups_quo(quo(searchTableName %in% c("clamDB", "Bookmarks"))) -``` - -- floor data (if desired) and/or subtract blanks - -```{r} -mzroll_list <- floor_peaks(mzroll_list, 12) -``` - -- aggregating across multiple methods -- interpretting experimental design from sample sheet -- sample_sheet_list - -### Exploratory Analysis - -```{r exploratory_analysis, fig.height = 15, fig.width = 10} -plot_heatmap(mzroll_list, sample.var = "sampleId", feature.var = "groupId", change_threshold = 5, plot_type = "grob") -``` - -### Statistical Analysis - -- Using sample sheet to look for interesting metabolite variation - -#### Regression - -```{r regression, fig.height = 10, fig.width = 10} -# generating regression_significance requires installation of the suggested package qvalue, which can be installed by executing the command: -# -# > install.packages("remotes) -# > remotes::install_bioc("qvalue") -mzroll_list$samples <- mzroll_list$samples %>% - dplyr::mutate(treatment = factor(treatment, levels = c("vehicle", "4916")), - age = factor(age, levels = c("young", "old"))) - -regression_significance <- diffex_mzroll(mzroll_list, value.var = "centered_log2_abundance", test_model = "age * treatment", null_model = NULL) - -volcano_plot(regression_significance, max_p_trans = 10, FDR_cutoff = 0.1) - -pvalue_histograms(regression_significance) -``` - -#### ANOVA - -```{r anova, fig.height = 10, fig.width = 10, warning = FALSE} -anova_significance <- diffex_mzroll(mzroll_list, value.var = "centered_log2_abundance", test_model = "age * treatment", null_model = "age + treatment") - -pvalue_histograms(anova_significance) -``` - -### Summary of Results - -```{r barplots, fig.height = 10, fig.width = 10} -regression_significance %>% - arrange(qvalue) %>% - slice(1:6) %>% - {.$groupId} %>% - plot_barplot(mzroll_list = mzroll_list) -``` - -#### Pathway Enrichments - -For each term in the regression/ANOVA analysis, find pathways which are enriched among elevated or depleted metabolites. Currently only KEGG pathways are supported. - -```{r pathway_enrichments} -enriched_pathways <- pathway_enrichments(mzroll_list, - significance = regression_significance %>% - dplyr::filter(term != "(Intercept)"), - standard_databases, - test_absolute_effects = FALSE) - -# generate a table of top enrichments - -enriched_pathway_table <- enriched_pathways$enrichment_table %>% - dplyr::arrange(padj) - -enriched_pathway_table %>% - dplyr::select(term, pathway, pval, padj) %>% - DT::datatable() - -# see a high-level summary of significant pathways for a given term - -plot(enriched_pathways$enrichment_plots$`ageold:treatment4916`) - -# see specific pathway enrichments - -plot(enriched_pathway_table$enrichment_plot[[1]]) + ggtitle("Bile secretion metabolites decreased in old treated animals") -``` - -