From 5661eca4e63d9b9ab27724d07543e102ccc61817 Mon Sep 17 00:00:00 2001
From: Johanna Fleischman <71276903+johanna0321@users.noreply.github.com>
Date: Tue, 14 May 2024 10:38:12 -0700
Subject: [PATCH] Add zero centering and lm drift normalization functions

---
 DESCRIPTION                    |  2 +-
 R/mutate_mzroll_list.R         | 98 +++++++++++++++++++++++++++++++++-
 man/calicomics.Rd              | 17 ++++++
 man/normalize_peaks.Rd         | 18 +++++++
 man/nplug_compounds.Rd         |  2 +-
 man/nplug_mzroll_augmented.Rd  |  2 +-
 man/nplug_mzroll_normalized.Rd |  2 +-
 man/nplug_samples.Rd           |  4 +-
 8 files changed, 138 insertions(+), 7 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1ae5581..1aef0e8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -66,5 +66,5 @@ Suggests: dbplyr,
     testthat
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 VignetteBuilder: knitr
diff --git a/R/mutate_mzroll_list.R b/R/mutate_mzroll_list.R
index a066244..6fb90f2 100644
--- a/R/mutate_mzroll_list.R
+++ b/R/mutate_mzroll_list.R
@@ -183,12 +183,16 @@ fill_in_missing_peaks <- function(mzroll_list,
 #'     value
 #'     }
 #'   \item{\code{center batches}: batch centering}
+#'   \item{\code{center}: zero-center each compound}
 #'   \item{\code{reference sample}: compare to reference sample}
 #'   \item{\code{reference condition}: compare each sample to its specified
 #'     reference sample}
 #'   \item{\code{loess}: weighted smoothing of IC over time (adds
 #'     \code{.loess_fit} as a peaks variable in addition to
 #'     \code{norm_peak_varname})}
+#'   \item{\code{lm}: linear regression smoothing of IC over time (adds
+#'     \code{lm_estimate} as feature variable in addition to
+#'     \code{norm_peak_varname})}
 #'   }
 #' @param quant_peak_varname variable in measurements to use for abundance
 #' @param norm_peak_varname variable in measurements to add for normalized
@@ -269,7 +273,9 @@ normalize_peaks <- function(mzroll_list,
     "center batches", "normalize_peaks_batch_center",
     "reference sample", "normalize_peaks_reference_sample",
     "reference condition", "normalize_peaks_reference_condition",
-    "loess", "normalize_peaks_loess"
+    "loess", "normalize_peaks_loess",
+    "lm", "normalize_peaks_lm",
+    "center","normalize_peaks_center"
   )
   
   checkmate::assertChoice(
@@ -901,4 +907,94 @@ normalization_refloor <- function(normalized_peaks,
   }
   
   return(normalized_peaks)
+}
+
+#' @inheritParams normalize_peaks_batch_center
+#' @param time_col_varname variable in samples table to use for linear 
+#' correction
+#'
+#' @rdname normalize_peaks
+normalize_peaks_lm <- function (mzroll_list,
+                                quant_peak_varname,
+                                norm_peak_varname, 
+                                time_col_varname) 
+{
+  stopifnot(time_col_varname %in% colnames(mzroll_list$samples))
+  
+  time_col_varname_add <- mzroll_list$samples %>%
+    dplyr::select(c("sampleId", time_col_varname))
+  
+  lm_fit <- mzroll_list$measurements %>% 
+    dplyr::left_join(., time_col_varname_add, by = "sampleId") %>%
+    tidyr::nest(groupData = -groupId) %>% 
+    dplyr::mutate(lm_fits = purrr::map(groupData,
+                                       fit_lm, 
+                                       quant_peak_varname = quant_peak_varname, 
+                                       norm_peak_varname = norm_peak_varname,
+                                       time_col_varname = time_col_varname)) %>% 
+    dplyr::select(-groupData) %>% 
+    tidyr::unnest(lm_fits)
+  
+  lm_fit_measurements <- lm_fit %>% 
+    dplyr::select(!!!rlang::syms(c(colnames(mzroll_list$measurements), norm_peak_varname)))
+  
+  lm_fit_features <- lm_fit %>%
+    dplyr::select(groupId, lm_estimate) %>%
+    unique() %>%
+    dplyr::left_join(mzroll_list$features, ., by = "groupId")
+  
+  mzroll_list <- romic::update_tomic(mzroll_list, lm_fit_measurements)
+  mzroll_list <- romic::update_tomic(mzroll_list, lm_fit_features)
+  
+  return(mzroll_list)
+  
+}
+
+fit_lm <- function (groupData, 
+                    time_col_varname,
+                    quant_peak_varname, 
+                    norm_peak_varname,
+                    order = 1) 
+{
+  
+  lm_data <- groupData %>%
+    dplyr::mutate(val_var = !!rlang::sym(quant_peak_varname),
+                  dri_var = !!rlang::sym(time_col_varname)) 
+  
+  lm_predict <- lm_data %>%
+    tidyr::drop_na(val_var)
+  
+  # Run model
+  lm_model <- stats::lm(val_var ~ poly(dri_var, degree = order), data = lm_predict)
+  
+  # Compute corrected values
+  lm_apply <- lm_data %>%
+    dplyr::mutate(median_value = median(.data$val_var, na.rm = T)) %>%
+    dplyr::mutate(`:=`(!!rlang::sym(norm_peak_varname), 
+                       .data$val_var - .env$predict(lm_model, newdata = lm_data) + .data$median_value)) %>%
+                    dplyr::mutate(lm_estimate = summary(.env$lm_model)$coefficient[2,1]) %>%
+                    dplyr::select(-c(val_var, dri_var, median_value))
+                  
+  return(lm_apply)
+  
+}
+
+
+#' @inheritParams normalize_peaks_center
+#'
+#' @rdname normalize_peaks
+normalize_peaks_center <- function(mzroll_list,
+                                   quant_peak_varname,
+                                   norm_peak_varname) 
+{
+  
+  updated_measurements <- mzroll_list$measurements %>%
+    dplyr::group_by(groupId) %>%
+    dplyr::mutate(!!rlang::sym(norm_peak_varname) :=
+        scale(!!rlang::sym(quant_peak_varname), scale = F, center = T))
+  
+  mzroll_list <- romic::update_tomic(mzroll_list, updated_measurements)
+  
+  return(mzroll_list)
+  
 }
\ No newline at end of file
diff --git a/man/calicomics.Rd b/man/calicomics.Rd
index 229a27a..8b98604 100644
--- a/man/calicomics.Rd
+++ b/man/calicomics.Rd
@@ -2,6 +2,8 @@
 % Please edit documentation in R/claman.R
 \docType{package}
 \name{calicomics}
+\alias{claman}
+\alias{claman-package}
 \alias{calicomics}
 \title{\code{claman} package}
 \description{
@@ -11,3 +13,18 @@ can then be modified through normalization, signal flooring, and filtering.
 Differential abundance analysis of metabolites can then be performed along
 with a variety of visualizations.
 }
+\author{
+\strong{Maintainer}: Sean Hackett \email{sean@calicolabs.com} (\href{https://orcid.org/0000-0002-9553-4341}{ORCID})
+
+Authors:
+\itemize{
+  \item Phil Seitzer \email{phillipseitzer@calicolabs.com}
+  \item Alireza Delfarah \email{delfarah@calicolabs.com}
+}
+
+Other contributors:
+\itemize{
+  \item Calico Life Sciences LLC [copyright holder, funder]
+}
+
+}
diff --git a/man/normalize_peaks.Rd b/man/normalize_peaks.Rd
index d5fb822..6d13c1b 100644
--- a/man/normalize_peaks.Rd
+++ b/man/normalize_peaks.Rd
@@ -8,6 +8,8 @@
 \alias{normalize_peaks_reference_sample}
 \alias{normalize_peaks_reference_condition}
 \alias{normalize_peaks_loess}
+\alias{normalize_peaks_lm}
+\alias{normalize_peaks_center}
 \title{Normalize Peaks}
 \usage{
 normalize_peaks(
@@ -68,6 +70,15 @@ normalize_peaks_loess(
   weights_tribble = NULL,
   log2_floor_value = 12
 )
+
+normalize_peaks_lm(
+  mzroll_list,
+  quant_peak_varname,
+  norm_peak_varname,
+  time_col_varname
+)
+
+normalize_peaks_center(mzroll_list, quant_peak_varname, norm_peak_varname)
 }
 \arguments{
 \item{mzroll_list}{output of \link{process_mzroll} or
@@ -87,12 +98,16 @@ normalize_peaks_loess(
     value
     }
   \item{\code{center batches}: batch centering}
+  \item{\code{center}: zero-center each compound}
   \item{\code{reference sample}: compare to reference sample}
   \item{\code{reference condition}: compare each sample to its specified
     reference sample}
   \item{\code{loess}: weighted smoothing of IC over time (adds
     \code{.loess_fit} as a peaks variable in addition to
     \code{norm_peak_varname})}
+  \item{\code{lm}: linear regression smoothing of IC over time (adds
+    \code{lm_estimate} as feature variable in addition to
+    \code{norm_peak_varname})}
   }}
 
 \item{quant_peak_varname}{variable in measurements to use for abundance}
@@ -121,6 +136,9 @@ reference samples}
 
 \item{weights_tribble}{a table containing weights and sample variables to
 match them to.}
+
+\item{time_col_varname}{variable in samples table to use for linear 
+correction}
 }
 \value{
 a \code{mzroll_list} as generated by \code{\link{process_mzroll}}
diff --git a/man/nplug_compounds.Rd b/man/nplug_compounds.Rd
index b1e7434..aad9a39 100644
--- a/man/nplug_compounds.Rd
+++ b/man/nplug_compounds.Rd
@@ -21,9 +21,9 @@ A table of meta-data for all NPLUG features with variables:
 }
 \seealso{
 Other nplug: 
+\code{\link{nplug_mzroll}()},
 \code{\link{nplug_mzroll_augmented}},
 \code{\link{nplug_mzroll_normalized}},
-\code{\link{nplug_mzroll}()},
 \code{\link{nplug_samples}}
 }
 \concept{nplug}
diff --git a/man/nplug_mzroll_augmented.Rd b/man/nplug_mzroll_augmented.Rd
index ab63202..e8b9f64 100644
--- a/man/nplug_mzroll_augmented.Rd
+++ b/man/nplug_mzroll_augmented.Rd
@@ -17,8 +17,8 @@ nplug_mzroll_augmented
 \seealso{
 Other nplug: 
 \code{\link{nplug_compounds}},
-\code{\link{nplug_mzroll_normalized}},
 \code{\link{nplug_mzroll}()},
+\code{\link{nplug_mzroll_normalized}},
 \code{\link{nplug_samples}}
 }
 \concept{nplug}
diff --git a/man/nplug_mzroll_normalized.Rd b/man/nplug_mzroll_normalized.Rd
index a2e32f5..574bea8 100644
--- a/man/nplug_mzroll_normalized.Rd
+++ b/man/nplug_mzroll_normalized.Rd
@@ -20,8 +20,8 @@ nplug_mzroll_normalized
 \seealso{
 Other nplug: 
 \code{\link{nplug_compounds}},
-\code{\link{nplug_mzroll_augmented}},
 \code{\link{nplug_mzroll}()},
+\code{\link{nplug_mzroll_augmented}},
 \code{\link{nplug_samples}}
 }
 \concept{nplug}
diff --git a/man/nplug_samples.Rd b/man/nplug_samples.Rd
index 620afc6..494ec45 100644
--- a/man/nplug_samples.Rd
+++ b/man/nplug_samples.Rd
@@ -39,9 +39,9 @@ A table of meta-data for all NPLUG samples with variables:
 \seealso{
 Other nplug: 
 \code{\link{nplug_compounds}},
+\code{\link{nplug_mzroll}()},
 \code{\link{nplug_mzroll_augmented}},
-\code{\link{nplug_mzroll_normalized}},
-\code{\link{nplug_mzroll}()}
+\code{\link{nplug_mzroll_normalized}}
 }
 \concept{nplug}
 \keyword{datasets}