.Rhistory

control = d$sample_meta$is_control,
group = d$sample_meta$group,
Timepoint = as.factor(d$sample_meta$timepoint),
sex = str_to_title(d$sample_meta$sex),
Phase = str_to_title(d$sample_meta$phase),
stringsAsFactors = F)
# Pull the names of exercise groups that are present in both Pass1a and 1c
comb_names = as.data.frame.matrix(table(plot_df$group,plot_df$Phase)) %>%
filter_all(all_vars(. != 0)) %>% rownames(.)
plot_df <- plot_df %>% filter(group %in% comb_names)
if(!("Pass1c" %in% plot_df$Phase)){next}
p = ggplot(plot_df) +
geom_point(aes(x = PC1, y = PC2, color = group, shape = Phase)) +
xlab(paste0("PC1 (",round(explained_var["PC1"]*100, 2), " %)")) +
ylab(paste0("PC2 (",round(explained_var["PC2"]*100, 2), " %)")) +
facet_wrap(~ interaction(sex)) +
scale_colour_brewer(palette = "Set1") +
ggtitle(plot_title) + theme_bw() + theme(aspect.ratio=1)
plot(p)
}
}
# Define technical and biological variables to be considered in the QC
pipeline_qc_cols <- c("sample_order", "raw_intensity", "num_NAs")
biospec_cols <- c("sex", "group", "phase", "time_to_freeze",
                  "is_control", "timepoint")
# Report accumulators: each significant association is appended as one
# character row (dataset, variable 1, variable 2, rho, p-value)
pcs_vs_qc_var_report <- c()
meta_var_assoc_report <- c()
for (currname in names(metabolomics_processed_datasets)) {
  d <- metabolomics_processed_datasets[[currname]]
  # Skip datasets whose first normalized matrix has fewer than 5 rows
  if (nrow(d$normalized_data[[1]]) < 5) {
    next
  }
  # Combine all unique qc and biological variables into one vector
  # that are also present in the sample_meta dataframe, remove text groups
  curr_meta_cols <- intersect(union(biospec_cols, pipeline_qc_cols),
                              names(d$sample_meta))
  # any_of(): only recode the categorical columns that survived the
  # intersect() above; drop = FALSE keeps a data frame even for one column
  curr_meta <- d$sample_meta[, curr_meta_cols, drop = FALSE] %>%
    mutate(across(any_of(c("sex", "phase", "is_control")),
                  ~ as.numeric(as.factor(.x)))) %>%
    select(where(is.numeric))
  # Remove metadata variables with NAs or zero variance
  curr_meta <- curr_meta[, colSums(is.na(curr_meta)) == 0, drop = FALSE]
  curr_meta <- curr_meta[, vapply(curr_meta, sd, numeric(1)) > 0, drop = FALSE]
  if (ncol(curr_meta) == 0) {
    message("No usable numeric metadata for ", currname, ", skipping")
    next
  }
  # Get the PCA data and variances
  curr_pcax <- d$pca$pcax
  curr_variances <- d$pca$explained_var
  # First, compute correlations between metadata and PCA data
  corrs <- cor(curr_pcax, curr_meta, method = "spearman")
  # NOTE(review): pairwise_eval / pairwise_association_wrapper are defined
  # elsewhere; corrsp is used below as a PC x metadata matrix of p-values
  corrsp <- pairwise_eval(curr_pcax, curr_meta,
                          func = pairwise_association_wrapper, f = 1)
  # Make correlation plot between metadata and PCA data; the PC axis labels
  # carry the percentage of explained variance
  pc_labels <- paste0(rownames(corrs), " (",
                      round(curr_variances[rownames(corrs)] * 100, 1), " %)")
  corr_plot <-
    ggcorrplot((corrs), lab = TRUE, lab_size = 2.5, hc.order = FALSE) +
    ggtitle(currname) +
    scale_x_discrete(breaks = rownames(corrs), labels = pc_labels) +
    theme(plot.title = element_text(hjust = 0.5, size = 20))
  print(corr_plot)
  # Add data to the report
  # Only saves correlations with p-values below designated threshold p_thr
  for (pc in seq_len(nrow(corrsp))) {
    for (meta_var in seq_len(ncol(corrsp))) {
      curr_p <- corrsp[pc, meta_var]
      # An undefined p-value cannot pass the threshold
      if (is.na(curr_p) || curr_p > p_thr) {
        next
      }
      pcs_vs_qc_var_report <- rbind(pcs_vs_qc_var_report,
                                    c(currname,
                                      rownames(corrsp)[pc],
                                      colnames(corrsp)[meta_var],
                                      corrs[pc, meta_var],
                                      curr_p))
    }
  }
  # Second, compute correlations between all metadata
  curr_meta <- curr_meta[complete.cases(curr_meta), , drop = FALSE]
  corrs <- cor(curr_meta, method = "spearman")
  corrsp <- pairwise_eval(curr_meta, func = pairwise_association_wrapper, f = 1)
  # Make correlation plot between all pairs of metadata variables
  print(ggcorrplot(corrs, lab = TRUE, lab_size = 2.5, hc.order = FALSE) +
          ggtitle(currname))
  # Add data to the report. This loop only saves p-values that are below the
  # designated threshold p_thr and that are computed between bio specimen metadata
  # and pipeline QC metrics
  for (meta_var1 in rownames(corrsp)) {
    for (meta_var2 in rownames(corrsp)) {
      # Stop at the diagonal so each unordered pair is visited once
      if (meta_var1 == meta_var2) {
        break
      }
      # Pairs of two biospecimen variables are not reported
      if (meta_var1 %in% biospec_cols &&
          meta_var2 %in% biospec_cols) {
        next
      }
      curr_p <- corrsp[meta_var1, meta_var2]
      if (is.na(curr_p) || curr_p > p_thr) {
        next
      }
      meta_var_assoc_report <- rbind(meta_var_assoc_report,
                                     c(currname,
                                       meta_var1,
                                       meta_var2,
                                       corrs[meta_var1, meta_var2],
                                       curr_p))
    }
  }
}
# Format the reports for a nicer presentation in a table.
#
# Helper: turn an accumulated character matrix (or NULL when nothing was
# reported) into a data frame with rho and p-value formatted to 3 significant
# digits and the given column names. The NULL case is handled BEFORE the
# data.frame() conversion: afterwards is.null() is always FALSE, and an empty
# data frame has no columns 4 and 5 to format.
format_assoc_report <- function(report, col_names) {
  if (is.null(report)) {
    report <- matrix(character(0), nrow = 0, ncol = length(col_names))
  }
  # stringsAsFactors = FALSE so as.numeric() parses the text, not factor codes
  report <- data.frame(report, stringsAsFactors = FALSE) %>%
    mutate(across(c(4, 5), ~ format(as.numeric(.x), digits = 3)))
  # Set column names for table output
  colnames(report) <- col_names
  report
}
pcs_vs_qc_var_report <- format_assoc_report(
  pcs_vs_qc_var_report,
  c("Dataset(tissue,site)", "PC", "qc_metric", "rho(spearman)", "p-value")
)
meta_var_assoc_report <- format_assoc_report(
  meta_var_assoc_report,
  c("Dataset(tissue,site)", "Var1", "Var2", "rho(spearman)", "p-value")
)
# Flag PCA outliers per dataset: every sample whose score on one of the first
# num_pcs_for_outlier_analysis PCs lies outside the boxplot whiskers (IQR
# multiplier OUTLIER_IQR_THR) is appended to pca_outliers_report as a row
# (dataset, PC, explained variance, sample, score).
pca_outliers_report <- c()
for (currname in names(metabolomics_processed_datasets)) {
  d <- metabolomics_processed_datasets[[currname]]
  # Skip datasets whose first normalized matrix has fewer than 5 rows
  if (nrow(d$normalized_data[[1]]) < 5) {
    next
  }
  # Get PCA values and variances for the pre-defined number of PCs
  # delineated for outlier analysis; drop = FALSE keeps the column names
  # when only a single PC is requested
  pc_idx <- seq_len(num_pcs_for_outlier_analysis)
  curr_pcax <- d$pca$pcax[, pc_idx, drop = FALSE]
  curr_pcavar <- d$pca$explained_var[pc_idx]
  pca_outliers <- c()
  # Get the outlier values using the IQR threshold delineated for each PC
  for (pc in colnames(curr_pcax)) {
    outlier_values <- boxplot.stats(curr_pcax[, pc], coef = OUTLIER_IQR_THR)$out
    # Save individual outliers with their information in the report
    for (outlier in names(outlier_values)) {
      pca_outliers_report <- rbind(pca_outliers_report,
                                   c(currname, pc, curr_pcavar[pc], outlier,
                                     format(outlier_values[outlier], digits = 5)))
      # Store outlier from individual PC in combined vector from all PCs
      # for outlier visualization (first PC that flags a sample wins)
      if (!(outlier %in% names(pca_outliers))) {
        pca_outliers[outlier] <- outlier_values[outlier]
      }
    }
  }
  # Plot the outliers; the scatter needs both PC1 and PC2 to be available
  if (length(pca_outliers) > 0 &&
      all(c("PC1", "PC2") %in% colnames(curr_pcax))) {
    # Add a column delineating which samples are outliers
    plot_df <- data.frame(curr_pcax) %>%
      mutate(outliers = rownames(curr_pcax) %in% names(pca_outliers)) %>%
      mutate(col = if_else(outliers, "green", "black"))
    # pch 0 (square) marks regular samples, pch 1 (circle) marks outliers
    plot(plot_df$PC1, plot_df$PC2, pch = as.numeric(plot_df$outliers),
         col = plot_df$col, lwd = 2, cex = 1,
         main = paste(currname, "flagged outliers"),
         xlab = paste0("PC1 (", round(curr_pcavar["PC1"] * 100, 1), " %)"),
         ylab = paste0("PC2 (", round(curr_pcavar["PC2"] * 100, 1), " %)"))
  }
}
# Re-organize outliers report for sample report printing.
# When no outlier was flagged the accumulator is still NULL and cannot take
# column names, so start from an empty 5-column matrix in that case.
if (is.null(pca_outliers_report)) {
  pca_outliers_report <- matrix(character(0), nrow = 0, ncol = 5)
}
colnames(pca_outliers_report) <- c("Dataset", "PC", "Variance", "Sample", "Score")
# One row per (dataset, sample) listing every PC on which it was flagged
flagged_sample_report <- data.frame(pca_outliers_report) %>%
  select(Dataset, PC, Sample) %>%
  group_by(Dataset, Sample) %>%
  reframe(PC = paste(PC, collapse = ", "))
kable(flagged_sample_report, longtable = TRUE,
      caption = "Outliers detected by tissue PCA data") %>%
  kable_styling(font_size = 8, latex_options = c("hold_position", "repeat_header"))
#
# Complete the 'plusMinus' function below.
#
# The function accepts INTEGER_ARRAY arr as parameter.
#
# Report which fraction of `arr` is positive, negative and zero.
#
# arr: integer vector to classify.
# Emits a single message with the three ratios (positive, negative, zero),
# each followed by a newline; the ratios are printed without fixed formatting.
plusMinus <- function(arr) {
  count_where <- function(mask) length(arr[mask])
  tallies <- c(count_where(arr > 0), count_where(arr < 0), count_where(arr == 0))
  ratios <- tallies / length(arr)
  message(paste0(ratios, "\n", collapse = ""))
}
# Sample input mixing negative, positive and zero values
arr <- as.integer(c(-4, 3, -9, 0, 4, 1))
plusMinus(arr)
# NOTE(review): no `stdin` connection object is created in the visible
# history, so this presumably refers to base R's stdin() function - confirm
close(stdin)
#
# Emit the share of positive, negative and zero values found in `arr`.
#
# arr: integer vector to classify.
# The three ratios are sent through message(), one per line, in the order
# positive / negative / zero and without fixed decimal formatting.
plusMinus <- function(arr) {
  n_total <- length(arr)
  share_pos <- length(arr[arr > 0]) / n_total
  share_neg <- length(arr[arr < 0]) / n_total
  share_zero <- length(arr[arr == 0]) / n_total
  report <- paste(c(share_pos, share_neg, share_zero), collapse = "\n")
  message(paste0(report, "\n"))
}
# Interactive exploration: run the body of plusMinus step by step and work
# out how to print the ratios with six decimal places.
arr <- as.integer(c(-4, 3, -9, 0, 4, 1))
plusMinus(arr)
full_length <- length(arr)
positive <- length(arr[arr>0])
negative <- length(arr[arr<0])
zero <- length(arr[arr==0])
a <- positive/full_length
b <- negative/full_length
c <- zero/full_length
# round() limits the precision but keeps a numeric value
a <- round(positive/full_length,6)
# Open the help page for sprintf
?sprintf
# Attempts at a format string. NOTE(review): `fmt` is passed as a variable in
# the first attempt but is not defined in the visible history; `%d` is
# sprintf's integer conversion while the value here is a ratio - presumably
# these three calls failed, confirm
a <- sprintf(fmt, "%.06d", positive/full_length)
a <- sprintf(fmt= "%.06d", positive/full_length)
a <- sprintf(fmt= "%d.06", positive/full_length)
# Form that is kept: fixed notation with six digits after the decimal point
a <- sprintf(fmt= "%.06f", positive/full_length)
a <- sprintf(fmt= "%.06f", positive/full_length)
b <- sprintf(fmt= "%.06f", negative/full_length)
c <- sprintf(fmt= "%.06f", zero/full_length)
message(paste0(a,"\n",b,"\n",c,"\n"))
# Print the fractions of positive, negative and zero entries of `arr`,
# each formatted with six digits after the decimal point.
#
# arr: integer vector to classify.
# Emits one message with the three formatted ratios (positive, negative,
# zero), each followed by a newline.
plusMinus <- function(arr) {
  n_total <- length(arr)
  tallies <- c(
    length(arr[arr > 0]),
    length(arr[arr < 0]),
    length(arr[arr == 0])
  )
  formatted <- sprintf("%.06f", tallies / n_total)
  message(paste0(formatted, "\n", collapse = ""))
}
# Interactive exploration: find a way to drop the minimum value from `arr`.
arr <- as.integer(c(-4, 3, -9, 0, 4, 1))
plusMinus(arr)
# which() is given the minimum VALUE here, not a logical vector
max_values <- arr[-which(min(arr))]
# min(arr) is -9, so this indexes with 9 (position, not value) on a
# six-element vector
max_values <- arr[-min(arr)]
min(arr)
# Indexing with the value -9 is negative indexing (drop position 9), not a
# lookup of the minimum
which(arr[min(arr)])
arr[min(arr)]
arr[min(arr[1])]
min(arr)
# Scalar membership test: is the minimum contained in arr
min(arr) %in% arr
# `arr` is a vector, not a function: round brackets call it
arr(arr %in% min(arr))
# Logical mask selecting the element(s) equal to the minimum
arr[arr %in% min(arr)]
# `which` is a function: square brackets try to subset it
which[arr %in% min(arr)]
# Position(s) of the minimum
which(arr %in% min(arr))
# Working form: drop every position holding the minimum
max_values <- arr[-(which(arr %in% min(arr)))]
#
# Complete the 'miniMaxSum' function below.
#
# The function accepts INTEGER_ARRAY arr as parameter.
#
# Print the smallest and the largest sum obtainable by adding all but one
# element of `arr`.
#
# arr: integer vector.
# Emits one message "<min sum>   <max sum>" (same separator as paste() with a
# " " argument). The minimum sum leaves out one occurrence of the largest
# value, the maximum sum leaves out one occurrence of the smallest value.
miniMaxSum <- function(arr) {
  # Sum as double: summing an integer vector overflows to NA above 2^31 - 1
  total <- sum(as.numeric(arr))
  min_summed <- total - max(arr)
  max_summed <- total - min(arr)
  # "%.0f" prints large whole-number doubles without scientific notation
  message(paste(sprintf("%.0f", min_summed), " ", sprintf("%.0f", max_summed)))
}
# Interactive debugging of the first miniMaxSum attempt.
arr <- as.integer(c(-4, 3, -9, 0, 4, 1))
miniMaxSum(arr)
# The trailing [1] keeps only the first remaining element
max_values <- arr[-(which(arr %in% min(arr)))][1]
min_values <- arr[-(which(arr %in% max(arr)))][1]
# Without [1]: every position holding the min / max is dropped
max_values <- arr[-(which(arr %in% min(arr)))]
min_values <- arr[-(which(arr %in% max(arr)))]
which(arr %in% min(arr))
which(arr %in% min(arr))[1]
# [1] moved inside: only the first occurrence of the min / max is dropped
max_values <- arr[-(which(arr %in% min(arr))[1])]
min_values <- arr[-(which(arr %in% max(arr))[1])]
max_summed <- sum(max_values)
# Overwrites the min_values vector with its own sum
min_values <- sum(min_values)
# Print the minimum and maximum sums that can be formed from all but one
# element of `arr`.
#
# arr: integer vector.
# Emits one message "<min sum>   <max sum>". Dropping the first occurrence of
# the max (resp. min) and summing the rest equals total - max (resp. - min).
miniMaxSum <- function(arr) {
  # Accumulate in double precision: sum() on integers returns NA with a
  # warning once the result exceeds the 32-bit integer range
  total <- sum(as.numeric(arr))
  min_summed <- total - max(arr)
  max_summed <- total - min(arr)
  # "%.0f" keeps large whole numbers out of scientific notation
  message(paste(sprintf("%.0f", min_summed), " ", sprintf("%.0f", max_summed)))
}
# Run the revised miniMaxSum, then replay its body at top level to inspect
# the intermediate values.
arr <- as.integer(c(-4, 3, -9, 0, 4, 1))
miniMaxSum(arr)
# Drop only the first occurrence of the min / max
max_values <- arr[-(which(arr %in% min(arr))[1])]
min_values <- arr[-(which(arr %in% max(arr))[1])]
max_summed <- sum(max_values)
min_summed <- sum(min_values)
# Print the smallest and largest sums of all-but-one elements of `arr`.
#
# arr: integer vector.
# Emits one message "<min sum>   <max sum>": the minimum sum omits one
# occurrence of the largest value, the maximum sum omits one occurrence of
# the smallest value.
miniMaxSum <- function(arr) {
  # Convert before summing; an integer sum beyond 2^31 - 1 becomes NA
  total <- sum(as.numeric(arr))
  min_summed <- total - max(arr)
  max_summed <- total - min(arr)
  # Fixed notation so large results are not printed as e.g. 4e+09
  message(paste(sprintf("%.0f", min_summed), " ", sprintf("%.0f", max_summed)))
}
# Second sample input (five distinct positive values)
arr <- as.integer(c(7, 69, 2, 221, 8974))
miniMaxSum(arr)
# Set the current session: attach every required package; a package that
# cannot be loaded is reported and the loop continues with the next one
session_packages <- c("knitr", "kableExtra", "tidyverse", "metap",
                      "lme4", "lmerTest", "variancePartition", "edgeR")
for (lib_name in session_packages) {
  tryCatch(
    library(lib_name, character.only = TRUE),
    error = function(e) {
      print(paste("Cannot load", lib_name, ", please install"))
    }
  )
}
# PROVIDE PATH TO THE CONFIG FILE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is the only parameter that you need to change here
config <- rjson::fromJSON(file = "/Users/johf/Documents/motrpac_precovid_config.json")
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Resolve the QC directory once, BEFORE changing the working directory:
# normalizePath() resolves relative paths against the current directory, so
# calling it again after setwd() would point a relative repo path elsewhere
qc_dir <- file.path(normalizePath(config$precovid_repo_path), "QC")
setwd(qc_dir)
tmp <- file.path(qc_dir, "/tmp")
if (dir.exists(tmp)) {
  print("tmp exists")
} else {
  dir.create(tmp)
}
# Load previous metabolomics analysis
load(file.path(tmp, "metabolomics-all-qc-input-for-da.RData"))
# Load KW test results
load(file.path(tmp, "supplemental_data.RData"))
knitr::opts_chunk$set(echo = TRUE)
# Datasets that need to be sample-centered. Per dataset, the row with the
# smallest sample_ctr is kept; the dataset is selected when that value is 1.
# These datasets are untargeted and do not have sex or exercise group
# associated with median intensity or median quartile
sample_center_datasets <- supplemental_data$sampling_centering_decisions %>%
  group_by(dataset) %>%
  slice_min(sample_ctr) %>%
  slice_head(n = 1) %>%
  filter(sample_ctr == 1) %>%
  pull(dataset)
# Pick, for every dataset, the normalized matrix used downstream and store it
# as `data_use`
for (currname in names(metabolomics_processed_datasets)) {
  curr_norm_data <- metabolomics_processed_datasets[[currname]]$normalized_data
  # Guard: names that are neither targeted ("metab-t") nor untargeted
  # ("metab-u") are reported and left without a `data_use` entry
  if (!grepl("metab-t", currname) && !grepl("metab-u", currname)) {
    print("Error: Dataset not listed as targeted or untargeted")
    next
  }
  if (grepl("metab-t", currname)) {
    # Targeted data: more than 12 metabolites -> log2 KNN imputed data,
    # otherwise log2 of the raw data
    if (nrow(curr_norm_data$raw) > 12) {
      curr_data_use <- curr_norm_data$log2_imputed
      print(paste0(currname, " is targeted, and has ", nrow(curr_norm_data$raw), " features, so we will use log2 imputed data"))
    } else {
      curr_data_use <- log2(1 + curr_norm_data$raw)
      print(paste0(currname, " is targeted, and has ", nrow(curr_norm_data$raw), " features, so we will use log2 data that is NOT imputed"))
    }
  } else {
    # Untargeted data: datasets annotated for sample-centering use the log2
    # imputed median-MAD data, all others the log2 imputed data
    if (currname %in% sample_center_datasets) {
      curr_data_use <- curr_norm_data$log2_median_mad
      print(paste0(currname, " is untargeted, and per KW tests, will be sample-centered"))
    } else {
      curr_data_use <- curr_norm_data$log2_imputed
      print(paste0(currname, " is untargeted, and per KW tests, will NOT be sample-centered"))
    }
  }
  # Create the entry of the new dataset and add the data to use
  metabolomics_processed_datasets[[currname]]$data_use <- curr_data_use
}
# Define pre-exercise groups for each randomization group
pre_exercise_groups <- list(
  ADUResist = "ADUResist.Pre",
  ADUControl = "ADUControl.Pre",
  ADUEndur = "ADUEndur.Pre"
)
# For every dataset: derive the model covariates in the sample meta data and
# build the difference-of-differences contrast expressions between all pairs
# of non-baseline group timepoints
for (currname in names(metabolomics_processed_datasets)) {
  # Parse meta data, make group timepoint (gtp), and define other covariates
  curr_meta <- metabolomics_processed_datasets[[currname]]$sample_meta %>%
    mutate(
      # gtp = "<randomization group>.<timepoint without spaces>"
      gtp = interaction(randomgroupcode, gsub(" ", "", timepoint)),
      gtp = droplevels(as.factor(gtp)),
      # Collapse every race other than "cauc" / "aablack" into "raceoth"
      race_grouped = ifelse(race == "cauc", "cauc",
                            ifelse(race == "aablack", "aablack", "raceoth")),
      race_grouped = as.factor(race_grouped),
      race = droplevels(as.factor(race)),
      gender = as.factor(ifelse(registration.sex == "1", "F", "M")),
      # NOTE(review): despite its name this column is derived from
      # registration.sex (+0.5 / -0.5), not from age - confirm intended name
      numerical_age = ifelse(registration.sex == "1", 0.5, -0.5),
      # BMI = weight [kg] / (height [cm] / 100)^2
      bmi = as.numeric(wtkg_pcaa / ((htcmavg_hwwt / 100)^2)),
      age = as.numeric(calculatedage),
      sample_order = as.numeric(sample_order),
      raw_intensity = as.numeric(raw_intensity),
      codedsiteid = droplevels(as.factor(codedsiteid)),
      staffid = droplevels(as.factor(staffid)),
      pid = droplevels(as.factor(pid))
    )
  # Create all possible pairwise combinations that exclude pre-exercise group timepoints
  post_gtp <- setdiff(levels(curr_meta$gtp), unlist(pre_exercise_groups))
  all_combinations <- expand.grid(post_gtp, post_gtp)
  # Remove combinations that contain the same group timepoint
  all_combinations <- all_combinations[all_combinations$Var1 != all_combinations$Var2, ]
  # Preallocate; seq_len() makes the loop a no-op when no combination is
  # left (1:0 would iterate over c(1, 0))
  contrast_expressions <- character(nrow(all_combinations))
  for (i in seq_len(nrow(all_combinations))) {
    # Identify the corresponding pre-exercise name for the randomization group of interest
    # (the randomization group is the part of the gtp level before the first dot)
    pre_ex_Var1 <- pre_exercise_groups[[gsub("\\..*$", "", all_combinations$Var1[i])]]
    pre_ex_Var2 <- pre_exercise_groups[[gsub("\\..*$", "", all_combinations$Var2[i])]]
    # Create the contrast expression: (Var1 - its baseline) - (Var2 - its baseline)
    contrast_expressions[i] <- paste0("gtp", all_combinations$Var1[i], " - ",
                                      "gtp", pre_ex_Var1, " - ",
                                      "gtp", all_combinations$Var2[i], " + ",
                                      "gtp", pre_ex_Var2)
  }
  # Create the entry of contrast expressions to use and update meta data
  metabolomics_processed_datasets[[currname]]$contrast_expressions <- contrast_expressions
  metabolomics_processed_datasets[[currname]]$sample_meta <- curr_meta
}
# Define parameters for parallel computing.
# Leave 8 cores free but never request fewer than one worker: detectCores()
# may return NA or a count of 8 or less, which would give an invalid value
num_cores <- max(1L, parallel::detectCores() - 8L, na.rm = TRUE)
param <- SnowParam(num_cores, "SOCK", progressbar = TRUE)
# Define coefficients for formula: every fixed-effect set is combined with
# every technical covariate and the random intercept per participant
fixed_covars <- c("gtp + race + gender + bmi + age + codedsiteid + ",
                  "gtp + race_grouped + gender + bmi + age + codedsiteid + ",
                  "gtp + Latinx + gender + bmi + age + codedsiteid + ",
                  "gtp + gender + bmi + age + codedsiteid + "
)
technical_covars <- c("staffid + ",
                      "raw_intensity + "
)
random_covars <- "(1 | pid)"
# Create DA tables for each dataset
for (currname in names(metabolomics_processed_datasets)) {
  print(paste("----- STARTING ", currname, " -----"))
  curr_data <- metabolomics_processed_datasets[[currname]]
  # Datasets without a `data_use` matrix cannot be analysed
  if (is.null(curr_data$data_use)) {
    print(paste0(currname, " has no data_use entry, skipping"))
    next
  }
  # Extract peak abundance data and meta data
  # Check that the sample names match between both data frames
  # (drop = FALSE keeps the matrix shape when a single sample matches)
  curr_meta <- curr_data$sample_meta
  curr_counts <- curr_data$data_use[, colnames(curr_data$data_use) %in% rownames(curr_meta),
                                    drop = FALSE]
  curr_meta <- curr_meta[colnames(curr_counts), ]
  # Per-coefficient result tables are collected here and bound once at the
  # end; binding inside the loop copies the growing table on every iteration
  dream_res_list <- list()
  for (fixed_covar in fixed_covars) {
    for (technical_covar in technical_covars) {
      curr_model <- paste0("~ 0 + ", fixed_covar, technical_covar, random_covars)
      curr_formula <- as.formula(curr_model)
      # Make contrasts for dream, assigning all possible contrast expressions
      curr_contrasts <- variancePartition::makeContrastsDream(formula = curr_formula,
                                                              data = curr_meta,
                                                              contrasts = curr_data$contrast_expressions)
      print(paste0("Starting DREAM analysis for formula: ", curr_model))
      # Perform dream function. Not all abundance dataframes are matrices, so it is
      # important to redefine the dataframe as a matrix
      curr_fit <- dream(exprObj = as.matrix(curr_counts),
                        formula = curr_formula,
                        data = curr_meta,
                        BPPARAM = param,
                        L = curr_contrasts)
      curr_fit <- variancePartition::eBayes(curr_fit)
      # Extract results for each coefficient (sort.by = "none" keeps the
      # feature order of the fit)
      for (curr_var in colnames(curr_fit$t)) {
        dream_res <- variancePartition::topTable(curr_fit,
                                                 coef = curr_var, number = Inf, sort.by = "none")
        dream_res_list[[length(dream_res_list) + 1L]] <- data.frame(
          feature_ID = rownames(dream_res),
          comparison_group = curr_var,
          model = curr_model,
          logFC = dream_res$logFC,
          tscore = dream_res$t,
          p_value = dream_res$P.Value,
          adj_p_value = dream_res$adj.P.Val,
          model.F = curr_fit$`F`,
          model.F.pval = curr_fit$F.p.value
        )
      }
    }
  }
  # Store differential abundance output dataframe (an empty data.frame when
  # no coefficient was extracted)
  dream_all_res <- bind_rows(data.frame(), dream_res_list)
  metabolomics_processed_datasets[[currname]]$dream_res <- dream_all_res
}
# Define parameters for parallel computing.
# Keep 8 cores free, but fall back to a single worker when detectCores()
# returns NA or a count of 8 or less
num_cores <- max(1L, parallel::detectCores() - 8L, na.rm = TRUE)
param <- SnowParam(num_cores, "SOCK", progressbar = TRUE)
# Define coefficients for formula
fixed_covars <- c("gtp + race + gender + bmi + age + codedsiteid + ",
                  "gtp + race_grouped + gender + bmi + age + codedsiteid + ",
                  "gtp + Latinx + gender + bmi + age + codedsiteid + ",
                  "gtp + gender + bmi + age + codedsiteid + "
)
# "" fits each model without a technical covariate; staffid is disabled
technical_covars <- c("",
                      #"staffid + ",
                      "raw_intensity + "
)
random_covars <- "(1 | pid)"
# Create DA tables for each dataset
for (currname in names(metabolomics_processed_datasets)) {
  print(paste("----- STARTING ", currname, " -----"))
  curr_data <- metabolomics_processed_datasets[[currname]]
  # Datasets without a `data_use` matrix cannot be analysed
  if (is.null(curr_data$data_use)) {
    print(paste0(currname, " has no data_use entry, skipping"))
    next
  }
  # Extract peak abundance data and meta data
  # Check that the sample names match between both data frames
  # (drop = FALSE keeps the matrix shape when a single sample matches)
  curr_meta <- curr_data$sample_meta
  curr_counts <- curr_data$data_use[, colnames(curr_data$data_use) %in% rownames(curr_meta),
                                    drop = FALSE]
  curr_meta <- curr_meta[colnames(curr_counts), ]
  # Collect one result table per model and coefficient, bind them once at
  # the end instead of copying a growing table on every iteration
  dream_res_list <- list()
  for (fixed_covar in fixed_covars) {
    for (technical_covar in technical_covars) {
      curr_model <- paste0("~ 0 + ", fixed_covar, technical_covar, random_covars)
      curr_formula <- as.formula(curr_model)
      # Make contrasts for dream, assigning all possible contrast expressions
      curr_contrasts <- variancePartition::makeContrastsDream(formula = curr_formula,
                                                              data = curr_meta,
                                                              contrasts = curr_data$contrast_expressions)
      print(paste0("Starting DREAM analysis for formula: ", curr_model))
      # Perform dream function. Not all abundance dataframes are matrices, so it is
      # important to redefine the dataframe as a matrix
      curr_fit <- dream(exprObj = as.matrix(curr_counts),
                        formula = curr_formula,
                        data = curr_meta,
                        BPPARAM = param,
                        L = curr_contrasts)
      curr_fit <- variancePartition::eBayes(curr_fit)
      # Extract results for each coefficient (sort.by = "none" keeps the
      # feature order of the fit)
      for (curr_var in colnames(curr_fit$t)) {
        dream_res <- variancePartition::topTable(curr_fit,
                                                 coef = curr_var, number = Inf, sort.by = "none")
        dream_res_list[[length(dream_res_list) + 1L]] <- data.frame(
          feature_ID = rownames(dream_res),
          comparison_group = curr_var,
          model = curr_model,
          logFC = dream_res$logFC,
          tscore = dream_res$t,
          p_value = dream_res$P.Value,
          adj_p_value = dream_res$adj.P.Val,
          model.F = curr_fit$`F`,
          model.F.pval = curr_fit$F.p.value
        )
      }
    }
  }
  # Store differential abundance output dataframe (an empty data.frame when
  # no coefficient was extracted)
  dream_all_res <- bind_rows(data.frame(), dream_res_list)
  metabolomics_processed_datasets[[currname]]$dream_res <- dream_all_res
}
# Switch to a different project directory (path is specific to this machine)
setwd("~/Documents/GitHub/MitochondrialGeneAnalysis")
library(devtools)
# Called without arguments; usethis is referenced by namespace, so it does
# not need to be attached
usethis::use_github_action()