diff --git a/.gitignore b/.gitignore index 583cfde..56ccab1 100755 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ slurm* Rcode/.Rproj.user Rcode/markdowns/* Rcode/*.html +Rcode/.Rhistory +Rcode/Rprofile !Rcode/markdowns/template.Rmd experiments/.snakemake/* .Rapp.history diff --git a/Rcode/.Rhistory b/Rcode/.Rhistory index a56c7aa..f9265e5 100644 --- a/Rcode/.Rhistory +++ b/Rcode/.Rhistory @@ -1,512 +1,512 @@ -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -summary(fit) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -ab_line(intercept = coef(fit[1]), slope = coef(fit[2]))+ -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit[1]), slope = coef(fit[2]))+ -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -?geom_abline -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2])+ -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -summary(fit) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -geom_line(data = data.frame(x = X), aes(x = log(x), y = (coef(fit)[1] + coef(fit)[2] * log(x))-log(x) )) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -# geom_line(data = data.frame(x = X), aes(x = log(x), y = (coef(fit)[1] + coef(fit)[2] * log(x))-log(x) )) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -geom_line(data = data.frame(x = X), aes(x = log(x), y = (coef(fit)[1] + coef(fit)[2] * log(x))-log(x) )) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -# geom_line(data = data.frame(x = X), aes(x = log(x), y = (coef(fit)[1] + coef(fit)[2] * log(x))-log(x) )) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = mean(prop_av), response = inv.logit(empirical_link(fraction_of_clusters, family = binomial(link = 'log')))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(q)', y = 'log(p)') + -geom_line(data = data.frame(x = X), aes(x = x, y = inv.logit(coef(fit)[1] + coef(fit)[2] * logit(x)))) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = mean(prop_av), response = inv.logit(empirical_link(fraction_of_clusters, family = binomial(link = 'log')))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(q)', y = 'log(p)') + -geom_line(data = data.frame(x = X), aes(x = x, y = exp(coef(fit)[1] + coef(fit)[2] * log(x)))) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_smooth(method = glm, se = TRUE, family = binomial(link = 'log')) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = mean(prop_av), response = inv.logit(empirical_link(fraction_of_clusters, family = binomial(link = 'log')))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(q)', y = 'log(p)') + -geom_line(data = data.frame(x = X), aes(x = x, y = exp(coef(fit)[1] + coef(fit)[2] * log(x)))) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = mean(prop_av), response = inv.logit(empirical_link(fraction_of_clusters, family = binomial(link = 'log')))) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = mean(prop_av), response = exp(empirical_link(fraction_of_clusters, family = binomial(link = 'log')))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'tumorVAF', y = 'log(p)') + -geom_line(data = data.frame(x = X), aes(x = x, y = exp(coef(fit)[1] + coef(fit)[2] * log(x)))) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_smooth(method = glm, se = TRUE, family = binomial(link = 'log')) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_smooth(method = glm, se = TRUE, family = binomial(link = 'log')) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_smooth(method = glm, se = TRUE, family = binomial(link = 'log')) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_smooth(method = glm, se = TRUE, family = binomial(link = 'log')) + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -summary(fit) -library(tidyverse) -setwd('~/Documents/projects/CTC_backup/validation_experiment/') -data <- readRDS('combi_df.rds') -View(data) -data <- data %>% mutate(counts/freq, complexity = as.numeric(complexity)) -summary(data) -data2 <- data %>% group_by(observations) %>% slice(rep(1:n(), counts/freq)) %>% mutate(present = row_number() <= first(counts)) %>% -ungroup() -fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = 'log')) -summary(fit) -confint(fit) -coef(fit) -beta1 <- coef(fit)[2] -beta0 <- coef(fit)[1] -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -summary(fit) -??bin_by_quantile -library(regressinator) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') + -geom_smooth(method = 'glm',method.args = list(family = binomial(link = 'log'))) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') + -geom_smooth(method = 'glm',method.args = list(family = binomial(link = 'log'))) + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log')) - log(mean(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) %>% -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) + -geom_smooth(method = 'glm', se = TRUE, family = binomial(link = 'log'), color = 'red') -data %>% -ggplot(aes(y = freq, x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) + -geom_smooth(method = 'glm', se = TRUE, family = binomial(link = 'log'), color = 'red') -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) + -geom_smooth(method = 'glm', se = TRUE, family = binomial(link = 'log'), color = 'red') -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) + -geom_smooth(method = 'glm', se = TRUE, family = binomial(link = 'log'), color = 'red') -?geom_smooth -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) + -geom_smooth(method = 'glm', se = TRUE, method.args = list(family = binomial(link = 'log')), color = 'red') -data %>% -ggplot(aes(y = freq, x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) + -geom_smooth(method = 'glm', se = TRUE, method.args = list(family = binomial(link = 'log')), color = 'red') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) + -geom_smooth(method = 'glm', se = TRUE, method.args = list(family = binomial(link = 'log')), color = 'red') -data %>% -ggplot(aes(y = freq, x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data2 %>% group_by(prop_av) %>% dplyr::summarize(fraction_of_clusters = sum(present)/n()) %>% -bin_by_quantile(prop_av, breaks = 100) %>% -summarize(x = log(mean(prop_av)), response = empirical_link(fraction_of_clusters, family = binomial(link = 'log'))) %>% -ggplot(aes(x = x, y = response)) + geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -summary(fit) -library(tidyverse) -setwd('~/Documents/projects/CTC_backup/validation_experiment/') -data <- readRDS('combi_df.rds') -View(data) -data <- data %>% mutate(counts/freq, complexity = as.numeric(complexity)) -summary(data) -data2 <- data %>% group_by(observations) %>% slice(rep(1:n(), counts/freq)) %>% mutate(present = row_number() <= first(counts)) %>% -ungroup() -fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = 'log')) -summary(fit) -confint(fit) -coef(fit) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -summary(fit) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = 'log')) -fit -summary(fit) -confint(fit) -summary(fit) -summary(fit) -(coef(fit)[1]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error']) -coef(fit)[1]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error']) -coef(fit)[1]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -(coef(fit)[1]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -critical_value <- qt(0.975, df) -df <- nrow(data)-ncol(model.matrix(fit)) -df -df <- nrow(data2)-ncol(model.matrix(fit)) -df -summary(fit) -critical_value <- qt(0.975, df) -critical_value -t-value <- (coef(fit)[1]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -t-value -t-value <- (coef(fit)[1]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -t.value <- (coef(fit)[1]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -df <- nrow(data2)-ncol(model.matrix(fit)) -p.value <- 2* pt(0.975, df) -p.value <- 2* 1-pt(abs(t.value), df) -p.value -t.value <- (coef(fit)[2]-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -coef(fit) -t.value <- (coef(fit)['log(prop_av)']-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -t.value -coef(fit)['log(prop_av)'] -t.value <- (coef(fit)['log(prop_av)']-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -df <- nrow(data2)-ncol(model.matrix(fit)) -p.value <- 2* 1-pt(abs(t.value), df) -p.value -summary(fit) -df -p.value <- 2* (1-pt(abs(t.value), df)) -p.value -p.value <- 2* (1-pt(abs(t.value), df)) -pt(abs(t.value), df) -p.value <- 2* (1-pt(abs(t.value), df)) -fit <- glm(present ~ log(prop_av), data = data2, offset = 1,family = binomial(link = 'log')) -t.value <- (coef(fit)['log(prop_av)']-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -t.value -p.value <- 2* (1-pt(abs(t.value), df)) -p.value -t.value -df -coef(fit) -coef(fit)['log(prop_av)'] -coef(fit)['log(prop_av)']-1) -summary(fit)$coefficients['log(prop_av)','Std. Error'] -?confint -confint(fit, level = 0.99999999) -confint(fit, level = 0.999999999999) -?glm -fit <- glm(present ~ log(prop_av), offset = 1,data = data2, family = binomial(link = 'log')) -fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = 'log')) -t.value <- (coef(fit)['log(prop_av)']-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -df <- nrow(data2)-ncol(model.matrix(fit)) -p.value <- 2* (1-pt(abs(t.value), df)) -p.value -summary(fit) -p.value <- 2.2e-16 -p.value <- 2* (1-pt(abs(t.value), df)) -print(p.value) -p.value <- 2.2e-16 -t -t -t -t -library(tidyverse) -setwd('~/Documents/projects/CTC_backup/validation_experiment/') -data <- readRDS('combi_df.rds') -View(data) -data <- data %>% mutate(counts/freq, complexity = as.numeric(complexity)) -summary(data) -data2 <- data %>% group_by(observations) %>% slice(rep(1:n(), counts/freq)) %>% mutate(present = row_number() <= first(counts)) %>% -ungroup() -fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = 'log')) -summary(fit) -confint(fit) -coef(fit) -data %>% -ggplot(aes(y = log(freq), x = log(prop_av))) + -geom_point() + -labs(x = 'log(tumorVAF)', y = 'log(fraction among CTC clusters)') + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]) -data %>% -ggplot(aes(y = log(ratio), x = log(prop_av))) + -geom_point() + -geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2]-1) + -labs(x = 'log(tumorVAF)', y = 'log(fold-change in prevalence)') -summary(fit) -t.value <- (coef(fit)['log(prop_av)']-1)/summary(fit)$coefficients['log(prop_av)','Std. Error'] -df <- nrow(data2)-ncol(model.matrix(fit)) -p.value <- 2* (1-pt(abs(t.value), df)) -print(p.value) +}, +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("*", "**", "***", "-"), type = "points", labels = c("< 0.05", "<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +columns_annotation <- c() +for(value in p_values_columns){ +if(value < 0.001){ +columns_annotation <- c(columns_annotation, "***") +} +else if(value < 0.01){ +columns_annotation <- c(columns_annotation, "**") +} +else if(value < 0.05){ +columns_annotation <- c(columns_annotation, "*") +} +else { +columns_annotation <- c(columns_annotation, "") +} +} +row_annotation <- c() +for(value in p_values_rows){ +if(value < 0.001){ +row_annotation <- c(row_annotation, "***") +} +else if(value < 0.01){ +row_annotation <- c(row_annotation, "**") +} +else if(value < 0.05){ +row_annotation <- c(row_annotation, "*") +} +else { +row_annotation <- c(row_annotation, "") +} +} +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE), +annotation_label = c("Cluster size")) +ha_row = rowAnnotation(foo = anno_text(x = row_annotation, which = "row", show_name= TRUE), +annotation_label = c("Sample")) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y) +} +}, +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE), +annotation_label = c("Cluster size")) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE), +annotation_label = c("Cluster size")) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE), +annotation_label = c("Cluster size")) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE), +annotation_label = c("Cluster size")) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE), +annotation_label = c("Cluster size")) +ha_row = rowAnnotation(foo = anno_text(x = row_annotation, which = "row", show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Sample")) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y) +} +}, +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Cluster size")) +ha_row = rowAnnotation(foo = anno_text(x = row_annotation, which = "row", show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Sample")) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y) +} +}, +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +}, +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +}, +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 8), +columns_names_gp = gpar(fontsize = 8), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 8), +columns_names_gp = gpar(fontsize = 8), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +?Heatmap +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 3), +columns_names_gp = gpar(fontsize = 3), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 1), +columns_names_gp = gpar(fontsize = 1), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 1), +columns_names_gp = gpar(fontsize = 1), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 5), +columns_names_gp = gpar(fontsize = 5), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +?Heatmap +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 5), +column_names_gp = gpar(fontsize = 5), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 8), +column_names_gp = gpar(fontsize = 8), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Cluster size", gpar(fontsize = 5))) +ha_row = rowAnnotation(foo = anno_text(x = row_annotation, which = "row", show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Sample")) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 8), +column_names_gp = gpar(fontsize = 8), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Cluster size", gp = gpar(fontsize = 5))) +ha_row = rowAnnotation(foo = anno_text(x = row_annotation, which = "row", show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Sample")) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 8), +column_names_gp = gpar(fontsize = 8), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Cluster size")) +ha_col <- HeatmapAnnotation(foo = anno_text(x = columns_annotation, location = 1, rot = 45, show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Cluster size"), annotation_name_gp= gpar(fontsize = 10)) +ha_row = rowAnnotation(foo = anno_text(x = row_annotation, which = "row", show_name= TRUE, gp = gpar(fontsize = 8)), +annotation_label = c("Sample"), annotation_name_gp= gpar(fontsize = 10)) +ht <- Heatmap(frequency_table, cell_fun = function(j, i, x, y, w, h, fill) { +if(!is.na(p_values_table[i, j])){ +if(p_values_table[i, j] < 0.001) { +grid.text("***", x, y, gp = gpar(fontsize = 8) ) +} else if(p_values_table[i, j] < 0.01) { +grid.text("**", x, y, gp = gpar(fontsize = 8)) +} +else if (p_values_table[i, j] < 0.05) { +grid.text("*", x, y, gp = gpar(fontsize = 8)) +} +} +else if(is.na(p_values_table[i, j])) { +grid.text("-", x, y, gp = gpar(fontsize = 8)) +} +}, +row_names_gp = gpar(fontsize = 8), +column_names_gp = gpar(fontsize = 8), +cluster_rows = FALSE, +cluster_columns = FALSE, +col = col_fun, +heatmap_legend_param = list(title = "% mono", +at = c(0, 0.5, 1), labels = c("0", "50", "100")), +top_annotation = ha_col, +left_annotation = ha_row +) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +?Legend +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA"), labels_gp = gpar(fontsize = 10)) +draw(ht, annotation_legend_list = list(lgd_sig)) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA"), labels_gp = gpar(fontsize = 8)) +draw(ht, annotation_legend_list = list(lgd_sig)) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA")) +draw(ht, annotation_legend_list = list(lgd_sig)) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA"), legend_gp = gpar(fontsize = 10)) +draw(ht, annotation_legend_list = list(lgd_sig)) +lgd_sig = Legend(pch = c("**", "***", "-"), type = "points", labels = c("<0.01", "<0.001", "NA"), legend_gp = gpar(fontsize = 8)) +draw(ht, annotation_legend_list = list(lgd_sig)) +all_pvalues +fisher(unlist(all_pvalues)) +?ComplexHeatmap +?Heatmap +?Legend diff --git a/Rcode/.Rprofile b/Rcode/.Rprofile deleted file mode 100644 index 81b960f..0000000 --- a/Rcode/.Rprofile +++ /dev/null @@ -1 +0,0 @@ -source("renv/activate.R") diff --git a/Rcode/.Rproj.user/shared/notebooks/paths b/Rcode/.Rproj.user/shared/notebooks/paths index 51b1f34..49ded41 100644 --- a/Rcode/.Rproj.user/shared/notebooks/paths +++ b/Rcode/.Rproj.user/shared/notebooks/paths @@ -1,2 +1,12 @@ +/Users/jgawron/Desktop/get_validation_p_values.R="0D6D114C" /Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/renv/activate.R="42F1FF6D" +/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/simulateCTCclusters.R="38C0ECFF" +/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/statistical_test_source.R="1386B5D8" +/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/validation_cluster_data.R="B0DE1791" +/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/validation_statistical_test.R="CE4D8CE4" +/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/data/markdowns/Br11.Rmd="2C66048E" +/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/template.Rmd="3914C49A" +/Users/jgawron/Documents/projects/CTC_backup/software/Rcode/validation_cluster_data.R="645DA8AD" +/Users/jgawron/Documents/projects/CTC_backup/software/Rcode/validation_statistical_test.R="7B50B5CC" +/Users/jgawron/Documents/projects/CTC_backup/topSeparatingMutations/Br16_AC_topSeparators.Rmd="37A59E67" /Users/jgawron/Documents/projects/CTC_backup/validation_experiment/foldchange_in_barcode_prevalence.R="097865BC" diff --git a/Rcode/statistical_test_source.R b/Rcode/statistical_test_source.R index 8f0c06d..aa69fff 100644 --- a/Rcode/statistical_test_source.R +++ b/Rcode/statistical_test_source.R @@ -25,9 +25,9 @@ load_cluster_df <- function(filename) { -files <- list.files(path = "../../validation_experiment/Cluster_csv_files/", pattern = "\\.csv$") +files <- list.files(path = "../../../CTC_backup/validation_experiment/Cluster_csv_files/", pattern = "\\.csv$") -files <- map_chr(files, reverse_paste, "../../validation_experiment/Cluster_csv_files/") +files <- map_chr(files, reverse_paste, "../../../CTC_backup/validation_experiment/Cluster_csv_files/") myfiles <- map(files, load_cluster_df) @@ -48,14 +48,14 @@ myfiles <- map(files, load_cluster_df) #' @examples load_cluster_files() #' @noRD load_cluster_files <- function() { - files <- list.files(path = "../../validation_experiment/Cluster_csv_files/", pattern = "\\.csv$") + files <- list.files(path = "../../../CTC_backup/validation_experiment/Cluster_csv_files/", pattern = "\\.csv$") - files <- map_chr(files, reverse_paste, "../../validation_experiment/Cluster_csv_files/") + files <- map_chr(files, reverse_paste, "../../../CTC_backup/validation_experiment/Cluster_csv_files/") df_list <- map(files, load_cluster_df) - save(df_list, file = "../../validation") + # save(df_list, file = "../../../CTC_backup/validatovalidation") # create empty data frame to store summary information @@ -96,7 +96,7 @@ load_cluster_files <- function() { ) summary_df <- rbind(summary_df, summary_row) } - save(summary_df, file = "../../validation_experiment/output/summary_df_nine_ninefive.rds") + save(summary_df, file = "../../../CTC_backup/validation_experiment/output/summary_df_nine_ninefive.rds") return(list("summary" = summary_df, "cluster_data" = df_list)) } @@ -115,7 +115,7 @@ df_list <- list() basename <- gsub("\\.csv", "", "50k_7_910_b_S276_R2_001.fastq.gz_stats.csv") -df <- read_delim(paste0("../../validation_experiment/Cluster_csv_files/", file), delim = "\t", col_names = F) +df <- read_delim(paste0("../../../CTC_backup/validation_experiment/Cluster_csv_files/", file), delim = "\t", col_names = F) colnames(df)[1] <- "barcodes" colnames(df)[3] <- "counts" df_sorted <- df %>% arrange(desc(counts)) @@ -127,7 +127,7 @@ for (file in files) { basename <- gsub("\\.csv", "", file) tryCatch( { - df <- read_delim(paste0("../../validation_experiment/Cluster_csv_files/", file), delim = "\t", col_names = F) + df <- read_delim(paste0("../../../CTC_backup/validation_experiment/Cluster_csv_files/", file), delim = "\t", col_names = F) colnames(df)[1] <- "barcodes" colnames(df)[3] <- "counts" df_sorted <- df %>% arrange(desc(counts))