Merge pull request #10 from JohannesGawron/main

Updates extended data Figure 7
cbg-ethz · Jan 17, 2025 · 6b4f77d · 6b4f77d
2 parents 1d7f2c5 + 190b618
commit 6b4f77d
Show file tree

Hide file tree

Showing 2 changed files with 141 additions and 105 deletions.
diff --git a/experiments/barcoding_experiment/extended_data_figure_7a.R b/experiments/barcoding_experiment/extended_data_figure_7a.R
@@ -1,11 +1,13 @@
-library(ggplot2)
+library(tidyverse)
 library(boot)
-library(smoothr)
+# library(smoothr)
 
-setwd("~/Downloads")
+setwd("~/work/ctc-data/barcoding_experiment/combined_primary_cluster/")
 
 tables <- list()
-filelist <- c("combined_140_order_filter_merge.rds", "combined_141_order_filter_merge.rds", "combined_902_order_filter_merge.rds", "combined_903_order_filter_merge.rds", "combined_904_order_filter_merge.rds", "combined_905_order_filter_merge.rds", "combined_910_order_filter_merge.rds")
+
+filelist <- list.files(path = "~/work/ctc-data/barcoding_experiment/combined_primary_cluster/", pattern = "^combined.*_filter_merge\\.rds$")
+
 for (file in filelist) {
   tables[[file]] <- readRDS(file)
   tables[[file]]$observations <- paste0(file, tables[[file]]$observations)
@@ -36,111 +38,127 @@ merged_table2 <- merged_table %>%
   ungroup()
 
 
-fit <-
-  glm(
-    present ~ log(prop_av),
-    data = merged_table2, family = binomial(link = "logit")
-  )
-fit1 <-
-  glm(
-    present ~ prop_av,
-    data = merged_table2, family = binomial(link = "logit")
-  )
-fit2 <-
-  glm(
-    present ~ log(prop_av),
-    data = merged_table2, family = binomial(link = "log")
-  )
-fit3 <-
-  glm(present ~ logit(prop_av),
-    data = merged_table2, family = binomial(link = "logit")
-  )
-fit4 <-
-  glm(present ~ logit(prop_av),
-    data = merged_table2, family = binomial(link = "log")
-  )
+color <- "#3C8181"
 
-AIC(fit)
-AIC(fit1)
-AIC(fit2)
-AIC(fit3)
-AIC(fit4)
+ggplot(merged_table2, aes(x = prop_av, y = present)) +
+  geom_jitter(width = 0, height = 0.1, color = color) +
+  geom_smooth(
+    method = "glm",
+    method.args = list(family = "binomial"), formula = y ~ logit(x), color = color
+  ) +
+  labs(
+    x = "Clonal frequency in primary tumor",
+    y = "P(barcode is present in CTC cluster)"
+  ) +
+  geom_abline(linetype = "dotted", slope = 1, intercept = 0)
 
-### fit3 and fit4 have approximately the same AIC, but fit 3 transforms the x
-## coordinates to a well defined range, so I will use fit3.
-fit_linear_model <- lm(freq ~ log(prop_av), data = merged_table)
-AIC(fit_linear_model)
 
-merged_table %>%
-  ggplot(aes(y = freq, x = prop_av)) +
-  geom_point() +
-  labs(x = "tumor VAF", y = "fraction among CTC clusters") +
-  geom_smooth(method = "lm", formula = y ~ x)
+get_mean_quantile <- function(cutoff, merged_table, upper = TRUE) {
+  if (upper) {
+    return(mean(merged_table$prop_av[merged_table$prop_av > cutoff]))
+  } else {
+    return(mean(merged_table$prop_av[merged_table$prop_av < cutoff]))
+  }
+}
 
-merged_table %>%
-  ggplot(aes(y = freq, x = log(prop_av))) +
-  geom_point() +
-  labs(x = "tumor VAF", y = "fraction among CTC clusters") +
-  geom_smooth(method = "lm")
+get_mean_quantile2 <- function(cutoff, merged_table, upper = TRUE) {
+  if (upper) {
+    return(mean(merged_table$freq[merged_table$prop_av > cutoff]))
+  } else {
+    return(mean(merged_table$freq[merged_table$prop_av < cutoff]))
+  }
+}
 
 
-merged_table %>%
-  ggplot(aes(y = logit(freq), x = logit(prop_av))) +
-  geom_point() +
-  labs(x = "logit(tumor VAF)", y = "logit(fraction among CTC clusters)") +
-  geom_smooth(method = "lm", formula = y ~ x)
+boot_wrapper <- function(data, indices, cutoff, upper = TRUE) {
+  # Subset the data based on bootstrap sample indices
+  sampled_data <- data[indices, ]
+  # Call the target function
+  return(get_mean_quantile2(cutoff, sampled_data, upper))
+}
 
+# Perform the bootstrap
+set.seed(123) # For reproducibility
+cutoff <- merged_table$prop_av %>% quantile(0.999) # 0.1
+bootstrap_replicates <- 1000
+bootstrap_results <- boot(
+  data = merged_table,
+  statistic = function(data, indices) boot_wrapper(data, indices, cutoff, upper = TRUE),
+  R = bootstrap_replicates # Number of bootstrap replicates
+)
+# Print bootstrap results
+print(bootstrap_results)
 
+bootstrap_statistics <- bootstrap_results$t
 
-fit_logit_logit_linear <-
-  lm(asin(sqrt(freq)) ~ asin(sqrt(prop_av)), data = merged_table)
-plot(fit_logit_logit_linear)
-primary_VAF <- seq(0.01, 0.55, 0.01)
+# Convert to a data frame for ggplot2
+bootstrap_df <- data.frame(stats = bootstrap_statistics, bias = bootstrap_statistics - get_mean_quantile(cutoff, merged_table, upper = TRUE), upper = TRUE, mean_fraction_primary = get_mean_quantile(cutoff, merged_table, upper = TRUE))
 
-predicted <- predict(fit4,
-  newdata = data.frame(prop_av = primary_VAF),
-  type = "response"
+bootstrap_results <- boot(
+  data = merged_table,
+  statistic = function(data, indices) boot_wrapper(data, indices, cutoff, upper = FALSE),
+  R = bootstrap_replicates # Number of bootstrap replicates
 )
+# Print bootstrap results
+print(bootstrap_results)
 
-color <- "#3C8181"
+bootstrap_statistics <- bootstrap_results$t
 
-ggplot(merged_table2, aes(x = prop_av, y = present)) +
-  geom_jitter(width = 0, height = 0.1, color = color) +
-  geom_smooth(
-    method = "glm",
-    method.args = list(family = "binomial"), formula = y ~ logit(x), color = color
-  ) +
+
+bootstrap_df2 <- data.frame(stats = bootstrap_statistics, bias = bootstrap_statistics - get_mean_quantile(cutoff, merged_table, upper = FALSE), upper = FALSE, mean_fraction_primary = get_mean_quantile(cutoff, merged_table, upper = FALSE))
+bootstrap_df <- rbind(bootstrap_df, bootstrap_df2)
+
+bootstrap_df$mean_fraction_primary <- as.factor(bootstrap_df$mean_fraction_primary)
+mean_frac_primary_upper <- levels(bootstrap_df$mean_fraction_primary)[2]
+
+ggplot(bootstrap_df, aes(x = upper, y = stats)) +
+  geom_boxplot(fill = "#41B7C4", color = "#3C8181") +
   labs(
-    x = "Clonal frequency in primary tumor",
-    y = "P(barcode is present in CTC cluster)"
+    # title = "Shift in abundancy of highly against lowly represented clones",
+    y = "Mean frequency among CTC clusters",
+    x = ""
   ) +
-  geom_abline(linetype = "dotted", slope = 1, intercept = 0)
+  theme_classic() +
+  theme(
+    # axis.title = element_text(size = 0),
+    legend.title = element_text(size = 0),
+    legend.text = element_text(size = 0),
+    axis.text.x = element_text(angle = 45, hjust = 1)
+  ) +
+  annotate(
+    "segment",
+    x = 1.5,
+    xend = 2.5,
+    y = as.numeric(mean_frac_primary_upper),
+    yend = as.numeric(mean_frac_primary_upper),
+    linetype = "dashed",
+    color = "red"
+  ) +
+  annotate(
+    "text",
+    x = 2,
+    y = as.numeric(mean_frac_primary_upper) - 0.01,
+    label = "Mean frequency in primary tumor",
+    color = "red",
+    size = 4,
+    fontface = "italic"
+  ) +
+  scale_x_discrete(
+    labels = c("Lowly abundant clones", "Highly abundant clones")
+  )
+# fill = "#41B7C4", color ="#3C8181"
 
+ggsave(
+  "~/work/ctc-data/barcoding_experiment/extended_data_figure_7a.pdf",
+  width = 6, height = 4, units = "in"
+)
 
 merged_table %>%
   ggplot(aes(y = freq, x = prop_av)) +
   geom_point() +
-  labs(x = "tumor VAF", y = "fraction among CTC clusters") +
-  geom_line(data = data.frame(prop_av = primary_VAF, freq = predicted))
-
-means <- rep(NA, 100)
-for (i in 1:25) {
-  window_start <- (i - 1) / 50
-  window_end <- i / 50
-  means[i] <- merged_table2 %>%
-    dplyr::filter(prop_av > window_start) %>%
-    dplyr::filter(prop_av <= window_end) %>%
-    pull(present) %>%
-    mean()
-}
+  labs(x = "tumor VAF", y = "fraction among CTC clusters")
 
-ggplot(data.frame(y = means, x = 0:99), aes(x = x, y = y)) +
-  geom_point()
 
-ggsave(
-  "~/Desktop/clonal_prevalence2.pdf",
-  width = 3.6, height = 3.6, units = "in"
-)
 
 cor(x = merged_table2$prop_av, y = merged_table2$present, method = "spearman")
 cor.test(

diff --git a/experiments/barcoding_experiment/extended_data_figure_7b.R b/experiments/barcoding_experiment/extended_data_figure_7b.R
@@ -2,7 +2,6 @@
 ## Colors and labels in the final figure may deviate through manual manipulation in Adobe Illustrator
 
 library(MCMCprecision)
-library(ggplot2)
 library(tidyverse)
 library(poolr)
 library(ComplexHeatmap)
@@ -33,14 +32,24 @@ compute_p_value <- function(k, n, r, samplingSize, primaryTumorVector) {
 
 
 
-summary_df_filter <- readRDS("summary_df_filter_final.rds")
-summary_df_filter_combined <- summary_df_filter %>%
-  mutate(group = ifelse(value %in% c(100, 1000), "100-1000",
-    ifelse(value == 10000, "10000", "50000")
+summary_df_filter <- readRDS("~/work/ctc-data/Barcoding experiment/summary_df_filter_final.rds")
+summary_df_filter2 <- readRDS("~/work/ctc-data/Barcoding experiment/summary_df_filter_rev_bc_ident.rds")
+
+colnames(summary_df_filter)[6] <- "pt_complexity"
+colnames(summary_df_filter2)[15] <- "tumor_sample"
+
+summary_df_filter2$tumor_sample <- as.character(summary_df_filter2$tumor_sample)
+
+summary_df_filter_combined <- bind_rows(summary_df_filter, summary_df_filter2)
+
+
+summary_df_filter_combined <- summary_df_filter_combined %>%
+  mutate(group = ifelse(pt_complexity %in% c(100, 1000), "100-1000",
+    ifelse(pt_complexity == 10000, "10000", "50000")
   ))
 
+write_csv(summary_df_filter_combined, file = "~/work/ctc-data/Barcoding experiment/summary_df_filter_final_all.csv")
 
-setwd("~/Documents/projects/CTC_backup/validation_experiment/")
 
 
 p_values_table <- data.frame(matrix(ncol = 0, nrow = 24))
@@ -49,13 +58,15 @@ mono_clusters <- data.frame(matrix(ncol = 0, nrow = 24))
 expected_values_null_table <- data.frame(matrix(ncol = 0, nrow = 24))
 fold_change_table <- data.frame(matrix(ncol = 0, nrow = 24))
 
-mouse_models <- c("140", "141", "902", "903", "904", "905", "910")
+mouse_models <- unique(summary_df_filter_combined$tumor_sample)
 
 for (mouse_model in mouse_models) {
-  setwd("Cluster_csv_files")
+  setwd("~/work/ctc-data/Barcoding experiment/Cluster data")
+
   files <- list.files(pattern = paste0("^.*_", mouse_model, ".*\\.csv$"))
   print(files)
-  summary_temp <- summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ]
+  summary_temp <- summary_df_filter_combined %>% filter(tumor_sample == mouse_model)
+  # summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ]
 
 
   total_number_of_clusters_by_size <- summary_temp %>%
@@ -72,13 +83,13 @@ for (mouse_model in mouse_models) {
   summary_clusters_by_size <- merge(data.frame(cluster_size = 2:25), summary_clusters_by_size, by.x = "cluster_size", all.x = TRUE)
   summary_clusters_by_size[is.na(summary_clusters_by_size)] <- 0
 
-  setwd("../Primary_tumor_csv_files")
+  setwd("~/work/ctc-data/Barcoding experiment/primary_data")
 
 
   primary_files <- list.files(pattern = paste0("^.*", mouse_model, ".*\\.csv$"))
 
   print("Loading primary tumor data:")
-  primaryA <- read_delim(primary_files[1], col_names = F, delim = "\t")
+  primaryA <- read_delim(primary_files[1], col_names = FALSE, delim = "\t")
 
 
   colnames(primaryA)[1] <- "barcodes"
@@ -105,7 +116,7 @@ for (mouse_model in mouse_models) {
   }
 
   print("Done!")
-  setwd("..")
+
 
   primary_counts <- primary_counts %>%
     group_by(barcodes) %>%
@@ -171,10 +182,10 @@ log_fold_change_table <- log(fold_change_table + 1)
 
 
 frequency_table <- t(frequency_table[1:4, ])
-frequency_table <- frequency_table[c(5, 6, 2, 1, 3, 4, 7), ]
+frequency_table <- frequency_table[c(9, 11, 4, 3, 10, 12, 8, 5, 6, 2, 1, 7), ]
 
 p_values_table <- t(p_values_table[1:4, ])
-p_values_table <- p_values_table[c(5, 6, 2, 1, 3, 4, 7), ]
+p_values_table <- p_values_table[c(9, 11, 4, 3, 10, 12, 8, 5, 6, 2, 1, 7), ]
 
 fold_change_table[is.na(fold_change_table)] <- -1
 log_fold_change_table[is.na(log_fold_change_table)] <- -1
@@ -235,11 +246,11 @@ ha_col <- HeatmapAnnotation(
 
 ha_row <- rowAnnotation(
   foo = anno_text(x = row_annotation, which = "row", show_name = TRUE, gp = gpar(fontsize = 8)),
-  annotation_label = c("Cluster size"), annotation_name_gp = gpar(fontsize = 10)
+  annotation_label = c(""),
+  annotation_name_gp = gpar(fontsize = 10)
 )
 
 
-
 ht <- Heatmap(frequency_table,
   cell_fun = function(j, i, x, y, w, h, fill) {
     if (!is.na(p_values_table[i, j])) {
@@ -260,15 +271,22 @@ ht <- Heatmap(frequency_table,
   cluster_columns = FALSE,
   col = col_fun,
   heatmap_legend_param = list(
-    title = "% mono",
+    title = "Monoclonal fraction",
     at = c(0, 0.5, 1), labels = c("0", "50", "100")
   ),
   top_annotation = ha_col,
   left_annotation = ha_row
 )
 
 lgd_sig <- Legend(pch = c("*", "**", "***", "-"), type = "points", labels = c("<0.05", "<0.01", "<0.001", "NA"), legend_gp = gpar(fontsize = 8))
-draw(ht, annotation_legend_list = list(lgd_sig))
+
+pdf("~/work/ctc-data/Barcoding experiment/Supplementary Figure 7b.pdf", width = 6, height = 4)
+draw(ht, annotation_legend_list = list(lgd_sig), padding = unit(c(3, 1, 1, 1), "lines"))
+pushViewport(viewport(layout = grid.layout(1, 1))) # Ensure single viewport
+grid.text("CTC cluster size (n cells)", x = 0.35, y = unit(0.05, "npc"), gp = gpar(fontsize = 10))
+
+dev.off()
+
 
 ##### Same only for fold-change