Skip to content

Commit

Permalink
Merge pull request #10 from JohannesGawron/main
Browse files Browse the repository at this point in the history
Updates extended data Figure 7
  • Loading branch information
JohannesGawron authored Jan 17, 2025
2 parents 1d7f2c5 + 190b618 commit 6b4f77d
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 105 deletions.
192 changes: 105 additions & 87 deletions experiments/barcoding_experiment/extended_data_figure_7a.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
library(ggplot2)
library(tidyverse)
library(boot)
library(smoothr)
# library(smoothr)

setwd("~/Downloads")
setwd("~/work/ctc-data/barcoding_experiment/combined_primary_cluster/")

tables <- list()
filelist <- c("combined_140_order_filter_merge.rds", "combined_141_order_filter_merge.rds", "combined_902_order_filter_merge.rds", "combined_903_order_filter_merge.rds", "combined_904_order_filter_merge.rds", "combined_905_order_filter_merge.rds", "combined_910_order_filter_merge.rds")

filelist <- list.files(path = "~/work/ctc-data/barcoding_experiment/combined_primary_cluster/", pattern = "^combined.*_filter_merge\\.rds$")

for (file in filelist) {
tables[[file]] <- readRDS(file)
tables[[file]]$observations <- paste0(file, tables[[file]]$observations)
Expand Down Expand Up @@ -36,111 +38,127 @@ merged_table2 <- merged_table %>%
ungroup()


fit <-
glm(
present ~ log(prop_av),
data = merged_table2, family = binomial(link = "logit")
)
fit1 <-
glm(
present ~ prop_av,
data = merged_table2, family = binomial(link = "logit")
)
fit2 <-
glm(
present ~ log(prop_av),
data = merged_table2, family = binomial(link = "log")
)
fit3 <-
glm(present ~ logit(prop_av),
data = merged_table2, family = binomial(link = "logit")
)
fit4 <-
glm(present ~ logit(prop_av),
data = merged_table2, family = binomial(link = "log")
)
color <- "#3C8181"

AIC(fit)
AIC(fit1)
AIC(fit2)
AIC(fit3)
AIC(fit4)
ggplot(merged_table2, aes(x = prop_av, y = present)) +
geom_jitter(width = 0, height = 0.1, color = color) +
geom_smooth(
method = "glm",
method.args = list(family = "binomial"), formula = y ~ logit(x), color = color
) +
labs(
x = "Clonal frequency in primary tumor",
y = "P(barcode is present in CTC cluster)"
) +
geom_abline(linetype = "dotted", slope = 1, intercept = 0)

### fit3 and fit4 have approximately the same AIC, but fit 3 transforms the x
## coordinates to a well defined range, so I will use fit3.
fit_linear_model <- lm(freq ~ log(prop_av), data = merged_table)
AIC(fit_linear_model)

merged_table %>%
ggplot(aes(y = freq, x = prop_av)) +
geom_point() +
labs(x = "tumor VAF", y = "fraction among CTC clusters") +
geom_smooth(method = "lm", formula = y ~ x)
get_mean_quantile <- function(cutoff, merged_table, upper = TRUE) {
if (upper) {
return(mean(merged_table$prop_av[merged_table$prop_av > cutoff]))
} else {
return(mean(merged_table$prop_av[merged_table$prop_av < cutoff]))
}
}

merged_table %>%
ggplot(aes(y = freq, x = log(prop_av))) +
geom_point() +
labs(x = "tumor VAF", y = "fraction among CTC clusters") +
geom_smooth(method = "lm")
get_mean_quantile2 <- function(cutoff, merged_table, upper = TRUE) {
if (upper) {
return(mean(merged_table$freq[merged_table$prop_av > cutoff]))
} else {
return(mean(merged_table$freq[merged_table$prop_av < cutoff]))
}
}


merged_table %>%
ggplot(aes(y = logit(freq), x = logit(prop_av))) +
geom_point() +
labs(x = "logit(tumor VAF)", y = "logit(fraction among CTC clusters)") +
geom_smooth(method = "lm", formula = y ~ x)
boot_wrapper <- function(data, indices, cutoff, upper = TRUE) {
# Subset the data based on bootstrap sample indices
sampled_data <- data[indices, ]
# Call the target function
return(get_mean_quantile2(cutoff, sampled_data, upper))
}

# Perform the bootstrap
set.seed(123) # For reproducibility
cutoff <- merged_table$prop_av %>% quantile(0.999) # 0.1
bootstrap_replicates <- 1000
bootstrap_results <- boot(
data = merged_table,
statistic = function(data, indices) boot_wrapper(data, indices, cutoff, upper = TRUE),
R = bootstrap_replicates # Number of bootstrap replicates
)
# Print bootstrap results
print(bootstrap_results)

bootstrap_statistics <- bootstrap_results$t

fit_logit_logit_linear <-
lm(asin(sqrt(freq)) ~ asin(sqrt(prop_av)), data = merged_table)
plot(fit_logit_logit_linear)
primary_VAF <- seq(0.01, 0.55, 0.01)
# Convert to a data frame for ggplot2
bootstrap_df <- data.frame(stats = bootstrap_statistics, bias = bootstrap_statistics - get_mean_quantile(cutoff, merged_table, upper = TRUE), upper = TRUE, mean_fraction_primary = get_mean_quantile(cutoff, merged_table, upper = TRUE))

predicted <- predict(fit4,
newdata = data.frame(prop_av = primary_VAF),
type = "response"
bootstrap_results <- boot(
data = merged_table,
statistic = function(data, indices) boot_wrapper(data, indices, cutoff, upper = FALSE),
R = bootstrap_replicates # Number of bootstrap replicates
)
# Print bootstrap results
print(bootstrap_results)

color <- "#3C8181"
bootstrap_statistics <- bootstrap_results$t

ggplot(merged_table2, aes(x = prop_av, y = present)) +
geom_jitter(width = 0, height = 0.1, color = color) +
geom_smooth(
method = "glm",
method.args = list(family = "binomial"), formula = y ~ logit(x), color = color
) +

bootstrap_df2 <- data.frame(stats = bootstrap_statistics, bias = bootstrap_statistics - get_mean_quantile(cutoff, merged_table, upper = FALSE), upper = FALSE, mean_fraction_primary = get_mean_quantile(cutoff, merged_table, upper = FALSE))
bootstrap_df <- rbind(bootstrap_df, bootstrap_df2)

bootstrap_df$mean_fraction_primary <- as.factor(bootstrap_df$mean_fraction_primary)
mean_frac_primary_upper <- levels(bootstrap_df$mean_fraction_primary)[2]

ggplot(bootstrap_df, aes(x = upper, y = stats)) +
geom_boxplot(fill = "#41B7C4", color = "#3C8181") +
labs(
x = "Clonal frequency in primary tumor",
y = "P(barcode is present in CTC cluster)"
# title = "Shift in abundancy of highly against lowly represented clones",
y = "Mean frequency among CTC clusters",
x = ""
) +
geom_abline(linetype = "dotted", slope = 1, intercept = 0)
theme_classic() +
theme(
# axis.title = element_text(size = 0),
legend.title = element_text(size = 0),
legend.text = element_text(size = 0),
axis.text.x = element_text(angle = 45, hjust = 1)
) +
annotate(
"segment",
x = 1.5,
xend = 2.5,
y = as.numeric(mean_frac_primary_upper),
yend = as.numeric(mean_frac_primary_upper),
linetype = "dashed",
color = "red"
) +
annotate(
"text",
x = 2,
y = as.numeric(mean_frac_primary_upper) - 0.01,
label = "Mean frequency in primary tumor",
color = "red",
size = 4,
fontface = "italic"
) +
scale_x_discrete(
labels = c("Lowly abundant clones", "Highly abundant clones")
)
# fill = "#41B7C4", color ="#3C8181"

ggsave(
"~/work/ctc-data/barcoding_experiment/extended_data_figure_7a.pdf",
width = 6, height = 4, units = "in"
)

merged_table %>%
ggplot(aes(y = freq, x = prop_av)) +
geom_point() +
labs(x = "tumor VAF", y = "fraction among CTC clusters") +
geom_line(data = data.frame(prop_av = primary_VAF, freq = predicted))

means <- rep(NA, 100)
for (i in 1:25) {
window_start <- (i - 1) / 50
window_end <- i / 50
means[i] <- merged_table2 %>%
dplyr::filter(prop_av > window_start) %>%
dplyr::filter(prop_av <= window_end) %>%
pull(present) %>%
mean()
}
labs(x = "tumor VAF", y = "fraction among CTC clusters")

ggplot(data.frame(y = means, x = 0:99), aes(x = x, y = y)) +
geom_point()

ggsave(
"~/Desktop/clonal_prevalence2.pdf",
width = 3.6, height = 3.6, units = "in"
)

cor(x = merged_table2$prop_av, y = merged_table2$present, method = "spearman")
cor.test(
Expand Down
54 changes: 36 additions & 18 deletions experiments/barcoding_experiment/extended_data_figure_7b.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
## Colors and labels in the final figure may deviate through manual manipulation in Adobe Illustrator

library(MCMCprecision)
library(ggplot2)
library(tidyverse)
library(poolr)
library(ComplexHeatmap)
Expand Down Expand Up @@ -33,14 +32,24 @@ compute_p_value <- function(k, n, r, samplingSize, primaryTumorVector) {



summary_df_filter <- readRDS("summary_df_filter_final.rds")
summary_df_filter_combined <- summary_df_filter %>%
mutate(group = ifelse(value %in% c(100, 1000), "100-1000",
ifelse(value == 10000, "10000", "50000")
summary_df_filter <- readRDS("~/work/ctc-data/Barcoding experiment/summary_df_filter_final.rds")
summary_df_filter2 <- readRDS("~/work/ctc-data/Barcoding experiment/summary_df_filter_rev_bc_ident.rds")

colnames(summary_df_filter)[6] <- "pt_complexity"
colnames(summary_df_filter2)[15] <- "tumor_sample"

summary_df_filter2$tumor_sample <- as.character(summary_df_filter2$tumor_sample)

summary_df_filter_combined <- bind_rows(summary_df_filter, summary_df_filter2)


summary_df_filter_combined <- summary_df_filter_combined %>%
mutate(group = ifelse(pt_complexity %in% c(100, 1000), "100-1000",
ifelse(pt_complexity == 10000, "10000", "50000")
))

write_csv(summary_df_filter_combined, file = "~/work/ctc-data/Barcoding experiment/summary_df_filter_final_all.csv")

setwd("~/Documents/projects/CTC_backup/validation_experiment/")


p_values_table <- data.frame(matrix(ncol = 0, nrow = 24))
Expand All @@ -49,13 +58,15 @@ mono_clusters <- data.frame(matrix(ncol = 0, nrow = 24))
expected_values_null_table <- data.frame(matrix(ncol = 0, nrow = 24))
fold_change_table <- data.frame(matrix(ncol = 0, nrow = 24))

mouse_models <- c("140", "141", "902", "903", "904", "905", "910")
mouse_models <- unique(summary_df_filter_combined$tumor_sample)

for (mouse_model in mouse_models) {
setwd("Cluster_csv_files")
setwd("~/work/ctc-data/Barcoding experiment/Cluster data")

files <- list.files(pattern = paste0("^.*_", mouse_model, ".*\\.csv$"))
print(files)
summary_temp <- summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ]
summary_temp <- summary_df_filter_combined %>% filter(tumor_sample == mouse_model)
# summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ]


total_number_of_clusters_by_size <- summary_temp %>%
Expand All @@ -72,13 +83,13 @@ for (mouse_model in mouse_models) {
summary_clusters_by_size <- merge(data.frame(cluster_size = 2:25), summary_clusters_by_size, by.x = "cluster_size", all.x = TRUE)
summary_clusters_by_size[is.na(summary_clusters_by_size)] <- 0

setwd("../Primary_tumor_csv_files")
setwd("~/work/ctc-data/Barcoding experiment/primary_data")


primary_files <- list.files(pattern = paste0("^.*", mouse_model, ".*\\.csv$"))

print("Loading primary tumor data:")
primaryA <- read_delim(primary_files[1], col_names = F, delim = "\t")
primaryA <- read_delim(primary_files[1], col_names = FALSE, delim = "\t")


colnames(primaryA)[1] <- "barcodes"
Expand All @@ -105,7 +116,7 @@ for (mouse_model in mouse_models) {
}

print("Done!")
setwd("..")


primary_counts <- primary_counts %>%
group_by(barcodes) %>%
Expand Down Expand Up @@ -171,10 +182,10 @@ log_fold_change_table <- log(fold_change_table + 1)


frequency_table <- t(frequency_table[1:4, ])
frequency_table <- frequency_table[c(5, 6, 2, 1, 3, 4, 7), ]
frequency_table <- frequency_table[c(9, 11, 4, 3, 10, 12, 8, 5, 6, 2, 1, 7), ]

p_values_table <- t(p_values_table[1:4, ])
p_values_table <- p_values_table[c(5, 6, 2, 1, 3, 4, 7), ]
p_values_table <- p_values_table[c(9, 11, 4, 3, 10, 12, 8, 5, 6, 2, 1, 7), ]

fold_change_table[is.na(fold_change_table)] <- -1
log_fold_change_table[is.na(log_fold_change_table)] <- -1
Expand Down Expand Up @@ -235,11 +246,11 @@ ha_col <- HeatmapAnnotation(

ha_row <- rowAnnotation(
foo = anno_text(x = row_annotation, which = "row", show_name = TRUE, gp = gpar(fontsize = 8)),
annotation_label = c("Cluster size"), annotation_name_gp = gpar(fontsize = 10)
annotation_label = c(""),
annotation_name_gp = gpar(fontsize = 10)
)



ht <- Heatmap(frequency_table,
cell_fun = function(j, i, x, y, w, h, fill) {
if (!is.na(p_values_table[i, j])) {
Expand All @@ -260,15 +271,22 @@ ht <- Heatmap(frequency_table,
cluster_columns = FALSE,
col = col_fun,
heatmap_legend_param = list(
title = "% mono",
title = "Monoclonal fraction",
at = c(0, 0.5, 1), labels = c("0", "50", "100")
),
top_annotation = ha_col,
left_annotation = ha_row
)

lgd_sig <- Legend(pch = c("*", "**", "***", "-"), type = "points", labels = c("<0.05", "<0.01", "<0.001", "NA"), legend_gp = gpar(fontsize = 8))
draw(ht, annotation_legend_list = list(lgd_sig))

pdf("~/work/ctc-data/Barcoding experiment/Supplementary Figure 7b.pdf", width = 6, height = 4)
draw(ht, annotation_legend_list = list(lgd_sig), padding = unit(c(3, 1, 1, 1), "lines"))
pushViewport(viewport(layout = grid.layout(1, 1))) # Ensure single viewport
grid.text("CTC cluster size (n cells)", x = 0.35, y = unit(0.05, "npc"), gp = gpar(fontsize = 10))

dev.off()


##### Same only for fold-change

Expand Down

0 comments on commit 6b4f77d

Please sign in to comment.