bugfix in extended_data_figure_7b.R and redoing extended_data_figure_7a.R #5

Merged
merged 1 commit on Aug 14, 2024
@@ -165,10 +165,10 @@ medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(52, 21))
high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(14, 40))


# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level
PieDonut(low, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(medium, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(high, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level
PieDonut(low, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(medium, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
PieDonut(high, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)


# Create a data frame specifying for each tumor sample the proportion of oligoclonal CTC clusters in categories "2" and "3+"
experiments/barcoding_experiment/extended_data_figure_7a.R (177 changes: 124 additions, 53 deletions)
@@ -1,77 +1,148 @@
library(tidyverse)
library(ggplot2)
library(boot)
library(smoothr)

data <- readRDS("combi_df.rds")
View(data)
setwd("~/Downloads")

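## For each input table below: recover the total cluster count from the first row
## with a non-zero frequency (counts / freq), then drop barcodes whose average
## proportion (prop_av) is zero or falls in the bottom 1%.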
tables <- list()
filelist <- c("combined_140_order_filter_merge.rds", "combined_141_order_filter_merge.rds", "combined_902_order_filter_merge.rds", "combined_903_order_filter_merge.rds", "combined_904_order_filter_merge.rds", "combined_905_order_filter_merge.rds", "combined_910_order_filter_merge.rds")
for (file in filelist) {
tables[[file]] <- readRDS(file)
tables[[file]]$observations <- paste0(file, tables[[file]]$observations)
first_row_with_nonzero_counts <- tables[[file]][tables[[file]]$freq != 0, ][1, ]
total_counts <- first_row_with_nonzero_counts$counts / first_row_with_nonzero_counts$freq
tables[[file]]$total_counts <- total_counts
cutoff <- quantile(tables[[file]]$prop_av, prob = 0.01)
tables[[file]] <- tables[[file]] %>%
filter(prop_av > cutoff) %>%
filter(prop_av != 0)
}

data <- data %>% mutate(counts / freq, complexity = as.numeric(complexity))

summary(data)
# Merge all tables
merged_table <- do.call(rbind, tables)
unique(merged_table$total_counts)

data2 <- data %>%
group_by(observations) %>%
slice(rep(1:n(), counts / freq)) %>%
mutate(present = row_number() <= first(counts)) %>%
ungroup()
merged_table <- merged_table %>% mutate(complexity = as.numeric(complexity))

fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = "log"))
summary(merged_table)

summary(fit)

confint(fit)
coef(fit)

data %>%
ggplot(aes(y = log(freq), x = log(prop_av))) +
geom_point() +
labs(x = "log(tumorVAF)", y = "log(fraction among CTC clusters)") +
geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2])
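## The following expands each barcode row into one row per cluster (total_counts
## copies) and marks the first `counts` copies as present, giving a binary outcome
## for the binomial fits below.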
merged_table2 <- merged_table %>%
group_by(observations) %>%
slice(rep(1:n(), total_counts)) %>%
mutate(present = as.integer(row_number() <= first(counts))) %>%
ungroup()


data %>%
ggplot(aes(y = log(ratio), x = log(prop_av))) +
fit <-
glm(
present ~ log(prop_av),
data = merged_table2, family = binomial(link = "logit")
)
fit1 <-
glm(
present ~ prop_av,
data = merged_table2, family = binomial(link = "logit")
)
fit2 <-
glm(
present ~ log(prop_av),
data = merged_table2, family = binomial(link = "log")
)
fit3 <-
glm(present ~ logit(prop_av),
data = merged_table2, family = binomial(link = "logit")
)
fit4 <-
glm(present ~ logit(prop_av),
data = merged_table2, family = binomial(link = "log")
)

AIC(fit)
AIC(fit1)
AIC(fit2)
AIC(fit3)
AIC(fit4)

### fit3 and fit4 have approximately the same AIC, but fit3 transforms the x
## coordinates to a well-defined range, so I will use fit3.
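## Not part of the original script: a minimal sketch of what the chosen model
## (fit3) implies, namely logit(P(present)) = b0 + b1 * logit(VAF), so fitted
## probabilities stay inside (0, 1). The grid mirrors the primary_VAF sequence
## used further down; `vaf_grid` and `p_hat` are illustrative names only.
vaf_grid <- seq(0.01, 0.55, 0.01)
p_hat <- predict(fit3, newdata = data.frame(prop_av = vaf_grid), type = "response")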
fit_linear_model <- lm(freq ~ log(prop_av), data = merged_table)
AIC(fit_linear_model)

merged_table %>%
ggplot(aes(y = freq, x = prop_av)) +
geom_point() +
geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2] - 1) +
labs(x = "log(VAF in primary tumor)", y = "log(fold-change in prevalence)") +
theme_minimal()
labs(x = "tumor VAF", y = "fraction among CTC clusters") +
geom_smooth(method = "lm", formula = y ~ x)

merged_table %>%
ggplot(aes(y = freq, x = log(prop_av))) +
geom_point() +
labs(x = "tumor VAF", y = "fraction among CTC clusters") +
geom_smooth(method = "lm")


merged_table %>%
ggplot(aes(y = logit(freq), x = logit(prop_av))) +
geom_point() +
labs(x = "logit(tumor VAF)", y = "logit(fraction among CTC clusters)") +
geom_smooth(method = "lm", formula = y ~ x)

pdf("~/Desktop/clonal_prevalence.pdf", width = "3.6", height = "3.6")
par(bty = "l")
plot(
x = log(data$prop_av),
y = log(data$ratio),
cex = 0.8,
pch = 19,
col = "#81A9A9",
xlab = "log( VAF in primary tumor )",
ylab = "log( fold-change in prevalence in CTC clusters )"
)
abline(b = coef(fit)[2] - 1, a = coef(fit)[1], col = "#3C8181", lwd = 2.2)
text(x = max(log(data$prop_av)) - 5, y = max(log(data$ratio)) - 3.5, labels = sprintf("slope = %f***", coef(fit)[2] - 1), pos = 4, col = "black")
dev.off()

summary(fit)

#### The p-value is computed manually. Compare (Regression, Modelle, Methoden
## und Anwendungen, second edition, page 204f; https://www.uni-goettingen.de/de/551357.html)
fit_logit_logit_linear <-
lm(asin(sqrt(freq)) ~ asin(sqrt(prop_av)), data = merged_table)
plot(fit_logit_logit_linear)
primary_VAF <- seq(0.01, 0.55, 0.01)

### The coefficient, scaled by its standard error and then squared, is
## approximately chi-squared distributed with one degree of freedom.
predicted <- predict(fit4,
newdata = data.frame(prop_av = primary_VAF),
type = "response"
)

### Also note that the number of data points (1798) is large enough (> 50) for the
## chi-squared approximation to hold, according to a common rule of thumb.
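## Not in the original script: the Wald test described above, gathered in one
## place (the same computation appears in pieces further down). It tests H0: the
## log(prop_av) slope equals 1, rather than the default H0 of 0;
## W = ((beta_hat - 1) / SE(beta_hat))^2 is approximately chi^2 with 1 df.
slope_hat <- coef(fit)["log(prop_av)"]
slope_se <- summary(fit)$coefficients["log(prop_av)", "Std. Error"]
pchisq(as.numeric((slope_hat - 1) / slope_se)^2, df = 1, lower.tail = FALSE)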
color <- "#3C8181"

pseudo_t_value <- as.numeric(coef(fit)["log(prop_av)"] - 1) / as.numeric(summary(fit)$coefficients["log(prop_av)", "Std. Error"])
ggplot(merged_table2, aes(x = prop_av, y = present)) +
geom_jitter(width = 0, height = 0.1, color = color) +
geom_smooth(
method = "glm",
method.args = list(family = "binomial"), formula = y ~ logit(x), color = color
) +
labs(
x = "Clonal frequency in primary tumor",
y = "P(barcode is present in CTC cluster)"
) +
geom_abline(linetype = "dotted", slope = 1, intercept = 0)

wald_statistics <- pseudo_t_value^2 ### chi^2_1 distributed under the null

p.value <- 1 - pchisq(wald_statistics, 1)
print(p.value)
cor(log(data$prop_av), log(data$ratio), method = "pearson")
### The p-value is smaller than machine precision can resolve, so I suggest reporting
### the smallest p-value that the generic glm summary can output:
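## Illustration (not in the original script): the familiar "< 2.2e-16" floor is
## double machine precision, the smallest p-value that R's print methods report.
.Machine$double.eps # 2.220446e-16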
merged_table %>%
ggplot(aes(y = freq, x = prop_av)) +
geom_point() +
labs(x = "tumor VAF", y = "fraction among CTC clusters") +
geom_line(data = data.frame(prop_av = primary_VAF, freq = predicted))

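## Empirical check: average detection frequency within successive prop_av windows
## of width 1/50, to compare against the fitted curve.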
means <- rep(NA, 25)
for (i in 1:25) {
window_start <- (i - 1) / 50
window_end <- i / 50
means[i] <- merged_table2 %>%
dplyr::filter(prop_av > window_start) %>%
dplyr::filter(prop_av <= window_end) %>%
pull(present) %>%
mean()
}

ggplot(data.frame(y = means, x = 0:24), aes(x = x, y = y)) +
geom_point()

ggsave(
"~/Desktop/clonal_prevalence2.pdf",
width = 3.6, height = 3.6, units = "in"
)

### p-value < 2.2e-16
cor(x = merged_table2$prop_av, y = merged_table2$present, method = "spearman")
cor.test(
x = merged_table2$prop_av, y = merged_table2$present, method = "spearman"
)
experiments/barcoding_experiment/extended_data_figure_7b.R (52 changes: 5 additions, 47 deletions)
@@ -1,3 +1,6 @@
## Author: Johannes Gawron
## Colors and labels in the final figure may differ owing to manual adjustments in Adobe Illustrator

library(MCMCprecision)
library(ggplot2)
library(tidyverse)
@@ -54,17 +57,13 @@ for (mouse_model in mouse_models) {
print(files)
summary_temp <- summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ]

summary_df_filter_combined <- summary_temp %>%
mutate(group = ifelse(value %in% c(100, 1000), "100-1000",
ifelse(value == 10000, "10000", "50000")
))

total_number_of_clusters_by_size <- summary_df_filter_combined %>%
total_number_of_clusters_by_size <- summary_temp %>%
group_by(cluster_size) %>%
summarize(all_clusters = n())


number_of_monoclonal_cluster_by_size <- summary_df_filter_combined %>%
number_of_monoclonal_cluster_by_size <- summary_temp %>%
filter(clonality == "mono") %>%
group_by(cluster_size) %>%
summarize(monoclonal_clusters = n())
@@ -161,50 +160,9 @@ for (mouse_model in mouse_models) {


frequency_table <- mono_clusters / all_clusters
rownames(all_clusters) <- 2:25
rownames(mono_clusters) <- 2:25
rownames(frequency_table) <- 2:25
rownames(p_values_table) <- 2:25
rownames(fold_change_table) <- 2:25
rownames(expected_values_null_table) <- 2:25



expected_values_null_table[is.na(expected_values_null_table)] <- 0

expected_values_null_per_sample <- expected_values_null_table %>% apply(MARGIN = 2, FUN = sum)

expected_proportion_null_per_sample <- expected_values_null_per_sample / all_clusters %>% apply(MARGIN = 2, FUN = sum)

frequencies <- mono_clusters %>% apply(MARGIN = 2, FUN = sum) / all_clusters %>% apply(MARGIN = 2, FUN = sum)


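## Turn the column-wise p-values (p_values_columns) into significance labels and
## prepend the observed monoclonal / total cluster counts for use as plot annotations.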
annotations <- ifelse(p_values_columns < 0.001, "***",
ifelse(p_values_columns < 0.01, "**",
ifelse(p_values_columns < 0.05, "*", "ns")
)
)
annotations <- paste0(mono_clusters %>% apply(MARGIN = 2, FUN = sum), "/", all_clusters %>% apply(MARGIN = 2, FUN = sum), "\n", annotations)
annotation_data <- data.frame(SampleID = rownames(data), annotations = annotations, frequencies = frequencies, expected = expected_proportion_null_per_sample)

annotation_data$SampleID <- factor(annotation_data$SampleID, levels = annotation_data$SampleID[c(5, 6, 2, 1, 3, 4, 7)])


ggplot(annotation_data, aes(x = SampleID, y = frequencies)) +
ylim(0, 1.1) +
geom_bar(stat = "identity", fill = "#598F8F", alpha = 0.8, width = 0.8, position = position_dodge(width = 0.6)) +
geom_text(
data = annotation_data, aes(x = SampleID, y = frequencies, label = annotations),
position = position_dodge(width = 0.6), vjust = -0.5, size = 4
) +
labs(x = "Sample ID", y = "Proportion of monoclonal Clusters") +
theme_classic() +
scale_x_discrete(limits = levels(annotation_data$SampleID)) +
theme(
axis.text = element_text(size = 14),
axis.title = element_text(size = 14)
) +
geom_point(aes(y = expected), shape = 23, fill = "black", size = 4)


