pre-commit run

cbg-ethz · Jun 10, 2024 · 95d6057 · 95d6057
1 parent beff467
commit 95d6057
Show file tree

Hide file tree

Showing 29 changed files with 2,044 additions and 7,464 deletions.
diff --git a/Rcode/.Rproj.user/shared/notebooks/paths b/Rcode/.Rproj.user/shared/notebooks/paths
@@ -1 +1,2 @@
+/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/renv/activate.R="42F1FF6D"
 /Users/jgawron/Documents/projects/CTC_backup/validation_experiment/foldchange_in_barcode_prevalence.R="097865BC"
diff --git a/Rcode/ComputeSplittingStatistics.R b/Rcode/ComputeSplittingStatistics.R
@@ -1,7 +1,7 @@
 source("functions.R")
 
 ############
-#Config
+# Config
 ############
 inputFolder <- "../../input_folder"
 treeName <- "LM2"
@@ -10,14 +10,14 @@ treeName <- "LM2"
 
 
 ############
-#Data preprocessing
+# Data preprocessing
 ############
 input <- load_data(inputFolder, treeName)
 
 postSampling <- input$postSampling
 nClusters <- input$nClusters
 ClusterID <- input$clusterID
-nCells <- input$nCells  
+nCells <- input$nCells
 nMutations <- input$nMutations
 nClusters <- input$nClusters
 alleleCount <- input$alleleCount
@@ -31,48 +31,49 @@ annotations <- input$annotations
 
 totalReadCountVector <- totalReadCounts %>% unlist()
 
-fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = 'log'))
+fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = "log"))
 fit2 <- glm.nb(totalReadCountVector ~ 1)
-fit3 <- zeroinfl(totalReadCountVector ~1, dist = 'negbin')
-fit4 <- zeroinfl(totalReadCountVector ~1, dist = 'poisson')
+fit3 <- zeroinfl(totalReadCountVector ~ 1, dist = "negbin")
+fit4 <- zeroinfl(totalReadCountVector ~ 1, dist = "poisson")
 
 summary(fit1)
 summary(fit2)
 exp(coef(fit))
 coef(fit2)
 
-coeficients <- exp(summary(fit3)$coefficients$count[,1])
+coeficients <- exp(summary(fit3)$coefficients$count[, 1])
 
 exp(coef(fit3))
 summary(fit4)
 
 
 simNew <- ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0,
-                 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)))
+  0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))
+)
 
 sim <- data.frame(sim = vector(), run = vector())
 
-for(i in 1:100){
-
+for (i in 1:100) {
   simNew <- ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0,
-                   0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)))
-
+    0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))
+  )
+
   sim <- rbind(sim, data.frame(sim = simNew, run = i))
 }
 
 
 sim <- rbind(sim, data.frame(sim = totalReadCountVector, run = 0))
 
 sim %>%
-  ggplot(aes(x = sim, group = run)) + 
-  geom_histogram(data = sim[sim$run == 0,], alpha = 0.4, color = 'darkseagreen', fill = 'darkseagreen') +
-  geom_freqpoly(data = sim[sim$run != 0,], aes(x = sim), color = 'red', position = 'identity', alpha = 0.4)
+  ggplot(aes(x = sim, group = run)) +
+  geom_histogram(data = sim[sim$run == 0, ], alpha = 0.4, color = "darkseagreen", fill = "darkseagreen") +
+  geom_freqpoly(data = sim[sim$run != 0, ], aes(x = sim), color = "red", position = "identity", alpha = 0.4)
 
 
 
 
 ############
-#Unit testing
+# Unit testing
 ############
 
 
@@ -93,7 +94,7 @@ test_ComputePerMutationProbabilityOfPolyclonality()
 
 
 ############
-#Main Analysis
+# Main Analysis
 ############
 
 
@@ -103,7 +104,7 @@ mutationFilter <- apply(mutationDescription, 1, FUN = IsDriver, annotations)
 
 
 
-readCounts <- read_delim('../../input_folder//LM2/LM2.txt', delim = '\t', col_names = FALSE)
+readCounts <- read_delim("../../input_folder//LM2/LM2.txt", delim = "\t", col_names = FALSE)
 
 
 
@@ -122,49 +123,53 @@ print(candidate_pairs$full_distance_matrix)
 
 
 splittingProbs <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells,
-                     nMutations, nClusters,
-                     alleleCount,
-                     mutatedReadCounts, totalReadCounts,
-                     nMutationSamplingEvents = 20, nTreeSamplingEvents = 20)
+  nMutations, nClusters,
+  alleleCount,
+  mutatedReadCounts, totalReadCounts,
+  nMutationSamplingEvents = 20, nTreeSamplingEvents = 20
+)
 
 
-splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability))
+splittingProbs %>%
+  group_by(Cluster) %>%
+  summarize(meanSplittingProbability = mean(Splitting_probability))
 
 ## Go through all clusters and compare all pairs of cells within each cluster with
 ## each other. Note that the cells from the clusters are adjacent to each other by
 ## design, so incrementing the index j by 1 makes sense
 distance <- vector()
 clusterIdentityofdistance <- vector()
-system.time(for (c in 1:nClusters){
-  cellsInCluster <- which(sampleDescription$Cluster == (c-1))-1 ## Make sure array indication is 
+system.time(for (c in 1:nClusters) {
+  cellsInCluster <- which(sampleDescription$Cluster == (c - 1)) - 1 ## Make sure array indication is
   ## compatible with cpp
   cluster_done <- 0
-  for(i in cellsInCluster){
-    if(cluster_done == 1){
+  for (i in cellsInCluster) {
+    if (cluster_done == 1) {
       cluster_done <- 0
       break
     }
-    if(sampleDescription$WBC[i+1] == 1) next
+    if (sampleDescription$WBC[i + 1] == 1) next
     j <- cellsInCluster[1]
-    while(j < i){
-      if(cluster_done == 1){
+    while (j < i) {
+      if (cluster_done == 1) {
         break
       }
-      if(sampleDescription$WBC[j+1] == 1){
+      if (sampleDescription$WBC[j + 1] == 1) {
         j <- j + 1
         next
       }
       print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " "))
-      distance <- c(distance, produce_Distance_Posterior(i,j,postSampling, treeName, nCells,
-                                                         nMutations, nClusters,
-                                                         alleleCount,sampleDescription$Cluster,
-                                                         mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = 1000))
-      clusterIdentityofdistance <- c(clusterIdentityofdistance, c-1)
+      distance <- c(distance, produce_Distance_Posterior(i, j, postSampling, treeName, nCells,
+        nMutations, nClusters,
+        alleleCount, sampleDescription$Cluster,
+        mutatedReadCounts, totalReadCounts, sampleDescription$WBC,
+        nSamplingEvents = 1000
+      ))
+      clusterIdentityofdistance <- c(clusterIdentityofdistance, c - 1)
       j <- j + 1
       cluster_done <- 1
     }
   }
-
 })
 #########
 
@@ -187,51 +192,53 @@ system.time(for (c in 1:nClusters){
 ## design, so incrementing the index j by 1 makes sense
 distance <- vector()
 clusterIdentityofdistance <- vector()
-system.time(for (c in 1:nClusters){
-  cellsInCluster <- which(sampleDescription$Cluster == (c-1))-1 ## Make sure array indication is 
-                                                ## compatible with cpp
+system.time(for (c in 1:nClusters) {
+  cellsInCluster <- which(sampleDescription$Cluster == (c - 1)) - 1 ## Make sure array indication is
+  ## compatible with cpp
   cluster_done <- 0
-  for(i in cellsInCluster){
-    if(cluster_done == 1){
+  for (i in cellsInCluster) {
+    if (cluster_done == 1) {
       cluster_done <- 0
       break
     }
-    if(sampleDescription$WBC[i+1] == 1) next
+    if (sampleDescription$WBC[i + 1] == 1) next
     j <- cellsInCluster[1]
-    while(j < i){
-      if(cluster_done == 1){
+    while (j < i) {
+      if (cluster_done == 1) {
         break
       }
-      if(sampleDescription$WBC[j+1] == 1){
+      if (sampleDescription$WBC[j + 1] == 1) {
         j <- j + 1
         next
       }
       print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " "))
-      distance <- c(distance, produce_Distance_Posterior(i,j,postSampling, treeName, nCells,
-                                                         nMutations, nClusters,
-                                                         alleleCount,sampleDescription$Cluster,
-                                                         mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = 1000))
-      clusterIdentityofdistance <- c(clusterIdentityofdistance, c-1)
+      distance <- c(distance, produce_Distance_Posterior(i, j, postSampling, treeName, nCells,
+        nMutations, nClusters,
+        alleleCount, sampleDescription$Cluster,
+        mutatedReadCounts, totalReadCounts, sampleDescription$WBC,
+        nSamplingEvents = 1000
+      ))
+      clusterIdentityofdistance <- c(clusterIdentityofdistance, c - 1)
       j <- j + 1
       cluster_done <- 1
     }
   }
-
 })
 #########
 
 
 intraClusterSplitMedianPlot <- ggplot(data.frame(Median_Distance = distance), aes(x = Median_Distance)) +
-  geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black", alpha = 0.7)+ 
-  xlab("Median distance between of leaves within the same cluster") + ylab("total count") +
+  geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black", alpha = 0.7) +
+  xlab("Median distance between of leaves within the same cluster") +
+  ylab("total count") +
   ggtitle(treeName) +
-  labs(subtitle = "Histogram of similarities of cells within cluster",caption = "dashed red line indicates cutoff for oligoclonality") +
+  labs(subtitle = "Histogram of similarities of cells within cluster", caption = "dashed red line indicates cutoff for oligoclonality") +
   theme_minimal() +
   theme(
     plot.title = element_text(size = 24, face = "bold"),
     axis.title.x = element_text(size = 20),
     axis.title.y = element_text(size = 20),
-    plot.subtitle = element_text(size= 20),
+    plot.subtitle = element_text(size = 20),
     axis.text = element_text(size = 16),
     plot.caption = element_text(size = 14)
   )
@@ -240,47 +247,43 @@ plot(intraClusterSplitMedianPlot)
 
 summary(distance)
 print(intraClusterSplitMedianPlot)
-#####Manually adapt
+##### Manually adapt
 cutoffForOligoclonality <- 400
 
 
-intraClusterSplitMedianPlot + geom_vline(xintercept = cutoffForOligoclonality,color = "red", linetype = "dashed", size = 1)
+intraClusterSplitMedianPlot + geom_vline(xintercept = cutoffForOligoclonality, color = "red", linetype = "dashed", size = 1)
 
 
 
 ### Now look at each cluster and determine whether at least one pair of cells split
 clusterSplits <- vector()
 
-for (c in 1:nClusters){
-  cellPairsInCluster <- which(clusterIdentityofdistance == (c-1))
+for (c in 1:nClusters) {
+  cellPairsInCluster <- which(clusterIdentityofdistance == (c - 1))
   print(cellPairsInCluster)
   print("Distances:")
   print(distance[cellPairsInCluster])
-  
-  if(length(cellPairsInCluster) == 0) next
-  else if (max(distance[cellPairsInCluster])>cutoffForOligoclonality){
-    clusterSplits <- c(clusterSplits,1)
-  }
-  else {
-    clusterSplits <- c(clusterSplits,0)
+
+  if (length(cellPairsInCluster) == 0) {
+    next
+  } else if (max(distance[cellPairsInCluster]) > cutoffForOligoclonality) {
+    clusterSplits <- c(clusterSplits, 1)
+  } else {
+    clusterSplits <- c(clusterSplits, 0)
   }
 }
 which(ClusterID == 25)
 
-produce_Distance_Posterior(35,36, postSampling, "LM2")
+produce_Distance_Posterior(35, 36, postSampling, "LM2")
 
-produce_Distance_Posterior(35,36, postSampling, "LM2")
+produce_Distance_Posterior(35, 36, postSampling, "LM2")
 
 
-#compute_hamming_distance_distr <- function(leaf1, leaf2, postSampling){
-  
+# compute_hamming_distance_distr <- function(leaf1, leaf2, postSampling){
+
 #  for (i in nrow(postSampling)){
 #    tree <- postSampling$Tree[i]
-    #tree <- postSampling$Tree[3200] ##Debugging
-
-#  }  
-#}
-
-
-
+# tree <- postSampling$Tree[3200] ##Debugging
 
+#  }
+# }
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/renv/activate.R="42F1FF6D"
		/Users/jgawron/Documents/projects/CTC_backup/validation_experiment/foldchange_in_barcode_prevalence.R="097865BC"