Skip to content

Commit

Permalink
pre-commit run
Browse files Browse the repository at this point in the history
  • Loading branch information
JohannesGawron committed Jun 10, 2024
1 parent beff467 commit 95d6057
Show file tree
Hide file tree
Showing 29 changed files with 2,044 additions and 7,464 deletions.
1 change: 1 addition & 0 deletions Rcode/.Rproj.user/shared/notebooks/paths
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/renv/activate.R="42F1FF6D"
/Users/jgawron/Documents/projects/CTC_backup/validation_experiment/foldchange_in_barcode_prevalence.R="097865BC"
159 changes: 81 additions & 78 deletions Rcode/ComputeSplittingStatistics.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
source("functions.R")

############
#Config
# Config
############
inputFolder <- "../../input_folder"
treeName <- "LM2"
Expand All @@ -10,14 +10,14 @@ treeName <- "LM2"


############
#Data preprocessing
# Data preprocessing
############
input <- load_data(inputFolder, treeName)

postSampling <- input$postSampling
nClusters <- input$nClusters
ClusterID <- input$clusterID
nCells <- input$nCells
nCells <- input$nCells
nMutations <- input$nMutations
nClusters <- input$nClusters
alleleCount <- input$alleleCount
Expand All @@ -31,48 +31,49 @@ annotations <- input$annotations

totalReadCountVector <- totalReadCounts %>% unlist()

fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = 'log'))
fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = "log"))
fit2 <- glm.nb(totalReadCountVector ~ 1)
fit3 <- zeroinfl(totalReadCountVector ~1, dist = 'negbin')
fit4 <- zeroinfl(totalReadCountVector ~1, dist = 'poisson')
fit3 <- zeroinfl(totalReadCountVector ~ 1, dist = "negbin")
fit4 <- zeroinfl(totalReadCountVector ~ 1, dist = "poisson")

summary(fit1)
summary(fit2)
exp(coef(fit))
coef(fit2)

coeficients <- exp(summary(fit3)$coefficients$count[,1])
coeficients <- exp(summary(fit3)$coefficients$count[, 1])

exp(coef(fit3))
summary(fit4)


simNew <- ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0,
0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)))
0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))
)

sim <- data.frame(sim = vector(), run = vector())

for(i in 1:100){

for (i in 1:100) {
simNew <- ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0,
0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)))

0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))
)

sim <- rbind(sim, data.frame(sim = simNew, run = i))
}


sim <- rbind(sim, data.frame(sim = totalReadCountVector, run = 0))

sim %>%
ggplot(aes(x = sim, group = run)) +
geom_histogram(data = sim[sim$run == 0,], alpha = 0.4, color = 'darkseagreen', fill = 'darkseagreen') +
geom_freqpoly(data = sim[sim$run != 0,], aes(x = sim), color = 'red', position = 'identity', alpha = 0.4)
ggplot(aes(x = sim, group = run)) +
geom_histogram(data = sim[sim$run == 0, ], alpha = 0.4, color = "darkseagreen", fill = "darkseagreen") +
geom_freqpoly(data = sim[sim$run != 0, ], aes(x = sim), color = "red", position = "identity", alpha = 0.4)




############
#Unit testing
# Unit testing
############


Expand All @@ -93,7 +94,7 @@ test_ComputePerMutationProbabilityOfPolyclonality()


############
#Main Analysis
# Main Analysis
############


Expand All @@ -103,7 +104,7 @@ mutationFilter <- apply(mutationDescription, 1, FUN = IsDriver, annotations)



readCounts <- read_delim('../../input_folder//LM2/LM2.txt', delim = '\t', col_names = FALSE)
readCounts <- read_delim("../../input_folder//LM2/LM2.txt", delim = "\t", col_names = FALSE)



Expand All @@ -122,49 +123,53 @@ print(candidate_pairs$full_distance_matrix)


splittingProbs <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells,
nMutations, nClusters,
alleleCount,
mutatedReadCounts, totalReadCounts,
nMutationSamplingEvents = 20, nTreeSamplingEvents = 20)
nMutations, nClusters,
alleleCount,
mutatedReadCounts, totalReadCounts,
nMutationSamplingEvents = 20, nTreeSamplingEvents = 20
)


splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability))
splittingProbs %>%
group_by(Cluster) %>%
summarize(meanSplittingProbability = mean(Splitting_probability))

## Go through all clusters and compare all pairs of cells within each cluster with
## each other. Note that the cells from the clusters are adjacent to each other by
## design, so incrementing the index j by 1 makes sense
distance <- vector()
clusterIdentityofdistance <- vector()
system.time(for (c in 1:nClusters){
cellsInCluster <- which(sampleDescription$Cluster == (c-1))-1 ## Make sure array indication is
system.time(for (c in 1:nClusters) {
cellsInCluster <- which(sampleDescription$Cluster == (c - 1)) - 1 ## Make sure array indication is
## compatible with cpp
cluster_done <- 0
for(i in cellsInCluster){
if(cluster_done == 1){
for (i in cellsInCluster) {
if (cluster_done == 1) {
cluster_done <- 0
break
}
if(sampleDescription$WBC[i+1] == 1) next
if (sampleDescription$WBC[i + 1] == 1) next
j <- cellsInCluster[1]
while(j < i){
if(cluster_done == 1){
while (j < i) {
if (cluster_done == 1) {
break
}
if(sampleDescription$WBC[j+1] == 1){
if (sampleDescription$WBC[j + 1] == 1) {
j <- j + 1
next
}
print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " "))
distance <- c(distance, produce_Distance_Posterior(i,j,postSampling, treeName, nCells,
nMutations, nClusters,
alleleCount,sampleDescription$Cluster,
mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = 1000))
clusterIdentityofdistance <- c(clusterIdentityofdistance, c-1)
distance <- c(distance, produce_Distance_Posterior(i, j, postSampling, treeName, nCells,
nMutations, nClusters,
alleleCount, sampleDescription$Cluster,
mutatedReadCounts, totalReadCounts, sampleDescription$WBC,
nSamplingEvents = 1000
))
clusterIdentityofdistance <- c(clusterIdentityofdistance, c - 1)
j <- j + 1
cluster_done <- 1
}
}

})
#########

Expand All @@ -187,51 +192,53 @@ system.time(for (c in 1:nClusters){
## design, so incrementing the index j by 1 makes sense
distance <- vector()
clusterIdentityofdistance <- vector()
system.time(for (c in 1:nClusters){
cellsInCluster <- which(sampleDescription$Cluster == (c-1))-1 ## Make sure array indication is
## compatible with cpp
system.time(for (c in 1:nClusters) {
cellsInCluster <- which(sampleDescription$Cluster == (c - 1)) - 1 ## Make sure array indication is
## compatible with cpp
cluster_done <- 0
for(i in cellsInCluster){
if(cluster_done == 1){
for (i in cellsInCluster) {
if (cluster_done == 1) {
cluster_done <- 0
break
}
if(sampleDescription$WBC[i+1] == 1) next
if (sampleDescription$WBC[i + 1] == 1) next
j <- cellsInCluster[1]
while(j < i){
if(cluster_done == 1){
while (j < i) {
if (cluster_done == 1) {
break
}
if(sampleDescription$WBC[j+1] == 1){
if (sampleDescription$WBC[j + 1] == 1) {
j <- j + 1
next
}
print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " "))
distance <- c(distance, produce_Distance_Posterior(i,j,postSampling, treeName, nCells,
nMutations, nClusters,
alleleCount,sampleDescription$Cluster,
mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = 1000))
clusterIdentityofdistance <- c(clusterIdentityofdistance, c-1)
distance <- c(distance, produce_Distance_Posterior(i, j, postSampling, treeName, nCells,
nMutations, nClusters,
alleleCount, sampleDescription$Cluster,
mutatedReadCounts, totalReadCounts, sampleDescription$WBC,
nSamplingEvents = 1000
))
clusterIdentityofdistance <- c(clusterIdentityofdistance, c - 1)
j <- j + 1
cluster_done <- 1
}
}

})
#########


intraClusterSplitMedianPlot <- ggplot(data.frame(Median_Distance = distance), aes(x = Median_Distance)) +
geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black", alpha = 0.7)+
xlab("Median distance between of leaves within the same cluster") + ylab("total count") +
geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black", alpha = 0.7) +
xlab("Median distance between of leaves within the same cluster") +
ylab("total count") +
ggtitle(treeName) +
labs(subtitle = "Histogram of similarities of cells within cluster",caption = "dashed red line indicates cutoff for oligoclonality") +
labs(subtitle = "Histogram of similarities of cells within cluster", caption = "dashed red line indicates cutoff for oligoclonality") +
theme_minimal() +
theme(
plot.title = element_text(size = 24, face = "bold"),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
plot.subtitle = element_text(size= 20),
plot.subtitle = element_text(size = 20),
axis.text = element_text(size = 16),
plot.caption = element_text(size = 14)
)
Expand All @@ -240,47 +247,43 @@ plot(intraClusterSplitMedianPlot)

summary(distance)
print(intraClusterSplitMedianPlot)
#####Manually adapt
##### Manually adapt
cutoffForOligoclonality <- 400


intraClusterSplitMedianPlot + geom_vline(xintercept = cutoffForOligoclonality,color = "red", linetype = "dashed", size = 1)
intraClusterSplitMedianPlot + geom_vline(xintercept = cutoffForOligoclonality, color = "red", linetype = "dashed", size = 1)



### Now look at each cluster and determine whether at least one pair of cells split
clusterSplits <- vector()

for (c in 1:nClusters){
cellPairsInCluster <- which(clusterIdentityofdistance == (c-1))
for (c in 1:nClusters) {
cellPairsInCluster <- which(clusterIdentityofdistance == (c - 1))
print(cellPairsInCluster)
print("Distances:")
print(distance[cellPairsInCluster])
if(length(cellPairsInCluster) == 0) next
else if (max(distance[cellPairsInCluster])>cutoffForOligoclonality){
clusterSplits <- c(clusterSplits,1)
}
else {
clusterSplits <- c(clusterSplits,0)

if (length(cellPairsInCluster) == 0) {
next
} else if (max(distance[cellPairsInCluster]) > cutoffForOligoclonality) {
clusterSplits <- c(clusterSplits, 1)
} else {
clusterSplits <- c(clusterSplits, 0)
}
}
which(ClusterID == 25)

produce_Distance_Posterior(35,36, postSampling, "LM2")
produce_Distance_Posterior(35, 36, postSampling, "LM2")

produce_Distance_Posterior(35,36, postSampling, "LM2")
produce_Distance_Posterior(35, 36, postSampling, "LM2")


#compute_hamming_distance_distr <- function(leaf1, leaf2, postSampling){
# compute_hamming_distance_distr <- function(leaf1, leaf2, postSampling){

# for (i in nrow(postSampling)){
# tree <- postSampling$Tree[i]
#tree <- postSampling$Tree[3200] ##Debugging

# }
#}



# tree <- postSampling$Tree[3200] ##Debugging

# }
# }
Loading

0 comments on commit 95d6057

Please sign in to comment.