diff --git a/experiments/assessing_cluster_clonality/workflow/Snakefile b/experiments/assessing_cluster_clonality/workflow/Snakefile
index 18d4761..cbde19e 100644
--- a/experiments/assessing_cluster_clonality/workflow/Snakefile
+++ b/experiments/assessing_cluster_clonality/workflow/Snakefile
@@ -17,7 +17,6 @@ MARKDOWNS = PROJECT_DIR / "data" / "markdowns"
 
 
 ######Rules######
-include: "rules/common.smk"
 include: "rules/base.smk"
 
 
diff --git a/experiments/assessing_cluster_clonality/workflow/envs/R.yml b/experiments/assessing_cluster_clonality/workflow/envs/R.yml
index c6715dc..927420c 100644
--- a/experiments/assessing_cluster_clonality/workflow/envs/R.yml
+++ b/experiments/assessing_cluster_clonality/workflow/envs/R.yml
@@ -9,3 +9,7 @@ dependencies:
   - r-tidyverse>=2.0
   - pandoc>=3.1
   - r-heatmaply>=1.5  
+  - r-optparse>=1.7
+  - r-viridis>=0.6
+  - r-vgam>=1.1
+  - r-pscl>=1.5
diff --git a/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R b/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R
index c77849e..2f87b94 100755
--- a/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R
+++ b/experiments/assessing_cluster_clonality/workflow/resources/annotateVariants.R
@@ -11,12 +11,12 @@ annotate_variants <- function(sampleName, inputFolder, variantList) {
 
 
   # Read VCF file to extract column names
-  file <- file.path(inputFolder, "filtered", "vcf_files_annotated", paste0(sampleName, ".ann.vcf"))
+  file <- file.path(inputFolder, "annotations", paste0(sampleName, ".ann.vcf"))
   lines <- readLines(file, warn = FALSE)
   vcf_names <- strsplit(lines[grep("^#CHROM", lines)], "\t")[[1]]
 
   # Read VCF file into a data frame
-  vcf <- read.table(file.path(inputFolder, "filtered", "vcf_files_annotated", paste0(sampleName, ".ann.vcf")),
+  vcf <- read.table(file.path(inputFolder, "annotations", paste0(sampleName, ".ann.vcf")),
     comment.char = "#", sep = "\t", header = FALSE, col.names = vcf_names
   )
   colnames(vcf)[1] <- "#CHROM"
diff --git a/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R b/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R
index d496c50..e668609 100644
--- a/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R
+++ b/experiments/assessing_cluster_clonality/workflow/scripts/createInputSummary.R
@@ -2,7 +2,7 @@ source("../resources/functions.R")
 library("optparse")
 
 parser <- OptionParser()
-parser <- add_option(parser, c("-i", "--input-file"),
+parser <- add_option(parser, c("-i", "--input-folder"),
   type = "character",
   default = "~/Documents/projects/CTC_backup/input_folder", help = "Path to the folder containing all input files"
 )
@@ -10,22 +10,23 @@ parser <- add_option(parser, c("-n", "--name-of-tree"),
   type = "character",
   default = "Br23", help = "Name of the tree for which to simulate CTC-clusters"
 )
-args <- parse_args(parser, args = c("--input-file", "--name-of-tree"))
+args <- parse_args(parser)
 
 
 
 
-inputFolder <- dirname(args$"input-file")
-treeName <- args$name_of_tree
+input_folder <- args$"input-folder"
+tree_name <- args$"name-of-tree"
 
 
-# inputFolder <- "~/Documents/projects/CTC_backup/input_folder"
-# treeName <- "Br23"
 
+# input_folder <- "~/Documents/projects/CTC_backup/input_folder"
+# tree_name <- "Br23"
 
-input <- load_data(inputFolder, treeName)
 
-allClusterSizes <- input$sample_description %>%
+input <- load_data(input_folder, tree_name)
+
+all_cluster_sizes <- input$sample_description %>%
   filter(WBC == 0 & color != "gray93") %>%
   group_by(color) %>%
   filter(n() > 1) %>%
@@ -33,4 +34,5 @@ allClusterSizes <- input$sample_description %>%
   dplyr::select("cluster_size") %>%
   unique()
 
-write_csv(allClusterSizes, file.path(inputFolder, treeName, paste(treeName, "clusterSizes.csv", sep = "_")))
+write_csv(all_cluster_sizes, file.path(input_folder, tree_name, paste(tree_name, "clusterSizes.csv", sep = "_")))
+print("Success.")
\ No newline at end of file
diff --git a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R
index 2b420e6..e9c3a2c 100644
--- a/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R
+++ b/experiments/assessing_cluster_clonality/workflow/scripts/simulateCTCclusters.R
@@ -1,17 +1,27 @@
-source("functions.R")
 library(viridis)
 library(VGAM)
 library(pscl)
 library(MASS)
 library(boot)
+source("functions.R")
 library("optparse")
 
 ############
 # Config
 ############
 
+
+color_palette <-
+  list(
+    "orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid",
+    "darkorchid1", "darkorchid2", "darkorchid3", "darkorchid4", "purple",
+    "purple1", "purple2", "purple3", "purple4"
+  )
+
+
+
 parser <- OptionParser()
-parser <- add_option(parser, c("-i", "--input-file"),
+parser <- add_option(parser, c("-i", "--input-folder"),
   type = "character",
   default = "~/Documents/projects/CTC_backup/input_folder", help = "Path to the folder containing all input files"
 )
@@ -20,27 +30,33 @@ parser <- add_option(parser, c("-n", "--name-of-tree"),
   default = "Br23", help = "Name of the tree for which to simulate CTC-clusters"
 )
 parser <- add_option(parser, c("-s", "--simulation-cluster-size"),
-  type = "character",
+  type = "numeric",
   default = "2", help = "Number of cells in the simulated clusters"
 )
 parser <- add_option(parser, c("-o", "--output-folder"),
   type = "character",
   default = "~/Documents/projects/CTC_backup/simulations/simulation2", help = ""
 )
-args <- parse_args(parser, args = c("--input-folder", "--name-of-tree", "--simulation-cluster-size", "--output-folder"))
-
-
-inputFolder <- dirname(args$input - file)
-treeName <- args$name_of_tree
-clusterSize <- args$simulation - cluster - size
-outputFolder <- args$output - folder
+parser <- add_option(parser, c("-m", "--monoclonal"),
+  type = "logical",
+  default = TRUE, help = ""
+)
 
-# inputFolder <- "~/Documents/projects/CTC_backup/input_folder"
-# treeName <- "Br23"
+args <- parse_args(parser)
 
 
-input <- load_data(inputFolder, treeName)
+input_folder <- args$"input-folder"
+tree_name <- args$"name-of-tree"
+cluster_size <- args$"simulation-cluster-size"
+output_folder <- args$"output-folder"
+monoclonal <- args$monoclonal
 
+# input_folder <- "~/Documents/projects/CTC_backup/input_folder"
+# tree_name <- "Br23"
+print(input_folder)
+print(tree_name)
+input <- load_data(input_folder, tree_name)
+print("Input data successfully loaded.")
 
 #
 # ############
@@ -48,7 +64,7 @@ input <- load_data(inputFolder, treeName)
 # ############
 #
 #
-# # input <- load_data(inputFolder, treeName)
+# # input <- load_data(input_folder, tree_name)
 # # totalReadCounts <- input$totalReadCounts
 # # sampleDescription <- input$sample_description
 #
@@ -112,14 +128,16 @@ input <- load_data(inputFolder, treeName)
 
 
 
-#' Fits a zero inflated negative binomial distribution to the total read count data.
+#' Fits a zero inflated negative binomial distribution
+#' to the total read count data.
 #'
 #' @param input The loaded dataset
-#' @param zeroInfl If this boolean value is FALSE, then a negative binomial is fit to the data
+#' @param zeroInfl If this boolean value is FALSE,
+#' then a negative binomial is fit to the data
 #'
 #'
-#' @return The parameters of the distribution. If zeroInfl is false, then the zero probability
-#' is set to 0.
+#' @return The parameters of the distribution. If zeroInfl is false,
+#' then the zero probability is set to 0.
 #' @export
 #'
 #' @examples
@@ -130,10 +148,18 @@ fitReadCountDistribution <- function(input, zeroInfl = TRUE) {
 
   if (zeroInfl == TRUE) {
     fit <- zeroinfl(totalReadCountVector ~ 1, dist = "negbin")
-    return(list(zeroProb = inv.logit(summary(fit)$coefficients$zero[1]), theta = exp(summary(fit)$coefficients$count[2, 1]), expValue = exp(summary(fit)$coefficients$count[1, 1])))
+    return(
+      list(
+        zeroProb = inv.logit(summary(fit)$coefficients$zero[1]),
+        theta = exp(summary(fit)$coefficients$count[2, 1]),
+        expValue = exp(summary(fit)$coefficients$count[1, 1])
+      )
+    )
   } else {
     fit <- glm.nb(totalReadCountVector ~ 1)
-    return(list(zeroProb = 0, theta = summary(fit)$theta, expValue = exp(coef(fit))))
+    return(
+      list(zeroProb = 0, theta = summary(fit)$theta, expValue = exp(coef(fit)))
+    )
   }
 }
 
@@ -167,37 +193,42 @@ fitReadCountDistribution <- function(input, zeroInfl = TRUE) {
 #' @export
 #'
 #' @examples
-simulateReads <- function(nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorRate, readCountFit) {
-  # draw from a binomial model to simulate dropouts
-  nWildtypeAlleles <- rbinom(1, size = nWildtypeAlleles, prob = (1 - dropoutRate))
-  nMutatedAlleles <- rbinom(1, size = nMutatedAlleles, prob = (1 - dropoutRate))
+simulateReads <-
+  function(
+      nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorRate, readCountFit) {
+    # draw from a binomial model to simulate dropouts
+    nWildtypeAlleles <- rbinom(1, size = nWildtypeAlleles, prob = (1 - dropoutRate))
+    nMutatedAlleles <- rbinom(1, size = nMutatedAlleles, prob = (1 - dropoutRate))
 
 
 
 
-  # draw from a negative-binomial to simulate the total read count
-  isZero <- rbinom(1, size = 1, p = readCountFit$zeroProb)
-  if (isZero == TRUE) {
-    nReads <- 0
-  } else {
-    nReads <- rnegbin(1, mu = readCountFit$expValue, theta = readCountFit$theta)
-  }
+    # draw from a negative-binomial to simulate the total read count
+    isZero <- rbinom(1, size = 1, p = readCountFit$zeroProb)
+    if (isZero == TRUE) {
+      nReads <- 0
+    } else {
+      nReads <- rnegbin(1, mu = readCountFit$expValue, theta = readCountFit$theta)
+    }
 
 
-  # draw from a beta-binomial to simulate overdispersion through multiple-
-  # displacement amplification
-  nWildtypeReads <- rbetabinom.ab(n = 1, size = nReads, shape1 = nWildtypeAlleles, shape2 = nMutatedAlleles)
+    # draw from a beta-binomial to simulate overdispersion through multiple-
+    # displacement amplification
+    nWildtypeReads <-
+      rbetabinom.ab(
+        n = 1, size = nReads, shape1 = nWildtypeAlleles, shape2 = nMutatedAlleles
+      )
 
-  nMutatedReads <- nReads - nWildtypeReads
+    nMutatedReads <- nReads - nWildtypeReads
 
-  # randomly flip the genotypes of reads with a certain error rate
-  falsePositives <- rbinom(1, size = nReads - nMutatedReads, prob = errorRate)
-  falseNegatives <- rbinom(1, size = nMutatedReads, prob = errorRate)
+    # randomly flip the genotypes of reads with a certain error rate
+    falsePositives <- rbinom(1, size = nReads - nMutatedReads, prob = errorRate)
+    falseNegatives <- rbinom(1, size = nMutatedReads, prob = errorRate)
 
-  nMutatedReads <- nMutatedReads + falsePositives - falseNegatives
+    nMutatedReads <- nMutatedReads + falsePositives - falseNegatives
 
-  return(list(read_counts = c(nReads, nMutatedReads)))
-}
+    return(list(read_counts = c(nReads, nMutatedReads)))
+  }
 
 
 
@@ -208,7 +239,8 @@ simulateReads <- function(nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorR
 
 #' Calls genotypes of single cells based on the CTC-SCITE algorithm
 #'
-#' @param nTreeSamplingEvents number of sampled trees. Appricimated postserior gets better the higher this number is.
+#' @param nTreeSamplingEvents number of sampled trees. Approximated posterior
+#' gets better the higher this number is.
 #' @param input The loaded data.
 #'
 #' @return returns a data frame in long format that gives the genotype and the
@@ -216,7 +248,7 @@ simulateReads <- function(nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorR
 #' @export
 #'
 #' @examples
-getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) {
+call_genotypes <- function(nTreeSamplingEvents = 1000, input) {
   postSampling <- input$postSampling
   nCells <- input$nCells
   nMutations <- input$nMutations
@@ -227,7 +259,12 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) {
   totalReadCounts <- input$totalReadCounts
 
 
-  desired_values <- sample(1:length(postSampling), size = nTreeSamplingEvents, replace = FALSE) %>% sort()
+  desired_values <-
+    sample(
+      1:length(postSampling),
+      size = nTreeSamplingEvents, replace = FALSE
+    ) %>%
+    sort()
   postSampling <- postSampling[desired_values]
   postSamplingTrees <- lapply(postSampling, FUN = function(entry) {
     return(entry$Tree)
@@ -254,7 +291,10 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) {
     geom_tile(aes(fill = Posterior)) +
     scale_fill_viridis()
 
-  genotypes$WBC <- input$sample_description$WBC[(genotypes$Sample %>% substr(start = 2, stop = nchar(.)) %>% as.numeric())]
+  genotypes$WBC <-
+    input$sample_description$WBC[
+      (genotypes$Sample %>% substr(start = 2, stop = nchar(.)) %>% as.numeric())
+    ]
 
   genotypes %>%
     mutate(WBC = as.factor(WBC)) %>%
@@ -264,11 +304,23 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) {
 
 
 
-  genotypes <- genotypes %>% mutate(Mutation = as.numeric(Mutation), Genotype = as.integer(Posterior > 0.5))
+  genotypes <-
+    genotypes %>%
+    mutate(
+      Mutation = as.numeric(Mutation), Genotype = as.integer(Posterior > 0.5)
+    )
 
   genotypes %>%
-    filter(Sample %in%
-      paste0("X", which(input$sample_description$single_cell == TRUE & input$sample_description$WBC == FALSE))) %>%
+    filter(
+      Sample %in%
+        paste0(
+          "X",
+          which(
+            input$sample_description$single_cell == TRUE &
+              input$sample_description$WBC == FALSE
+          )
+        )
+    ) %>%
     ggplot(aes(Mutation, Sample)) +
     geom_tile(aes(fill = Genotype)) +
     scale_fill_viridis()
@@ -277,6 +329,292 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) {
 }
 
 
+#' Samples a a specified number of genotypes
+#'
+#' @param input the generic input data from the CTC-SCITE tree sampling
+#' @param genotypes called genotypes for each cell in long format
+#' @param sampling_size a vector that indicates how many
+#'
+#' @return
+#' @export
+#'
+#' @examples
+sample_genotypes <- function(input, genotypes, sampling_size) {
+  cellIDs <- paste0("X", 1:nrow(input$sample_description))
+
+  if (sampling_size > length(unique(genotypes$Sample))) {
+    cells <-
+      sample(
+        size = length(unique(genotypes$Sample)), x = cellIDs, replace = FALSE
+      )
+    stop("You want to sample more genotypes than can be provided")
+  } else {
+    cells <- sample(size = sampling_size, x = cellIDs, replace = FALSE)
+  }
+  return(cells)
+}
+
+
+
+#' Appends the simulated data to the original data and writes to new files
+#'
+#' @param output_directory
+#' @param input The generic tree sampling data
+#' @param output_label a name for the simulated dataset files, e.g. the number
+#' of clusters
+#' @param simulated_sample_description The lines for the sample description file
+#' adding the simulated CTC clusters
+#' @param genotypes_output_format The read counts for the simulated data
+#'
+#' @return No return, but writes the files to disk
+#' @export
+#'
+#' @examples
+create_simulated_output <-
+  function(output_directory, input, output_label, simulated_sample_description,
+           genotypes_output_format) {
+    print("Writing output files")
+
+    dir.create(
+      file.path(
+        output_directory, paste(input$sampleName, output_label, sep = "_")
+      ),
+      recursive = TRUE
+    )
+    description_data <-
+      read_delim(
+        file.path(
+          input$directory,
+          input$sampleName,
+          paste0(input$sampleName, "_samples_nodeDescription.tsv")
+        ),
+        delim = "\t",
+        col_names = FALSE,
+        quote = "none"
+      )
+    colnames(description_data) <-
+      c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description")
+
+    description_data <-
+      rbind(description_data, simulated_sample_description)
+    write_delim(
+      x = description_data,
+      file = file.path(
+        output_directory,
+        paste(input$sampleName, output_label, sep = "_"),
+        paste0(
+          input$sampleName, "_",
+          output_label,
+          "_samples_nodeDescription.tsv"
+        )
+      ),
+      delim = "\t",
+      col_names = FALSE,
+      quote = "none",
+      escape = "none"
+    )
+
+    read_data <-
+      read_delim(
+        file.path(
+          input$directory,
+          input$sampleName,
+          paste0(input$sampleName, ".txt")
+        ),
+        delim = "\t",
+        col_names = FALSE,
+        escape_backslash = TRUE
+      )
+
+    read_data <- cbind(read_data, genotypes_output_format)
+
+    write_delim(
+      x = read_data,
+      file = file.path(
+        output_directory,
+        paste(input$sampleName, output_label, sep = "_"),
+        paste0(input$sampleName, "_", output_label, ".txt")
+      ),
+      delim = "\t",
+      col_names = FALSE,
+      quote = "none",
+      escape = "none"
+    )
+  }
+
+
+#' Create input files for the CTC SCITE algorithm with
+#' one simulated oligoclonal cluster
+#'
+#' @param input The generic posterior sampling from CTC-SCITE
+#' @param number_of_cells The size of the output CTC-cluster
+#' @param output_directory The directory to write the output to
+#'
+#' @return
+#' @export
+#'
+#' @examples
+simulate_oligoclonals <- function(input, output_directory, number_of_cells, sampling_size = 100) {
+  read_data <-
+    read_delim(
+      file.path(
+        input$directory,
+        input$sampleName,
+        paste0(input$sampleName, ".txt")
+      ),
+      delim = "\t",
+      col_names = FALSE,
+      escape_backslash = TRUE
+    )
+
+  description_data <-
+    read_delim(
+      file.path(
+        input$directory,
+        input$sampleName,
+        paste0(input$sampleName, "_samples_nodeDescription.tsv")
+      ),
+      delim = "\t",
+      col_names = FALSE,
+      quote = "none"
+    )
+  colnames(description_data) <-
+    c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description")
+
+
+
+
+  cluster_identity_of_cells <- c()
+  idx <- 1
+  for (cell_count in description_data$total_number_cells) {
+    for (i in 1:cell_count) {
+      cluster_identity_of_cells <- c(cluster_identity_of_cells, idx)
+    }
+    idx <- idx + 1
+  }
+
+
+
+  genotypes <- call_genotypes(nTreeSamplingEvents = sampling_size, input = input)
+  genotypes_wide <-
+    genotypes %>%
+    dplyr::select(c(Sample, Genotype, Mutation)) %>%
+    pivot_wider(names_from = "Sample", values_from = Genotype)
+
+  rownames(genotypes_wide) <- genotypes_wide$Mutation
+  genotypes_wide <- genotypes_wide[, 2:ncol(genotypes_wide)]
+
+
+  single_cell_indices <- which(description_data$tumor_cells == 1 & description_data$total_number_cells == 1)
+
+  hamming_distance <- function(x, y) {
+    return(sum(x != y))
+  }
+
+  while (TRUE) { # I sample until I get a cluster where at least two cells are distinct
+    # Sample cells to merge
+    cells_to_merge <-
+      sample(single_cell_indices, number_of_cells, replace = FALSE)
+    cell_identity <- which(cluster_identity_of_cells %in% cells_to_merge)
+
+    ## Check if all cells have the same genotype:
+    ## If not, the cluster is oligoclonal and we are fine.
+
+    sum_of_distances <- 0
+    for (j in 1:length(cell_identity)) {
+      for (k in 1:length(cell_identity)) {
+        sum_of_distances <-
+          sum_of_distances +
+          hamming_distance(
+            genotypes_wide[, cell_identity[j]],
+            genotypes_wide[, cell_identity[k]]
+          )
+      }
+    }
+    if (sum_of_distances > 0) {
+      break
+    }
+  }
+
+
+  aggregated_data_ref <- rep(0, dim(read_data)[1])
+  aggregated_data_alt <- rep(0, dim(read_data)[1])
+
+  for (cell in cells_to_merge) {
+    aggregated_data_ref <- aggregated_data_ref + read_data[, 4 + 2 * cell - 1]
+    aggregated_data_alt <- aggregated_data_alt + read_data[, 4 + 2 * cell]
+  }
+
+  columns_to_remove <- c(4 + 2 * cells_to_merge - 1, 4 + 2 * cells_to_merge)
+
+  read_data <- cbind(read_data, aggregated_data_ref, aggregated_data_alt)
+  read_data <- read_data[, -columns_to_remove]
+
+  output_label <- paste(cells_to_merge, collapse = "_")
+
+  newSample <- data.frame(
+    sample_name = paste(input$sampleName, "sim", output_label, sep = "_"),
+    total_number_cells = number_of_cells, tumor_cells = number_of_cells,
+    WBCs = 0,
+    description =
+      paste0(
+        "[color=", color_palette[[1]],
+        ',label="', input$sampleName, "_sim",
+        '",fillcolor=',
+        color_palette[[1]],
+        ',image="../CTC-cluster-icons/cluster_',
+        number_of_cells,
+        '-0.png"]'
+      )
+  )
+
+  description_data_output_format <-
+    rbind(description_data, newSample)
+  description_data_output_format <-
+    description_data_output_format[-cells_to_merge, ]
+
+
+
+  dir.create(
+    file.path(
+      output_directory, paste(input$sampleName, output_label, sep = "_")
+    ),
+    recursive = TRUE
+  )
+
+  write_delim(
+    x = read_data,
+    file = file.path(
+      output_directory,
+      paste(input$sampleName, output_label, sep = "_"),
+      paste0(input$sampleName, "_", output_label, ".txt")
+    ),
+    delim = "\t",
+    col_names = FALSE,
+    quote = "none",
+    escape = "none"
+  )
+
+  write_delim(
+    x = description_data,
+    file = file.path(
+      output_directory,
+      paste(input$sampleName, output_label, sep = "_"),
+      paste0(
+        input$sampleName, "_",
+        output_label,
+        "_samples_nodeDescription.tsv"
+      )
+    ),
+    delim = "\t",
+    col_names = FALSE,
+    quote = "none",
+    escape = "none"
+  )
+}
+
+
+
 
 #' Creates the input dataset for CTC-SCITE run with simulated CTC-clusters.
 #'
@@ -293,12 +631,13 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) {
 #'
 #'
 #' @param samplingSize number of trees to determine the genotype of individual cells.
-#' To be passed to getGenotypeMatrix.
-#' @param clusterSizeVector A number that indicates the cluster complexity to by simulated
+#' To be passed to call_genotypes
+#' @param cluster_size_vector A number that indicates the cluster complexity to be simulated
 #' (i.e. the number of cells in the cluster)
 #' @param input the loaded dataset
 #' @param output_directory Directory to write the simulated input files for
 #' the CTC-SCITE run to.
+#' @param output_label The number of cells in the simulated cluster
 #' @param dropoutRate The dropout rate to assume for the simulation
 #' @param errorRate The error rate to assume for the simulation
 #' @param seed Set a seed for reproducibility
@@ -310,88 +649,115 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 1000, input) {
 #' @export
 #'
 #' @examples
-simulateCTCclusters <- function(samplingSize, clusterSizeVector, input, output_directory, output_label, dropoutRate = 0.3, errorRate = 0.001, seed = 123, zeroInflated = TRUE) {
+simulateCTCclusters <- function(
+    samplingSize,
+    cluster_size_vector,
+    input,
+    output_directory,
+    output_label,
+    dropoutRate = 0.3,
+    errorRate = 0.001,
+    seed = 123,
+    zeroInflated = TRUE) {
   set.seed(seed)
-  color_palette <- list("orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid", "darkorchid1", "darkorchid2", "darkorchid3", "darkorchid4", "purple", "purple1", "purple2", "purple3", "purple4")
+  #  color_palette <-
+  #    list(
+  #      "orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid",
+  #      "darkorchid1", "darkorchid2", "darkorchid3", "darkorchid4", "purple",
+  #      "purple1", "purple2", "purple3", "purple4"
+  #      )
 
 
-  fit <- fitReadCountDistribution(input, zeroInfl = zeroInflated)
-
   print("Calling genotypes")
-  # Output data frame in long format. This is essentially a cell x mutation genotype matrix.
-  # This represents to pool of genotypes from which I can now sample for the simulation.
-
-  genotypes <- getGenotypeMatrix(nTreeSamplingEvents = samplingSize, input = input)
+  # Output data frame in long format.
+  # This is essentially a cell x mutation genotype matrix.
+  # This represents to pool of genotypes from which
+  # I can now sample for the simulation.
 
+  genotypes <-
+    call_genotypes(nTreeSamplingEvents = samplingSize, input = input)
 
-  genotypesOutputFormat <- data.frame(matrix(0, nrow = input$nMutations, ncol = 0))
-  sampleDescriptionOutputFormat <- data.frame(matrix(0, nrow = 0, ncol = 5))
-  colnames(sampleDescriptionOutputFormat) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description")
+  fit <- fitReadCountDistribution(input, zeroInfl = zeroInflated)
 
-  ## Sample as many genotypes as there should be simulated clusters.
-  cellIDs <- paste0("X", 1:nrow(input$sample_description))
 
 
+  cells <-
+    sample_genotypes(
+      input = input,
+      genotypes = genotypes,
+      sampling_size = sum(cluster_size_vector)
+    )
 
-  if (sum(clusterSizeVector) > length(unique(genotypes$Sample))) {
-    cells <- sample(size = length(unique(genotypes$Sample)), x = cellIDs, replace = FALSE)
-    stop("You want to sample more genotypes than can be provided")
-  } else {
-    cells <- sample(size = sum(clusterSizeVector), x = cellIDs, replace = FALSE)
-  }
+  genotypes_output_format <-
+    data.frame(matrix(0, nrow = input$nMutations, ncol = 0))
+  sample_description_output_format <- data.frame(matrix(0, nrow = 0, ncol = 5))
+  colnames(sample_description_output_format) <-
+    c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description")
 
 
   iterator <- 0
-
   # iterating over the size of the clusters to be simulated
-  for (clusterSize in 1:length(clusterSizeVector)) {
-    # iterating over the number of clusters of the same size to be simulated. Here not a for loop, to avoid backwards counting in R.
+  for (size_of_cluster in 1:length(cluster_size_vector)) {
+    # iterating over the number of clusters of the same size to be simulated.
+    # Here not a for loop, to avoid backwards counting in R.
     clustersBySize <- 1
-    while (clustersBySize <= clusterSizeVector[clusterSize]) {
+    while (clustersBySize <= cluster_size_vector[size_of_cluster]) {
       print(paste("Simulating CTC cluster ", iterator))
-      print(paste("Number of cells: ", clusterSize))
+      print(paste("Number of cells: ", size_of_cluster))
       genotype <- genotypes %>%
         filter(Sample == cells[clustersBySize]) %>%
         arrange(Mutation)
 
       genotype <- pull(genotype, Genotype)
 
-      nMutatedAlleles <- clusterSize * genotype
-      nAllelesTotal <- clusterSize * rep(2, length(genotype))
+      nMutatedAlleles <- size_of_cluster * genotype
+      nAllelesTotal <- size_of_cluster * rep(2, length(genotype))
       nWildtypeAlleles <- nAllelesTotal - nMutatedAlleles
-      data <- data.frame(nWildtypeAlleles = nWildtypeAlleles, nMutatedAlleles = nMutatedAlleles)
+      data <- data.frame(
+        nWildtypeAlleles = nWildtypeAlleles, nMutatedAlleles = nMutatedAlleles
+      )
       print("Starting simulation of read counts")
       reads <- apply(data, FUN = function(x) {
-        return(simulateReads(x[1], x[2], dropoutRate, errorRate, fit)$read_counts)
+        return(
+          simulateReads(x[1], x[2], dropoutRate, errorRate, fit)$read_counts
+        )
       }, MARGIN = 1) %>% t()
-      genotypesOutputFormat <- cbind(genotypesOutputFormat, reads)
+      genotypes_output_format <- cbind(genotypes_output_format, reads)
       print("Done")
 
       newSample <- data.frame(
         sample_name = paste0(input$sampleName, "_sim", iterator),
-        total_number_cells = clusterSize, tumor_cells = clusterSize,
+        total_number_cells = size_of_cluster, tumor_cells = size_of_cluster,
         WBCs = 0,
         description =
-          paste0("[color=", color_palette[[iterator + 1]], ',label="', input$sampleName, "_sim", iterator, '",fillcolor=', color_palette[[iterator + 1]], ',image="../CTC-cluster-icons/cluster_', clusterSize, '-0.png"]')
+          paste0(
+            "[color=", color_palette[[iterator + 1]],
+            ',label="', input$sampleName, "_sim",
+            iterator,
+            '",fillcolor=',
+            color_palette[[iterator + 1]],
+            ',image="../CTC-cluster-icons/cluster_',
+            size_of_cluster,
+            '-0.png"]'
+          )
       )
-      sampleDescriptionOutputFormat <- rbind(sampleDescriptionOutputFormat, newSample)
+      sample_description_output_format <-
+        rbind(sample_description_output_format, newSample)
 
       iterator <- iterator + 1
       clustersBySize <- clustersBySize + 1
     }
-  }
-  print("Writing output files")
-
 
-  dir.create(file.path(output_directory, paste(input$sampleName, output_label, sep = "_")), recursive = TRUE)
-  description_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, "_samples_nodeDescription.tsv")), delim = "\t", col_names = FALSE, quote = "none")
-  colnames(description_data) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description")
-  description_data <- rbind(description_data, sampleDescriptionOutputFormat)
-  write_delim(x = description_data, file = file.path(output_directory, paste(input$sampleName, output_label, sep = "_"), paste0(input$sampleName, "_", output_label, "_samples_nodeDescription.tsv")), delim = "\t", col_names = FALSE, quote = "none", escape = "none")
-
-  read_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, ".txt")), delim = "\t", col_names = FALSE, escape_backslash = TRUE)
-  read_data <- cbind(read_data, genotypesOutputFormat)
-  write_delim(x = read_data, file = file.path(output_directory, paste(input$sampleName, output_label, sep = "_"), paste0(input$sampleName, "_", output_label, ".txt")), delim = "\t", col_names = FALSE, quote = "none", escape = "none")
+    if (cluster_size_vector[size_of_cluster] > 0) {
+      create_simulated_output(
+        output_directory,
+        input,
+        size_of_cluster,
+        sample_description_output_format,
+        genotypes_output_format
+      )
+    }
+  }
 }
 
 
@@ -399,19 +765,40 @@ simulateCTCclusters <- function(samplingSize, clusterSizeVector, input, output_d
 
 
 # for (tree in c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", "Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9")) {}
-clusterSizeVector <- c(0, 4, 3, 2, 2, 2, 2, 2, 2)
 
-keep <- rep(0, length(clusterSizeVector))
-keep[clusterSize] <- 1
-clusterSizeVector[keep == 0] <- 0
 
+cluster_size_vector <- c(0, 4, 3, 2, 2, 2, 2, 2, 2)
 
 
-print(paste("Running simulation for", tree))
+print(paste("Running simulation for", tree_name))
 
-simulateCTCclusters(
-  samplingSize = 100, clusterSizeVector = clusterSizeVector, input,
-  output_directory = outputFolder, output_label = output_label,
-  dropoutRate = 0.35, errorRate = 0.0015, seed = 124,
-  zeroInflated = TRUE
-)
+all_cluster_sizes <- input$sample_description %>%
+  filter(WBC == 0 & color != "gray93") %>%
+  group_by(color) %>%
+  filter(n() > 1) %>%
+  summarize(cluster_size = n()) %>%
+  dplyr::select("cluster_size") %>%
+  unique()
+
+
+if (monoclonal == TRUE) {
+  keep <- rep(0, length(cluster_size_vector))
+  keep[cluster_size] <- 1
+  cluster_size_vector[keep == 0] <- 0
+  print("Simulating monoclonal clusters.")
+  simulateCTCclusters(
+    samplingSize = 100, cluster_size_vector = cluster_size_vector, input,
+    output_directory = output_folder, output_label = output_label,
+    dropoutRate = 0.35, errorRate = 0.0015, seed = 124,
+    zeroInflated = TRUE
+  )
+} else {
+  for (idx in 1:nrow(all_cluster_sizes)) {
+    cluster_size <- all_cluster_sizes$cluster_size[idx]
+    print("Simulating oligoclonal clusters.")
+    for (idx2 in 1:cluster_size_vector[cluster_size]) {
+      simulate_oligoclonals(input, output_folder, cluster_size, sampling_size = 100)
+    }
+    print("Success.")
+  }
+}