Merge pull request #10 from d3b-center/update-modules

Fix TIS labels, update WHO annotation labels, update GSVA call
d3b-center · Oct 16, 2024 · e8903ae · e8903ae
2 parents 9b92684 + 45c8d73
commit e8903ae
Show file tree

Hide file tree

Showing 52 changed files with 23,018 additions and 23,089 deletions.
diff --git a/analyses/lgg_xcell_analyses/00-run_xcell.R b/analyses/lgg_xcell_analyses/00-run_xcell.R
@@ -9,27 +9,18 @@ suppressPackageStartupMessages({
 
 # parse arguments
 option_list <- list(
-  make_option(c("--mat"), type = "character", help = "input matrix for immune profiling (.rds)"),
-  make_option(c("--histology"), type = "character", help = "histology file for all OpenPedCan samples (.tsv)"),
-  make_option(
-    c("--short_hist"),
-    type = "character",
-    default = NULL,
-    help = "short histology filter"
-  ),
-  make_option(
-    c("--broad_hist"),
-    type = "character",
-    default = NULL,
-    help = "broad histology filter"
-  ),
-  make_option(
-    c("--cg"),
-    type = "character",
-    default = NULL,
-    help = "cancer group filter"
-  ),
-  make_option(c("--output_dir"), type = "character", help = "output directory")
+  make_option(c("--mat"), type = "character",
+              help = "input matrix for immune profiling (.rds)"),
+  make_option(c("--histology"), type = "character",
+              help = "histology file for all OpenPedCan samples (.tsv)"),
+  make_option(c("--short_hist"), type = "character", default = NULL,
+              help = "short histology filter"),
+  make_option(c("--broad_hist"), type = "character", default = NULL,
+              help = "broad histology filter"),
+  make_option(c("--cg"), type = "character", default = NULL,
+              help = "cancer group filter"),
+  make_option(c("--output_dir"), type = "character",
+              help = "output directory")
 )
 opt <- parse_args(OptionParser(option_list = option_list, add_help_option = TRUE))
 mat <- opt$mat
@@ -41,7 +32,7 @@ output_dir <- opt$output_dir
 
 # directories
 root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
-analysis_dir <- file.path(root_dir, "lgg_xcell_analyses")
+analysis_dir <- file.path(root_dir, "analyses", "lgg_xcell_analyses")
 dir.create(output_dir, recursive = TRUE, showWarnings = F)
 
 # histology file
@@ -50,14 +41,14 @@ histology_df <- readr::read_tsv(histology) %>%
 
 # subset to histology of interest
 histology_df <- histology_df %>%
-  filter(
-    sample_type == "Tumor",
-    composition != "Derived Cell Line",!cohort %in% c("TCGA", "GTEx"),
-    experimental_strategy == "RNA-Seq",!is.na(molecular_subtype)
-  ) %>%
-  dplyr::filter(short_histology %in% short_hist |
-                  broad_histology %in% broad_hist |
-                  cancer_group %in% cg)
+  filter(sample_type == "Tumor",
+         composition != "Derived Cell Line", 
+         !cohort %in% c("TCGA", "GTEx"),
+         experimental_strategy == "RNA-Seq",
+         !is.na(molecular_subtype)) %>%
+  dplyr::filter(short_histology %in% short_hist | 
+                  broad_histology %in% broad_hist | 
+                  cancer_group %in% cg) 
 
 # read input matrix file and filter
 mat <- readRDS(mat)
@@ -68,40 +59,25 @@ histology_df <- histology_df %>%
 print(dim(mat))
 
 # compute xcell scores
-deconv_output <- deconvolute(gene_expression = as.matrix(mat),
-                             method = "xcell",
+deconv_output <- deconvolute(gene_expression = as.matrix(mat), 
+                             method = "xcell", 
                              arrays = F)
 
 # remove cell types and convert to matrix
 deconv_output <- deconv_output %>%
-  dplyr::filter(
-    !cell_type %in% c(
-      "Smooth muscle",
-      "Osteoblast",
-      "Myocytes",
-      "Platelets",
-      "Adipocytes",
-      "MPP",
-      "Hepatocytes",
-      "Skeletal muscle",
-      "Keratinocytes",
-      "Sebocytes",
-      "Erythrocytes",
-      "Epithelial cells",
-      "Chondrocytes",
-      "ly Endothelial cells",
-      "Endothelial cells",
-      "Neurons"
-    )
-  ) %>%
-  column_to_rownames("cell_type")
+  dplyr::filter(!cell_type %in% c("Smooth muscle", "Osteoblast", "Myocytes", 
+                                  "Platelets", "Adipocytes", "MPP", "Hepatocytes", 
+                                  "Skeletal muscle", "Keratinocytes", "Sebocytes", 
+                                  "Erythrocytes", "Epithelial cells", "Chondrocytes", 
+                                  "ly Endothelial cells", "Endothelial cells", "Neurons")) %>%
+  column_to_rownames("cell_type") 
 
 # # convert to long format
 # deconv_output <- deconv_output %>%
 #   as.data.frame() %>%
 #   gather(Kids_First_Biospecimen_ID, fraction, -c(cell_type)) %>%
 #   as.data.frame()
-#
+# 
 # # merge output with clinical data
 # full_output <- histology_df %>%
 #   inner_join(deconv_output, by = "Kids_First_Biospecimen_ID") %>%

diff --git a/analyses/lgg_xcell_analyses/01-xcell_clustering.R b/analyses/lgg_xcell_analyses/01-xcell_clustering.R
@@ -1,4 +1,4 @@
-# Perform consensus clustering on expected count of disease of interest
+# Perform consensus clustering on expected count of disease of interest 
 suppressPackageStartupMessages({
   library(optparse)
   library(tidyverse)
@@ -13,11 +13,16 @@ suppressPackageStartupMessages({
 
 # parse arguments
 option_list <- list(
-  make_option(c("--mat"), type = "character", help = "input matrix (.rds)"),
-  make_option(c("--histology"), type = "character", help = "histology file (.tsv)"),
-  make_option(c("--output_dir"), type = "character", help = "output directory for clustering analyses"),
-  make_option(c("--xcell_output_dir"), type = "character", help = "output directory for xcell analyses"),
-  make_option(c("--vtest_output_dir"), type = "character", help = "output directory for v-test analyses")
+  make_option(c("--mat"), type = "character",
+              help = "input matrix (.rds)"),
+  make_option(c("--histology"), type = "character",
+              help = "histology file (.tsv)"),
+  make_option(c("--output_dir"), type = "character",
+              help = "output directory for clustering analyses"),
+  make_option(c("--xcell_output_dir"), type = "character",
+              help = "output directory for xcell analyses"),
+  make_option(c("--vtest_output_dir"), type = "character",
+              help = "output directory for v-test analyses")
 )
 opt <- parse_args(OptionParser(option_list = option_list, add_help_option = TRUE))
 mat <- opt$mat
@@ -27,13 +32,9 @@ histology <- opt$histology
 output_dir <- opt$output_dir
 dir.create(output_dir, recursive = TRUE, showWarnings = F)
 xcell_output_dir <- opt$xcell_output_dir
-dir.create(xcell_output_dir,
-           recursive = TRUE,
-           showWarnings = F)
+dir.create(xcell_output_dir, recursive = TRUE, showWarnings = F)
 vtest_output_dir <- opt$vtest_output_dir
-dir.create(vtest_output_dir,
-           recursive = TRUE,
-           showWarnings = F)
+dir.create(vtest_output_dir, recursive = TRUE, showWarnings = F)
 
 # source functions from ClusTarIDseq
 root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
@@ -49,6 +50,7 @@ source(file.path(ClusTarIDseq_scripts, "final_composite_score.R"))
 source(file.path(ClusTarIDseq_scripts, "perform_diptest.R"))
 source(file.path(ClusTarIDseq_scripts, "compute_vtest.R"))
 
+
 # set default parameters for xcell score-clustering
 transformation_type_val = "none"
 feature_selection_val = "variance"
@@ -58,109 +60,94 @@ minpts_val = 20
 
 # run ccp + lspline clustering
 ccp_outfile <- file.path(output_dir, "ccp_output", "ccp_optimal_clusters.tsv")
-if (!file.exists(ccp_outfile)) {
-  lspline_clustering(
-    expr_mat = mat,
-    hist_file = histology,
-    filter_expr = FALSE,
-    protein_coding_only = FALSE,
-    feature_selection = feature_selection_val,
-    var_prop = var_prop,
-    min_n = NULL,
-    transformation_type = transformation_type_val,
-    max_k = max_k,
-    coef_cutoff = 0.1,
-    min_cluster_size_prop = 0.1,
-    max_cluster_size_prop = 0.5,
-    compute_all_equal = TRUE,
-    output_dir = file.path(output_dir, "ccp_output")
-  )
+if(!file.exists(ccp_outfile)){
+  lspline_clustering(expr_mat = mat,
+                     hist_file = histology,
+                     filter_expr = FALSE,
+                     protein_coding_only = FALSE,
+                     feature_selection = feature_selection_val,
+                     var_prop = var_prop,
+                     min_n = NULL,
+                     transformation_type = transformation_type_val,
+                     max_k = max_k,
+                     coef_cutoff = 0.1,
+                     min_cluster_size_prop = 0.1, max_cluster_size_prop = 0.5,
+                     compute_all_equal = TRUE,
+                     output_dir = file.path(output_dir, "ccp_output"))
 }
 
 # run dbscan
 hdbscan_outfile <- file.path(output_dir, "hdbscan_output", "hdbscan_optimal_clusters.tsv")
-if (!file.exists(hdbscan_outfile)) {
-  hdbscan_clustering(
-    expr_mat = mat,
-    filter_expr = FALSE,
-    protein_coding_only = FALSE,
-    feature_selection = feature_selection_val,
-    var_prop = var_prop,
-    min_n = NULL,
-    transformation_type = transformation_type_val,
-    minpts_val = minpts_val,
-    output_dir = file.path(output_dir, "hdbscan_output")
-  )
+if(!file.exists(hdbscan_outfile)){
+  hdbscan_clustering(expr_mat = mat,
+                     filter_expr = FALSE,
+                     protein_coding_only = FALSE,
+                     feature_selection = feature_selection_val,
+                     var_prop = var_prop,
+                     min_n = NULL,
+                     transformation_type = transformation_type_val,
+                     minpts_val = minpts_val,
+                     output_dir = file.path(output_dir, "hdbscan_output"))
 }
 
 # run intNMF
 intnmf_outfile <- file.path(output_dir, "intnmf_output", "intnmf_optimal_clusters.tsv")
-if (!file.exists(intnmf_outfile)) {
-  intnmf_clustering(
-    expr_mat = mat,
-    filter_expr = FALSE,
-    protein_coding_only = FALSE,
-    feature_selection = feature_selection_val,
-    var_prop = var_prop,
-    min_n = NULL,
-    transformation_type = transformation_type_val,
-    max_k = max_k,
-    output_dir = file.path(output_dir, "intnmf_output")
-  )
+if(!file.exists(intnmf_outfile)){
+  intnmf_clustering(expr_mat = mat,
+                    filter_expr = FALSE,
+                    protein_coding_only = FALSE,
+                    feature_selection = feature_selection_val,
+                    var_prop = var_prop,
+                    min_n = NULL,
+                    transformation_type = transformation_type_val,
+                    max_k = max_k,
+                    output_dir = file.path(output_dir, "intnmf_output"))
 }
 
 # run mClust
 mclust_outfile <- file.path(output_dir, "mclust_output", "mclust_optimal_clusters.tsv")
-if (!file.exists(mclust_outfile)) {
-  mclust_clustering(
-    expr_mat = mat,
-    filter_expr = FALSE,
-    protein_coding_only = FALSE,
-    feature_selection = feature_selection_val,
-    var_prop = var_prop,
-    min_n = NULL,
-    transformation_type = transformation_type_val,
-    output_dir = file.path(output_dir, "mclust_output")
-  )
+if(!file.exists(mclust_outfile)){
+  mclust_clustering(expr_mat = mat, 
+                    filter_expr = FALSE,
+                    protein_coding_only = FALSE,
+                    feature_selection = feature_selection_val,
+                    var_prop = var_prop,
+                    min_n = NULL,
+                    transformation_type = transformation_type_val,
+                    output_dir = file.path(output_dir, "mclust_output"))
 }
 
 # final composite score for each clustering method
-final_composite_score(
-  expr_mat = mat,
-  hist_file = histology,
-  filter_expr = FALSE,
-  protein_coding_only = FALSE,
-  feature_selection = feature_selection_val,
-  var_prop = var_prop,
-  transformation_type = transformation_type_val,
-  ccp_output = ccp_outfile,
-  nbmclust_output = NULL,
-  mclust_output = mclust_outfile,
-  hdbscan_output = hdbscan_outfile,
-  intnmf_output = intnmf_outfile,
-  output_dir = file.path(output_dir, "final_score")
-)
+final_composite_score(expr_mat = mat,
+                      hist_file = histology,
+                      filter_expr = FALSE,
+                      protein_coding_only = FALSE,
+                      feature_selection = feature_selection_val,
+                      var_prop = var_prop,
+                      transformation_type = transformation_type_val,
+                      ccp_output = ccp_outfile,
+                      nbmclust_output = NULL,
+                      mclust_output = mclust_outfile,
+                      hdbscan_output = hdbscan_outfile,
+                      intnmf_output = intnmf_outfile,
+                      output_dir = file.path(output_dir, "final_score"))
 
 # read in final composite score file and pull top ranking result
 clustering_method <- file.path(output_dir, "final_score", "final_clustering_output.tsv") %>%
   read_tsv() %>%
   filter(rank == 1) %>%
   pull(clustering_method)
-if (clustering_method == "IntNMF") {
+if(clustering_method == "IntNMF"){
   final_output <- read_tsv(file.path(output_dir, "intnmf_output", "intnmf_optimal_clusters.tsv"))
-} else if (clustering_method == "hdbscan") {
-  final_output <- read_tsv(file.path(
-    output_dir,
-    "hdbscan_output",
-    "hdbscan_optimal_clusters.tsv"
-  ))
-} else if (clustering_method == "mclust") {
+} else if(clustering_method == "hdbscan") {
+  final_output <- read_tsv(file.path(output_dir, "hdbscan_output", "hdbscan_optimal_clusters.tsv"))
+} else if(clustering_method == "mclust") {
   final_output <- read_tsv(file.path(output_dir, "mclust_output", "mclust_optimal_clusters.tsv"))
 } else {
   final_output <- read_tsv(file.path(output_dir, "ccp_output", "ccp_optimal_clusters.tsv"))
 }
 
-# compute v test
+# compute v test 
 xcell_score <- readRDS(mat)
 xcell_score <- xcell_score %>%
   rownames_to_column("cell_type") %>%
@@ -169,20 +156,17 @@ xcell_score <- xcell_score %>%
 xcell_score <- xcell_score %>%
   dplyr::left_join(final_output, by = c("Kids_First_Biospecimen_ID" = "sample"))  %>%
   dplyr::group_by(cluster_assigned, cell_type) %>%
-  dplyr::mutate(cluster_feature_mean_score = mean(fraction)) %>% # mean of gene count per cluster
+  dplyr::mutate(cluster_feature_mean_score = mean(fraction)) %>% # mean of gene count per cluster 
   ungroup() %>%
   dplyr::group_by(cell_type) %>%
   dplyr::mutate(feature_mean_score = mean(fraction),
-                feature_variance = var(fraction))
+                feature_variance = var(fraction)) 
 
 # combine with histology molecular subtype
 histology_df <- read_tsv(histology)
 xcell_score <- histology_df %>%
-  dplyr::select(Kids_First_Biospecimen_ID,
-                sample_id,
-                cohort_participant_id,
-                molecular_subtype) %>%
-  inner_join(xcell_score)
+  dplyr::select(Kids_First_Biospecimen_ID, sample_id, cohort_participant_id, molecular_subtype) %>%
+  inner_join(xcell_score) 
 
 # reduce molecular subtypes by collapsing to larger groups
 xcell_score$molecular_subtype <- as.character(xcell_score$molecular_subtype)
@@ -212,12 +196,9 @@ xcell_score %>%
   write_tsv(file.path(xcell_output_dir, "xcell_score_cluster.tsv"))
 
 # apply v.test function per gene
-vtest_output_all <- plyr::ddply(
-  .data = xcell_score,
-  .variables = "cell_type",
-  .fun = function(x)
-    compute_vtest(x, clustering_col = "cluster_assigned")
-)
+vtest_output_all <- plyr::ddply(.data = xcell_score, 
+                                .variables = "cell_type", 
+                                .fun = function(x) compute_vtest(x, clustering_col = "cluster_assigned"))
 # output vtest score
 vtest_output_all <- vtest_output_all %>%
   readr::write_tsv(file.path(vtest_output_dir, "vtest_scores_all.tsv"))