Skip to content

Commit

Permalink
Merge pull request #10 from d3b-center/update-modules
Browse files Browse the repository at this point in the history
Fix TIS labels, update WHO annotation labels, update GSVA call
  • Loading branch information
aadamk authored Oct 16, 2024
2 parents 9b92684 + 45c8d73 commit e8903ae
Show file tree
Hide file tree
Showing 52 changed files with 23,018 additions and 23,089 deletions.
84 changes: 30 additions & 54 deletions analyses/lgg_xcell_analyses/00-run_xcell.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,18 @@ suppressPackageStartupMessages({

# parse arguments
option_list <- list(
make_option(c("--mat"), type = "character", help = "input matrix for immune profiling (.rds)"),
make_option(c("--histology"), type = "character", help = "histology file for all OpenPedCan samples (.tsv)"),
make_option(
c("--short_hist"),
type = "character",
default = NULL,
help = "short histology filter"
),
make_option(
c("--broad_hist"),
type = "character",
default = NULL,
help = "broad histology filter"
),
make_option(
c("--cg"),
type = "character",
default = NULL,
help = "cancer group filter"
),
make_option(c("--output_dir"), type = "character", help = "output directory")
make_option(c("--mat"), type = "character",
help = "input matrix for immune profiling (.rds)"),
make_option(c("--histology"), type = "character",
help = "histology file for all OpenPedCan samples (.tsv)"),
make_option(c("--short_hist"), type = "character", default = NULL,
help = "short histology filter"),
make_option(c("--broad_hist"), type = "character", default = NULL,
help = "broad histology filter"),
make_option(c("--cg"), type = "character", default = NULL,
help = "cancer group filter"),
make_option(c("--output_dir"), type = "character",
help = "output directory")
)
opt <- parse_args(OptionParser(option_list = option_list, add_help_option = TRUE))
mat <- opt$mat
Expand All @@ -41,7 +32,7 @@ output_dir <- opt$output_dir

# directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "lgg_xcell_analyses")
analysis_dir <- file.path(root_dir, "analyses", "lgg_xcell_analyses")
dir.create(output_dir, recursive = TRUE, showWarnings = F)

# histology file
Expand All @@ -50,14 +41,14 @@ histology_df <- readr::read_tsv(histology) %>%

# subset to histology of interest
histology_df <- histology_df %>%
filter(
sample_type == "Tumor",
composition != "Derived Cell Line",!cohort %in% c("TCGA", "GTEx"),
experimental_strategy == "RNA-Seq",!is.na(molecular_subtype)
) %>%
dplyr::filter(short_histology %in% short_hist |
broad_histology %in% broad_hist |
cancer_group %in% cg)
filter(sample_type == "Tumor",
composition != "Derived Cell Line",
!cohort %in% c("TCGA", "GTEx"),
experimental_strategy == "RNA-Seq",
!is.na(molecular_subtype)) %>%
dplyr::filter(short_histology %in% short_hist |
broad_histology %in% broad_hist |
cancer_group %in% cg)

# read input matrix file and filter
mat <- readRDS(mat)
Expand All @@ -68,40 +59,25 @@ histology_df <- histology_df %>%
print(dim(mat))

# compute xcell scores
deconv_output <- deconvolute(gene_expression = as.matrix(mat),
method = "xcell",
deconv_output <- deconvolute(gene_expression = as.matrix(mat),
method = "xcell",
arrays = F)

# remove cell types and convert to matrix
deconv_output <- deconv_output %>%
dplyr::filter(
!cell_type %in% c(
"Smooth muscle",
"Osteoblast",
"Myocytes",
"Platelets",
"Adipocytes",
"MPP",
"Hepatocytes",
"Skeletal muscle",
"Keratinocytes",
"Sebocytes",
"Erythrocytes",
"Epithelial cells",
"Chondrocytes",
"ly Endothelial cells",
"Endothelial cells",
"Neurons"
)
) %>%
column_to_rownames("cell_type")
dplyr::filter(!cell_type %in% c("Smooth muscle", "Osteoblast", "Myocytes",
"Platelets", "Adipocytes", "MPP", "Hepatocytes",
"Skeletal muscle", "Keratinocytes", "Sebocytes",
"Erythrocytes", "Epithelial cells", "Chondrocytes",
"ly Endothelial cells", "Endothelial cells", "Neurons")) %>%
column_to_rownames("cell_type")

# # convert to long format
# deconv_output <- deconv_output %>%
# as.data.frame() %>%
# gather(Kids_First_Biospecimen_ID, fraction, -c(cell_type)) %>%
# as.data.frame()
#
#
# # merge output with clinical data
# full_output <- histology_df %>%
# inner_join(deconv_output, by = "Kids_First_Biospecimen_ID") %>%
Expand Down
183 changes: 82 additions & 101 deletions analyses/lgg_xcell_analyses/01-xcell_clustering.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Perform consensus clustering on expected count of disease of interest
# Perform consensus clustering on expected count of disease of interest
suppressPackageStartupMessages({
library(optparse)
library(tidyverse)
Expand All @@ -13,11 +13,16 @@ suppressPackageStartupMessages({

# parse arguments
option_list <- list(
make_option(c("--mat"), type = "character", help = "input matrix (.rds)"),
make_option(c("--histology"), type = "character", help = "histology file (.tsv)"),
make_option(c("--output_dir"), type = "character", help = "output directory for clustering analyses"),
make_option(c("--xcell_output_dir"), type = "character", help = "output directory for xcell analyses"),
make_option(c("--vtest_output_dir"), type = "character", help = "output directory for v-test analyses")
make_option(c("--mat"), type = "character",
help = "input matrix (.rds)"),
make_option(c("--histology"), type = "character",
help = "histology file (.tsv)"),
make_option(c("--output_dir"), type = "character",
help = "output directory for clustering analyses"),
make_option(c("--xcell_output_dir"), type = "character",
help = "output directory for xcell analyses"),
make_option(c("--vtest_output_dir"), type = "character",
help = "output directory for v-test analyses")
)
opt <- parse_args(OptionParser(option_list = option_list, add_help_option = TRUE))
mat <- opt$mat
Expand All @@ -27,13 +32,9 @@ histology <- opt$histology
output_dir <- opt$output_dir
dir.create(output_dir, recursive = TRUE, showWarnings = F)
xcell_output_dir <- opt$xcell_output_dir
dir.create(xcell_output_dir,
recursive = TRUE,
showWarnings = F)
dir.create(xcell_output_dir, recursive = TRUE, showWarnings = F)
vtest_output_dir <- opt$vtest_output_dir
dir.create(vtest_output_dir,
recursive = TRUE,
showWarnings = F)
dir.create(vtest_output_dir, recursive = TRUE, showWarnings = F)

# source functions from ClusTarIDseq
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
Expand All @@ -49,6 +50,7 @@ source(file.path(ClusTarIDseq_scripts, "final_composite_score.R"))
source(file.path(ClusTarIDseq_scripts, "perform_diptest.R"))
source(file.path(ClusTarIDseq_scripts, "compute_vtest.R"))


# set default parameters for xcell score-clustering
transformation_type_val = "none"
feature_selection_val = "variance"
Expand All @@ -58,109 +60,94 @@ minpts_val = 20

# run ccp + lspline clustering
ccp_outfile <- file.path(output_dir, "ccp_output", "ccp_optimal_clusters.tsv")
if (!file.exists(ccp_outfile)) {
lspline_clustering(
expr_mat = mat,
hist_file = histology,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
max_k = max_k,
coef_cutoff = 0.1,
min_cluster_size_prop = 0.1,
max_cluster_size_prop = 0.5,
compute_all_equal = TRUE,
output_dir = file.path(output_dir, "ccp_output")
)
if(!file.exists(ccp_outfile)){
lspline_clustering(expr_mat = mat,
hist_file = histology,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
max_k = max_k,
coef_cutoff = 0.1,
min_cluster_size_prop = 0.1, max_cluster_size_prop = 0.5,
compute_all_equal = TRUE,
output_dir = file.path(output_dir, "ccp_output"))
}

# run dbscan
hdbscan_outfile <- file.path(output_dir, "hdbscan_output", "hdbscan_optimal_clusters.tsv")
if (!file.exists(hdbscan_outfile)) {
hdbscan_clustering(
expr_mat = mat,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
minpts_val = minpts_val,
output_dir = file.path(output_dir, "hdbscan_output")
)
if(!file.exists(hdbscan_outfile)){
hdbscan_clustering(expr_mat = mat,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
minpts_val = minpts_val,
output_dir = file.path(output_dir, "hdbscan_output"))
}

# run intNMF
intnmf_outfile <- file.path(output_dir, "intnmf_output", "intnmf_optimal_clusters.tsv")
if (!file.exists(intnmf_outfile)) {
intnmf_clustering(
expr_mat = mat,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
max_k = max_k,
output_dir = file.path(output_dir, "intnmf_output")
)
if(!file.exists(intnmf_outfile)){
intnmf_clustering(expr_mat = mat,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
max_k = max_k,
output_dir = file.path(output_dir, "intnmf_output"))
}

# run mClust
mclust_outfile <- file.path(output_dir, "mclust_output", "mclust_optimal_clusters.tsv")
if (!file.exists(mclust_outfile)) {
mclust_clustering(
expr_mat = mat,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
output_dir = file.path(output_dir, "mclust_output")
)
if(!file.exists(mclust_outfile)){
mclust_clustering(expr_mat = mat,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
min_n = NULL,
transformation_type = transformation_type_val,
output_dir = file.path(output_dir, "mclust_output"))
}

# final composite score for each clustering method
final_composite_score(
expr_mat = mat,
hist_file = histology,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
transformation_type = transformation_type_val,
ccp_output = ccp_outfile,
nbmclust_output = NULL,
mclust_output = mclust_outfile,
hdbscan_output = hdbscan_outfile,
intnmf_output = intnmf_outfile,
output_dir = file.path(output_dir, "final_score")
)
final_composite_score(expr_mat = mat,
hist_file = histology,
filter_expr = FALSE,
protein_coding_only = FALSE,
feature_selection = feature_selection_val,
var_prop = var_prop,
transformation_type = transformation_type_val,
ccp_output = ccp_outfile,
nbmclust_output = NULL,
mclust_output = mclust_outfile,
hdbscan_output = hdbscan_outfile,
intnmf_output = intnmf_outfile,
output_dir = file.path(output_dir, "final_score"))

# read in final composite score file and pull top ranking result
clustering_method <- file.path(output_dir, "final_score", "final_clustering_output.tsv") %>%
read_tsv() %>%
filter(rank == 1) %>%
pull(clustering_method)
if (clustering_method == "IntNMF") {
if(clustering_method == "IntNMF"){
final_output <- read_tsv(file.path(output_dir, "intnmf_output", "intnmf_optimal_clusters.tsv"))
} else if (clustering_method == "hdbscan") {
final_output <- read_tsv(file.path(
output_dir,
"hdbscan_output",
"hdbscan_optimal_clusters.tsv"
))
} else if (clustering_method == "mclust") {
} else if(clustering_method == "hdbscan") {
final_output <- read_tsv(file.path(output_dir, "hdbscan_output", "hdbscan_optimal_clusters.tsv"))
} else if(clustering_method == "mclust") {
final_output <- read_tsv(file.path(output_dir, "mclust_output", "mclust_optimal_clusters.tsv"))
} else {
final_output <- read_tsv(file.path(output_dir, "ccp_output", "ccp_optimal_clusters.tsv"))
}

# compute v test
# compute v test
xcell_score <- readRDS(mat)
xcell_score <- xcell_score %>%
rownames_to_column("cell_type") %>%
Expand All @@ -169,20 +156,17 @@ xcell_score <- xcell_score %>%
xcell_score <- xcell_score %>%
dplyr::left_join(final_output, by = c("Kids_First_Biospecimen_ID" = "sample")) %>%
dplyr::group_by(cluster_assigned, cell_type) %>%
dplyr::mutate(cluster_feature_mean_score = mean(fraction)) %>% # mean of gene count per cluster
dplyr::mutate(cluster_feature_mean_score = mean(fraction)) %>% # mean of gene count per cluster
ungroup() %>%
dplyr::group_by(cell_type) %>%
dplyr::mutate(feature_mean_score = mean(fraction),
feature_variance = var(fraction))
feature_variance = var(fraction))

# combine with histology molecular subtype
histology_df <- read_tsv(histology)
xcell_score <- histology_df %>%
dplyr::select(Kids_First_Biospecimen_ID,
sample_id,
cohort_participant_id,
molecular_subtype) %>%
inner_join(xcell_score)
dplyr::select(Kids_First_Biospecimen_ID, sample_id, cohort_participant_id, molecular_subtype) %>%
inner_join(xcell_score)

# reduce molecular subtypes by collapsing to larger groups
xcell_score$molecular_subtype <- as.character(xcell_score$molecular_subtype)
Expand Down Expand Up @@ -212,12 +196,9 @@ xcell_score %>%
write_tsv(file.path(xcell_output_dir, "xcell_score_cluster.tsv"))

# apply v.test function per gene
vtest_output_all <- plyr::ddply(
.data = xcell_score,
.variables = "cell_type",
.fun = function(x)
compute_vtest(x, clustering_col = "cluster_assigned")
)
vtest_output_all <- plyr::ddply(.data = xcell_score,
.variables = "cell_type",
.fun = function(x) compute_vtest(x, clustering_col = "cluster_assigned"))
# output vtest score
vtest_output_all <- vtest_output_all %>%
readr::write_tsv(file.path(vtest_output_dir, "vtest_scores_all.tsv"))
Loading

0 comments on commit e8903ae

Please sign in to comment.