-
Notifications
You must be signed in to change notification settings - Fork 0
/
aaces_rnaseq_pipeline.sh
executable file
·153 lines (131 loc) · 5.62 KB
/
aaces_rnaseq_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/bin/bash
# Amy Campbell, 2017
#
# This shell script performs differential clustering analyses
# from Way et al's 'Cross-population analysis of high-grade serous ovarian
# cancer does not support four subtypes'.
#
# Prerequisites:
#   The conda hgsc_subtypes environment encoded in environment.yml must
#   already be activated.
#   NOTE(review): every script and data path used below is relative, so the
#   pipeline presumably has to be launched from the repository root - confirm.
#
# Output:
#   stdout and stderr of every step are written to aaces_rnaseq_analysis.out
#   in the current directory; the file is truncated on each run.
#
# Exit status:
#   Non-zero as soon as any step fails; later steps are not attempted.

# Stop at the first failing command (-e), treat unset variables as errors (-u)
# and let a pipeline fail when any of its stages fails (pipefail). Without
# this a failed Rscript step would still be followed by its "FINISHED" message
# and the remaining steps would run against missing or stale inputs.
set -euo pipefail

# Record which command aborted the run. The trap writes to stdout, which is
# the log file once the redirection below is in place.
trap 'rc=$?; echo "FAILED: line ${LINENO}: ${BASH_COMMAND} (exit status ${rc})"' ERR

# From here on, send both stdout and stderr to the log file.
exec &> aaces_rnaseq_analysis.out

############################################
# INSTALL DEPENDENCIES
# Disabled by default; uncomment to install the R dependencies first.
#Rscript INSTALL.R
#################
# PART ZERO:
# Download Mayo data
#################
# Both Mayo preprocessing commands in this section are commented out, so only
# the progress messages are emitted. Uncomment the Rscript lines to re-enable
# the corresponding step.

# Step: COMBAT adjust Mayo data
printf '%s\n' 'RUNNING: COMBAT adjust Mayo data'
#Rscript 1.DataInclusion/Scripts/processMayoEset/\
#Agilent1and2and3_COMBAT_datamerge.R
printf '%s\n' 'FINISHED: COMBAT adjust Mayo data'

# Step: create an eset from the Mayo data
printf '%s\n' 'RUNNING: Create an eset from the Mayo data'
#Rscript 1.DataInclusion/Scripts/processMayoEset/createMayoEset.R
printf '%s\n' 'FINISHED: Create an eset from the Mayo data'
# Define Constants

# Input expression tables handed to A.getInclusion.R
# (--aaces, --aaces_rna and --aaces_rna_white respectively).
AACES_PATH="1.DataInclusion/Data/AACES/aaces_expression.tsv"
AACES_RNASEQ_PATH="1.DataInclusion/Data/AACES/salmon_normalized_filtered_for_way_pipeline_bottom10Removed.tsv"
AACES_WHITE_RNASEQ_PATH="1.DataInclusion/Data/AACES/salmon_normalized_filtered_for_way_pipeline_bottom5Removed_whites.tsv"

# Space-separated dataset names. The variable is expanded unquoted by the
# steps below so that every name is passed to the R scripts as its own
# argument.
DATASETS='aaces.rnaseq.eset aaces.white.rnaseq.eset TCGA_eset mayo.eset GSE32062.GPL6480_eset GSE9891_eset aaces.eset'

# Clustering is run for every k from KMIN to KMAX.
KMIN=2
KMAX=4

# Seed and number of starts passed to the k-means / SAM script.
SEED=123
NSTARTS=100

# Number of starts passed to the NMF steps.
# NOTE(review): the original trailing comment reads "10" - presumably the
# value intended for a full run; confirm before changing.
NMFSTARTS=2 # 10

# Values for the shuffle argument of A.run_kmeans_SAM.R.
NO_SHUFFLE=FALSE
SHUFFLE=TRUE

# Gene subset used for SAM in the main analyses (the MAD-gene run passes
# "madgenes" literally instead).
SAM_SUBSET="commongenes"
#################
# PART ONE:
# Dataset Selection and Inclusion
#################
# ~~~~~~~~~~~~~~~~~~~~~
# This section determines which samples meet the inclusion criteria and are
# therefore used in the downstream analyses.
# ~~~~~~~~~~~~~~~~~~~~~
#
# Data come from curatedOvarianData version 1.8.0.
# NOTE: The Mayo Clinic Data is not currently in curatedOvarianData.

# Step 1: output, for each dataset, the samples that pass the inclusion
# criteria.
printf '%s\n' 'RUNNING: Dataset Selection and Inclusion step 1'
Rscript 1.DataInclusion/Scripts/A.getInclusion.R \
  --aaces "${AACES_PATH}" \
  --aaces_rna "${AACES_RNASEQ_PATH}" \
  --aaces_rna_white "${AACES_WHITE_RNASEQ_PATH}"
printf '%s\n' 'FINISHED: Dataset Selection and Inclusion step 1'

# Step 2: output the common genes and the MAD (Median Absolute Deviation)
# genes, used for the moderated t score vectors and for clustering,
# respectively.
printf '%s\n' 'RUNNING: Dataset Selection and Inclusion step 2'
# DATASETS is intentionally unquoted: each dataset name must be its own
# argument.
# shellcheck disable=SC2086
Rscript 1.DataInclusion/Scripts/B.getGenes.R ${DATASETS} #"GSE26712_eset"
printf '%s\n' 'FINISHED: Dataset Selection and Inclusion step 2'
#################
# PART TWO:
# Run k means and SAM
#################
# ~~~~~~~~~~~~~~~~~~~~~
# Inputs are the samples and genes from the previous section that passed the
# inclusion criteria. k means is run for every k from k min to k max, and the
# scripts output several figures (moderated t score heatmaps, kmeans bar
# chart distributions, correlation matrices) and tables (cluster membership
# by dataset, within dataset cluster correlations).
# ~~~~~~~~~~~~~~~~~~~~~

# ~~~~~~~~~~~~~
# SAM with MAD genes
# ~~~~~~~~~~~~~
# Output across dataset correlations for MAD genes.
# NOTE: common genes used in downstream analyses
printf '%s\n' 'RUNNING: SAM with MAD genes'
# Positional arguments: kmin, kmax, starts, seed, the NMF flag (FALSE here),
# the shuffle flag, the gene subset, then one argument per dataset.
# DATASETS is intentionally unquoted so that it splits into separate names.
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R \
  "${KMIN}" "${KMAX}" "${NSTARTS}" "${SEED}" FALSE "${NO_SHUFFLE}" \
  'madgenes' ${DATASETS} #"GSE26712_eset"
printf '%s\n' 'FINISHED: SAM with MAD genes'
# ~~~~~~~~~~~~~
# k means & SAM (with common genes)
# ~~~~~~~~~~~~~
# In every command of this section DATASETS is intentionally unquoted so that
# each dataset name becomes a separate argument.

# Step 1: perform k means and SAM on the real (unshuffled) data.
printf '%s\n' 'RUNNING: k means & SAM step 1'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R \
  "${KMIN}" "${KMAX}" "${NSTARTS}" "${SEED}" FALSE "${NO_SHUFFLE}" \
  "${SAM_SUBSET}" ${DATASETS} #"GSE26712_eset"
printf '%s\n' 'FINISHED: k means & SAM step 1'

# Step 2: output correlation matrices.
printf '%s\n' 'RUNNING: k means & SAM step 2'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/B.CorrelationMatrix.R \
  "${KMIN}" "${KMAX}" "${SEED}" Figures/CorrelationMatrix/ \
  ${DATASETS} #"GSE26712_eset"
printf '%s\n' 'FINISHED: k means & SAM step 2'

# Step 3: output k-means barcharts.
printf '%s\n' 'RUNNING: k means & SAM step 3'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/C.KMeansBarCharts.R \
  "${KMIN}" "${KMAX}" ${DATASETS}
printf '%s\n' 'FINISHED: k means & SAM step 3'

# Step 4: repeat step 1 with shuffled genes, to compare against the across
# population correlations seen in the real data.
printf '%s\n' 'RUNNING: k means & SAM step 4'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R \
  "${KMIN}" "${KMAX}" "${NSTARTS}" "${SEED}" FALSE "${SHUFFLE}" \
  "${SAM_SUBSET}" ${DATASETS} #"GSE26712_eset"
printf '%s\n' 'FINISHED: k means & SAM step 4'
# ~~~~~~~~~~~~~
# NMF
# ~~~~~~~~~~~~~
# DATASETS is intentionally unquoted in both commands so that each dataset
# name becomes a separate argument.

# Step 1: output consensus matrices, NMF cluster membership files and
# cophenetic coefficients.
printf '%s\n' 'RUNNING: NMF step 1'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/D.NMF.R \
  "${KMIN}" "${KMAX}" "${NMFSTARTS}" "${SEED}" \
  ${DATASETS} #"GSE26712_eset"
printf '%s\n' 'FINISHED: NMF step 1'

# Step 2: run SAM on the NMF clusters. The literal TRUE (fifth argument)
# forces NMF analysis; NMFSTARTS is passed in the starts position.
printf '%s\n' 'RUNNING: NMF step 2'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/A.run_kmeans_SAM.R \
  "${KMIN}" "${KMAX}" "${NMFSTARTS}" "${SEED}" TRUE "${NO_SHUFFLE}" \
  "${SAM_SUBSET}" ${DATASETS} #"GSE26712_eset"
printf '%s\n' 'FINISHED: NMF step 2'
# ~~~~~~~~~~~~~
# k means vs. NMF
# ~~~~~~~~~~~~~
# DATASETS is intentionally unquoted in both commands so that each dataset
# name becomes a separate argument.

# Step 1: compare the k-means defined clusters with the NMF defined clusters.
printf '%s\n' 'RUNNING: Kmeans vs. NMF step 1'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/E.kmeans_v_nmf.R ${DATASETS}
printf '%s\n' 'FINISHED: Kmeans vs. NMF step 1'

# Step 2: compile a table with all cluster membership information.
printf '%s\n' 'RUNNING: Kmeans vs. NMF step 2'
# shellcheck disable=SC2086
Rscript 2.Clustering_DiffExprs/Scripts/F.clusterMembership.R ${DATASETS}
printf '%s\n' 'FINISHED: Kmeans vs. NMF step 2'
# ~~~~~~~~~~~~~
# Dataset Concordance
# ~~~~~~~~~~~~~
# Investigate how similar the cluster memberships are between the original
# TCGA 2011 paper, the Konecny 2014 paper, and the Tothill 2008 paper
# (Table 4).
printf '%s\n' 'RUNNING: Dataset Concordance'
# Unlike the other steps, this script is fed to R on stdin (R --no-save)
# rather than being run through Rscript.
R --no-save < 2.Clustering_DiffExprs/Scripts/G.Dataset_concordance.R
printf '%s\n' 'FINISHED: Dataset Concordance'

# Last message of the script.
printf '%s\n' 'PIPELINE COMPLETE'