-
Notifications
You must be signed in to change notification settings - Fork 0
/
script
411 lines (319 loc) · 17.9 KB
/
script
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
#!/bin/bash
#定义gz后缀+双末端后缀获取函数
function obtain_houzhui_name {
gz_houzhui=$(awk 'NR==1{print $1}' ./sample_information_file/gz_end_symbol.txt)
double_end1=$(awk 'NR==1{print $1}' ./sample_information_file/double_end_sequence_symbol.txt)
double_end2=$(awk 'NR==2{print $1}' ./sample_information_file/double_end_sequence_symbol.txt)
}
#定义创建各个步骤的文件夹函数
function judge_file {
if [ ! -d "$1" ]; then
mkdir $1
else
rm -r $1
mkdir $1
fi
}
#定义查看上游分析各个包是否存在的函数
function judge_packages {
which $1
if [ $? -eq 0 ];then
echo "======> 【" $1 '】 软件存在' ' True √'
echo ""
else
bash /data/changchuanjun/wechat_remind.sh "RNASeq上游分析各个包不完全存在,无法全部跑完,故退出!!!"
echo "==============> 【" $1 '】 软件不存在' ' Error Error Error × × ×'
exit 1
fi
}
#判断上一条命令是否正确
function judge {
if [ $? -eq 0 ]; then
# echo "succeed"
echo "True"
echo ""
else
echo ""
echo "命令错误!!! failed failed failed"
# echo "failed"
bash /data/changchuanjun/wechat_remind.sh "RNASeq上游分析,上一步命令错误,请检查脚本,程序已退出!!!"
exit 1
fi
}
#进行上游分析所有包是否都存在
function total_judge {
judge_packages fastqc
judge_packages multiqc
judge_packages fastp
judge_packages hisat2-build
judge_packages hisat2
judge_packages samtools
judge_packages Rscript
judge_packages perl
#judge_packages python3
#judge_packages R
}
#定义判断相关文件夹是否存在的函数
function judge_original_file {
if [ ! -d "$1" ]; then
echo $1 "文件夹不存在!!! failed"
bash /data/changchuanjun/wechat_remind.sh "RNASeq上游分析,程序所必须使用的文件夹或脚本目录不存在,程序停止!!!"
exit 1
else
echo $1 "文件夹存在,正在进行后续分析...... True"
fi
}
#定义判断相关文件是否存在的函数
function judge_document {
if [ ! -f "$1" ]; then
echo $1 "文件不存在!!!failed"
bash /data/changchuanjun/wechat_remind.sh "RNASeq上游分析,程序所必须使用的文件或脚本不存在,程序停止!!!"
exit 1
else
echo $1 "文件存在,正在进行后续分析...... True"
fi
}
#定义数据质控函数
function data_fastqc {
judge_file ./fastqc_result
echo ""
echo "开始测序文件的数据质控......"
for i in $(awk '{print $1}' ./sample_information_file/whole_sample_name.txt)
do
echo "正在进行"$i"文件的数据质控......"
fastqc -t 32 ./data/$i -o ./fastqc_result/
judge
echo $i"文件的数据质控已经完成!!!"
echo ""
done
awk '{$1= "所有文件的数据质控已经完成!!! 结果已存放在 " location "/fastqc_result/" " 目录下 " ; print$0}' location=${PWD} ./sample_information_file/gz_end_symbol.txt
echo ""
}
#定义数据质控汇总函数
function fastqc_result_multiqc {
judge_file ./multiqc_result
echo ""
echo "开始测序文件数据质控报告的汇总......"
multiqc ./fastqc_result/ -o ./multiqc_result
judge
awk '{$1= "所有测序文件数据质控报告的汇总已经完成!!! 结果已存放在 " location "/multiqc_result/" " 目录下 " ; print$0}' location=${PWD} ./sample_information_file/gz_end_symbol.txt
echo ""
}
#定义数据清洗函数fastp
function clean_data_fastp {
judge_file ./clean_data
obtain_houzhui_name
echo ""
echo "开始测序文件的数据清洗(去除接头和清洗低质量reads)......"
for i in $(awk '{print $1}' ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt)
do
echo "正在进行"$i$double_end1$gz_houzhui "&" $i$double_end2$gz_houzhui"文件的数据清洗(去除接头和清洗低质量reads)......"
fastp -w 32 \
-i ./data/${i}$double_end1$gz_houzhui \
-I ./data/${i}$double_end2$gz_houzhui \
-o ./clean_data/${i}_1_clean.fq.gz \
-O ./clean_data/${i}_2_clean.fq.gz \
-h ./clean_data/${i}.fastp.html \
-j ./clean_data/${i}.fastp.json
judge
echo $i$double_end1$gz_houzhui "&" $i$double_end2$gz_houzhui"文件的数据清洗(去除接头和清洗低质量reads)已经完成!!!"
# echo ""
done
awk '{$1= "所有文件的数据清洗(去除接头和清洗低质量reads)已经完成!!! 结果已存放在 " location "/clean_data/" " 目录下 " ; print$0}' location=${PWD} ./sample_information_file/gz_end_symbol.txt
echo ""
cp /data/changchuanjun/fastp_count.py ./clean_data/
python ./clean_data/fastp_count.py ./clean_data
rm ./clean_data/*.html
rm ./clean_data/*.json
}
#构建参考基因组 ./ref/*.fasta是输入文件,./ref/genome是构建好的基因组的路径(即输出文件)
#定义比对到参考基因组函数bidui-refgenome
function construct_refgenome {
echo ""
echo "1、开始构建参考基因组......"
hisat2-build ./ref/ref_genome/* ./ref/genome 1>./ref/hisat2-build.log 2>&1
judge
awk '{$1= "构建参考基因组已经完成!!! 结果已存放在 " location "/ref" " 目录下 " ; print$0}' location=${PWD} ./sample_information_file/gz_end_symbol.txt
echo ""
}
#定义判断参考基因组是否存在且数量符合要求的函数
function judge_refgenomefile_and_number {
refgenome_number=$(ls ./ref/ref_genome/* | wc -l)
if [ $refgenome_number -eq 1 ]; then
echo "参考基因组存在且唯一,正在进行后续分析...... True"
else
echo "参考基因组不符合要求,请重新检查!!! failed"
exit 1
fi
}
#定义判断参考基因组是否需要构建索引的函数
function refgenome_operation {
number_index=$(ls ./ref/*.ht2 | wc -l)
if [ $number_index -eq 8 ]; then
echo "参考基因组的索引文件完整,正在进行后续分析...... True"
else
echo "参考基因组的索引文件缺失!!!"
echo "正在重新构建参考基因组索引......"
judge_original_file ./ref/ref_genome
judge_refgenomefile_and_number
construct_refgenome
fi
}
#定义Hisat2比对函数
function hisat2_samtools {
judge_file ./1.Mapping
obtain_houzhui_name
mkdir ./1.Mapping/log
mkdir ./1.Mapping/sam
mkdir ./1.Mapping/bam
echo ""
echo "2、开始Hisat2-samtools!!!"
for i in $(awk '{print $1}' ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt)
do
echo "正在进行"$i$double_end1$gz_houzhui "&" $i$double_end2$gz_houzhui "文件的Hisat2-samtools......"
hisat2 --new-summary \
-p 32 \
-x ./ref/genome \
-1 ./clean_data/${i}_1_clean.fq.gz \
-2 ./clean_data/${i}_2_clean.fq.gz \
-S ./1.Mapping/sam/$i.sam \
1>./1.Mapping/log/$i.log 2>&1
judge
echo $i$double_end1$gz_houzhui "&" $i$double_end2$gz_houzhui "文件的Hisat2比对已经完成!!!生成"$i".sam文件"
echo "正在进行"$i".sam文件的压缩和排序......"
samtools sort -m 4G -@ 32 -o ./1.Mapping/bam/$i.bam ./1.Mapping/sam/$i.sam
judge
echo $i".sam文件的压缩和排序已经完成!!!生成"$i".bam文件"
rm ./1.Mapping/sam/$i.sam
echo "正在进行"$i".bam文件的bam index......"
samtools index -@ 32 ./1.Mapping/bam/$i.bam
judge
echo $i".bam文件的压缩和排序已经完成!!!生成"$i".bai文件"
echo ""
# echo ""
done
awk '{$1= "所有文件的Hisat2-samtools已经完成!!! 比对结果已存放在"location"/1.Mapping/sam、bam/" "目录下,相关日志文件已经存放在"location"/1.Mapping/log/" "目录下" ; print $0}' location=${PWD} ./sample_information_file/gz_end_symbol.txt
echo ""
}
function hisat2_mapping_count {
#hisat2_mapping_rate*.txt
cp /data/changchuanjun/count_Hisat2_mapping_rate.py ./
python count_Hisat2_mapping_rate.py --the_input_location_of_mapping_log 1.Mapping/log/ --outFileName ./
rm ./count_Hisat2_mapping_rate.py
}
#定义判断参考基因组注释文件是否存在且数量符合要求的函数
function judge_refgenome_gtf_file_and_number {
gtf_file_and_number=$(ls ./ref/gtf_zhushi/*.gtf | wc -l)
if [ $gtf_file_and_number -eq 1 ]; then
echo "参考基因组注释文件存在且唯一,正在进行后续分析...... True"
else
echo "参考基因组注释文件不符合要求,请重新检查!!!"
echo "提示:可能是参考基因组注释文件后缀不为gtf(gff需要转换为gtf)、参考基因组注释文件数量不唯一或不存在等......,请重新检查!!!"
echo "failed"
exit 1
fi
}
#定义表达定量函数
function quantitative_expression {
judge_file ./2.Quantification
mkdir ./2.Quantification/quantitative-result
mkdir ./2.Quantification/quantitative-result/count
mkdir ./2.Quantification/quantitative-result/log
judge_original_file /data/changchuanjun/necessary_packages/RunFeatureCounts
if [ -d "./RunFeatureCounts" ]; then
rm -rf ./RunFeatureCounts
else
cp -r /data/changchuanjun/necessary_packages/RunFeatureCounts ./
fi
#cp -r /data/changchuanjun/necessary_packages/RunFeatureCounts ./
echo ""
echo "5、开始对压缩和排序生成的bam文件进行表达定量!!!"
for i in $(awk '{print $1}' ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt)
do
echo "正在进行"$i".bam文件的表达定量......"
Rscript ./RunFeatureCounts/run-featurecounts.R \
-b ./1.Mapping/bam/$i.bam \
-g ./ref/gtf_zhushi/* \
-f exon \
-a gene_id \
-o ./2.Quantification/quantitative-result/count/$i \
1>./2.Quantification/quantitative-result/log/$i.log 2>&1
judge
echo $i".bam文件的表达定量已经完成!!!生成"$i".count文件和"$i".log文件"
# echo ""
done
#rm -r /data/changchuanjun/necessary_packages/RunFeatureCounts ./
awk '{$1= "所有文件的表达定量已经完成!!! 结果已存放在 " location "/2.Quantification/quantitative-result/count/" " 目录下,相关日志文件(subread)已经存放在" location "/2.Quantification/quantitative-result/log/" "目录下 " ; print $1 }' location=${PWD} ./sample_information_file/gz_end_symbol.txt
echo ""
}
#定义合并矩阵函数
function merge_result {
judge_file ./3.Merge_result
echo ""
echo "6、开始对表达定量生成的count文件进行合并......"
ls ./2.Quantification/quantitative-result/count/*.count > ./3.Merge_result/genes.quant_files.txt
perl ./RunFeatureCounts/abundance_estimates_to_matrix.pl \
--est_method featureCounts \
--quant_files ./3.Merge_result/genes.quant_files.txt \
--out_prefix ./3.Merge_result/genes
judge
mv __tmp_runTMM.R ./3.Merge_result
awk '{$1= "所有表达定量生成的count文件合并已经完成!!! 结果已存放在 " location "/3.Merge_result" " 目录下" ; print $1 }' location=${PWD} ./sample_information_file/gz_end_symbol.txt
echo ""
}
function start_run {
if [ ! -e "process_double-end-data.py" ]; then
cp /data/changchuanjun/rnaseq_upstream_analysis_jiaoben/process_double-end-data.py ./
else
echo "当前目录下已经存在python脚本-->process_double-end-data.py......"
fi
echo "start......"
}
#------------------------------------------------------------------
total_judge
start_run
python process_double-end-data.py
data_fastqc
fastqc_result_multiqc
clean_data_fastp
#判断参考基因组文件夹是否存在
judge_original_file ./ref
refgenome_operation
hisat2_samtools
hisat2_mapping_count
#判断参考基因组注释文件夹是否存在
judge_original_file ./ref/gtf_zhushi
#判断参考基因组注释文件是否存在且数量符合要求
judge_refgenome_gtf_file_and_number
quantitative_expression
merge_result
echo $(date +%Y/%m/%d/%T) | while read id; do /data/Erick_Tong/software/email [email protected] xiezushu转录组数据上游分析完成$id;done
echo $(date +%Y/%m/%d/%T) | while read id; do /data/Erick_Tong/software/email [email protected] xiezushu转录组数据上游分析完成$id;done
bash /data/changchuanjun/wechat_remind.sh "Congratulation!!! ma-experiment-RNASeq上游分析已成功完成!!!"
#**********************************************************************
#分步shell脚本
function jiaoben_make {
mkdir jiaoben
awk '{print "fastqc -t 32 " $2 " -o " location "/fastqc_result/" }' location=${PWD} ./sample_information_file/whole_sample_name.txt > ./jiaoben/fastqc.sh
awk '{print "fastqc -t 32 " $2 " -o " location "/fastqc_result/" " 1>" location "/fastqc_result/" $1 "fastqc.sh.log 2>&1"}' location=${PWD} ./sample_information_file/whole_sample_name.txt > ./jiaoben/log_fastqc.sh
awk '{print "multiqc " location "/fastqc_result/" " -o " location "/multiqc_result" }' location=${PWD} ./sample_information_file/gz_end_symbol.txt > jiaoben/multiqc.sh
awk '{print "multiqc " location "/fastqc_result/" " -o " location "/multiqc_result" " 1>" location "/multiqc_result/multiqc_result.log 2>&1" }' location=${PWD} ./sample_information_file/gz_end_symbol.txt > jiaoben/log_multiqc.sh
awk '{print "fastp -w 32 \\\n" "-i " $2 "_1.fq.gz \\\n" "-I " $2 "_2.fq.gz \\\n" "-o " location "/clean_data/" $1 "_1_clean.fq.gz \\\n" "-O " location "/clean_data/" $1 "_2_clean.fq.gz \\\n" "-h " location "/clean_data/" $1 ".fastp.html \\\n" "-j " location "/clean_data/" $1 ".fastp.json \\\n" }' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/fastp.sh
awk '{print "fastp -w 32 \\\n" "-i " $2 "_1.fq.gz \\\n" "-I " $2 "_2.fq.gz \\\n" "-o " location "/clean_data/" $1 "_1_clean.fq.gz \\\n" "-O " location "/clean_data/" $1 "_2_clean.fq.gz \\\n" "-h " location "/clean_data/" $1 ".fastp.html \\\n" "-j " location "/clean_data/" $1 ".fastp.json \\\n" "1> " location "/clean_data/" $1 ".fastp.log \\\n" "2>&1"}' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/log_fastp.sh
awk '{print "hisat2-build " location "/ref/ref_genome/* " location "/ref/genome"}' location=${PWD} ./sample_information_file/gz_end_symbol.txt > jiaoben/hisat2-build.sh
awk '{print "hisat2-build " location "/ref/ref_genome/* " location "/ref/genome " "1> "location "/ref/hisat2-build.log 2>&1"}' location=${PWD} ./sample_information_file/gz_end_symbol.txt > jiaoben/log_hisat2-build.sh
awk '{print "hisat2 --new-summary \\\n" "-p 32 \\\n" "-x " location "/ref/genome \\\n" "-1 " location "/clean_data/" $1 "_1_clean.fq.gz \\\n" "-2 " location "/clean_data/" $1 "_2_clean.fq.gz \\\n" "-S " location "/1.Mapping/sam/" $1 ".sam \\\n" }' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/hisat2_mapping.sh
awk '{print "hisat2 --new-summary \\\n" "-p 32 \\\n" "-x " location "/ref/genome \\\n" "-1 " location "/clean_data/" $1 "_1_clean.fq.gz \\\n" "-2 " location "/clean_data/" $1 "_2_clean.fq.gz \\\n" "-S " location "/1.Mapping/sam/" $1 ".sam \\\n" "1> " location "/1.Mapping/sam/" $1 ".sam.log \\\n" "2>&1" }' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/hisat2_mapping.sh
awk '{print "samtools sort -m 16G -@ 32 -o " location "/1.Mapping/bam/" $1 ".bam " location "/1.Mapping/sam/" $1 ".sam" }' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/samtools_sort.sh
awk '{print "samtools sort -m 16G -@ 32 -o " location "/1.Mapping/bam/" $1 ".bam " location "/1.Mapping/sam/" $1 ".sam " "1> "location "/1.Mapping/bam/" $1 ".bam.log 2>&1"}' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/log_samtools_sort.sh
awk '{print "samtools index -@ 32 " location "/1.Mapping/bam/" $1 ".bam" }' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/samtools_index.sh
awk '{print "samtools index -@ 32 " location "/1.Mapping/bam/" $1 ".bam " "1> "location "/1.Mapping/bam/" $1 ".bai.log 2>&1"}' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/log_samtools_index.sh
awk '{print "Rscript " location "/RunFeatureCounts/run-featurecounts.R \\\n" "-b " location "/1.Mapping/bam/" $1 ".bam \\\n" "-g " location "/ref/gtf_zhushi/* \\\n" "-f exon -a gene_id \\\n" "-o " location "/2.Quantification/quantitative-result/" $1 }' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/quantification.sh
awk '{print "Rscript " location "/RunFeatureCounts/run-featurecounts.R \\\n" "-b " location "/1.Mapping/bam/" $1 ".bam \\\n" "-g " location "/ref/gtf_zhushi/* \\\n" "-f exon -a gene_id \\\n" "-o " location "/2.Quantification/quantitative-result/" $1 "\\\n" "1> " location "/2.Quantification/quantitative-result/" $1 ".quantification.log \\\n" "2>&1"}' location=${PWD} ./sample_information_file/remove_double_end_sequence_symbol_and_gz_end_symbol.txt > ./jiaoben/log_quantification.sh
awk '{print "ls ./2.Quantification/quantitative-result/count/*.count > ./3.Merge_result/genes.quant_files.txt"}' ./sample_information_file/gz_end_symbol.txt > jiaoben/merge_count.sh
awk '{print "ls ./2.Quantification/quantitative-result/count/*.count > ./3.Merge_result/genes.quant_files.txt " "1> " location "/3.Merge_result/" "merge_count.log " "2>&1"}' ./sample_information_file/gz_end_symbol.txt > jiaoben/log_merge_count.sh
awk '{print "perl ./RunFeatureCounts/abundance_estimates_to_matrix.pl \\\n" " --est_method featureCounts \\\n" " --quant_files ./3.Merge_result/genes.quant_files.txt \\\n" " --out_prefix ./3.Merge_result/genes \\\n" }' ./sample_information_file/gz_end_symbol.txt > jiaoben/abundance_estimates.sh
awk '{print "perl ./RunFeatureCounts/abundance_estimates_to_matrix.pl \\\n" " --est_method featureCounts \\\n" " --quant_files ./3.Merge_result/genes.quant_files.txt \\\n" " --out_prefix ./3.Merge_result/genes \\\n" " 1> " location "/3.Merge_result/" "abundance_estimates.log " "2>&1"}' ./sample_information_file/gz_end_symbol.txt > jiaoben/log_abundance_estimates.sh
}
jiaoben_make
exit()