-
Notifications
You must be signed in to change notification settings - Fork 15
/
preprocess.sh
118 lines (111 loc) · 5.14 KB
/
preprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/bash
#
#
# This is the script for preprocessing.
#
# Quick guide:
# ./preprocess.sh <inputfile> <inputtype bam/sam/fastq/fasta> <platform iontor/illumina/454> <length_cutoff> <left_cutoff> <right_cutoff>
#
#
### Authors : Yang Li <[email protected]>
### License : GPL 3 <http://www.gnu.org/licenses/gpl.html>
### Update : 2015-08-05
### Copyright (C) 2015 Yang Li and Xuejun Ma - All Rights Reserved
if [ $# -lt 7 ]
then
echo "usage: $0 <inputfile> <inputtype bam/sam/fastq/fasta> <platform iontor/illumina/454> <length_cutoff> <left_cutoff> <right_cutoff> <quality_cutoff>"
echo -e ""
exit 65
fi
########
INPUT=$1
format=$2
platform=$3
length_cutoff=$4
left_cutoff=$5
right_cutoff=$6
quality_cutoff=$7
########
########
if [ "$format" = "bam" ] || [ "$format" = "sam" ] || [ "$format" = "fastq" ] || [ "$format" = "fasta" ]
then
echo -e "The format of Input file is $format"
else
echo "$0:NOT A VALID FILE FORMAT. Please check the format of the input file. Currently the format supported by this pipeline are : sam/bam/fastq/fasta"
exit 65
fi
if [ "$platform" = "illumina" ] || [ "$platform" = "iontor" ] || [ "$platform" = "454" ]
then
echo -e "The file generated by $platform platform"
else
echo -e "The file generated by $platform platform cannot be supported. Please check the platform. Currently the format supported by this pipeline are : iontor/illumina/454"
exit 65
fi
echo -e "$(date)\t$0\tPreprocess for file $INPUT begins"
if [ "$format" = "bam" ] || [ "$format" = "sam" ]
then
echo -e "$0\tBegin to transformat"
picard-tools SamToFastq.jar INPUT=$INPUT FASTQ=$INPUT.fq
total_num=$(wc -l $INPUT.fq | awk '{print $1}')
let "num=$total_num/4"
echo -e "The number of reads are $num"
echo -e "$0\tBegin to Quality Control"
#seqtk trimfq -b 20 $INPUT.fq > $INPUT.end
seqtk trimfq -b "$left_cutoff" -e "$right_cutoff" $INPUT.fq > $INPUT.end
num_total_reads=`prinseq-lite.pl -stats_info -fastq $INPUT.fq | grep "reads" | awk '{print$3}'`
#mv $INPUT.fq
if [ "$platform" = "illumina" ]
then
prinseq-lite.pl -min_len "$length_cutoff" -min_qual_mean "$quality_cutoff" noniupac -lc_method dust -lc_threshold 7 -fastq $INPUT.end -out_good $INPUT.preprocessed
elif [ "$platform" = "iontor" ]
then
prinseq-lite.pl -min_len "$length_cutoff" -min_qual_mean "$quality_cutoff" noniupac -lc_method dust -lc_threshold 7 -fastq $INPUT.end -out_good $INPUT.preprocessed
elif [ "$platform" = "454" ]
then
prinseq-lite.pl -min_len "$length_cutoff" -min_qual_mean "$quality_cutoff" noniupac -lc_method dust -lc_threshold 7 -fastq $INPUT.end -out_good $INPUT.preprocessed
fi
elif [ "$format" = "fastq" ]
then
echo "$0\tBegin to quality control"
total_num=$(wc -l $INPUT | awk '{print $1}')
let "num=$total_num/4"
echo -e "The number of reads are $num"
echo -e "$0\tBegin to Quality Control"
num_total_reads=`prinseq-lite.pl -stats_info -fastq $INPUT | grep "reads" | awk '{print$3}'`
echo "seqtk trimfq -b $left_cutoff -e $right_cutoff $INPUT > $INPUT.end"
seqtk trimfq -b $left_cutoff -e $right_cutoff $INPUT > $INPUT.end
if [ "$platform" = "illumina" ]
then
prinseq-lite.pl -min_len "$length_cutoff" -min_qual_mean "$quality_cutoff" noniupac -lc_method dust -lc_threshold 7 -fastq $INPUT.end -out_good $INPUT.preprocessed
elif [ "$platform" = "iontor" ]
then
prinseq-lite.pl -min_len "$length_cutoff" -min_qual_mean "$quality_cutoff" noniupac -lc_method dust -lc_threshold 7 -fastq $INPUT.end -out_good $INPUT.preprocessed
elif [ "$platform" = "454" ]
then
prinseq-lite.pl -min_len "$length_cutoff" -min_qual_mean "$quality_cutoff" noniupac -lc_method dust -lc_threshold 7 -fastq $INPUT.end -out_good $INPUT.preprocessed
fi
elif [ "$format" = "fasta" ]
then
echo -e "$0\t[WARN:] Steps related to qual cannot be applied"
echo -e "$0\tBegin to Quality Control"
num_total_reads=`prinseq-lite.pl -stats_info -fasta $INPUT | grep "reads" | awk '{print$3}'`
if [ "$platform" = "illumina" ]
then
prinseq-lite.pl -trim_left "$left_cutoff" -trim_right "$right_cutoff" -min_len "$length_cutoff" noniupac -lc_method dust -lc_threshold 7 -fasta $INPUT -out_good $INPUT.preprocessed
elif [ "$platform" = "iontor" ]
then
prinseq-lite.pl -trim_left "$left_cutoff" -trim_right "$right_cutoff" -min_len "$length_cutoff" noniupac -lc_method dust -lc_threshold 7 -fasta $INPUT -out_good $INPUT.preprocessed
elif [ "$platform" = "454" ]
then
prinseq-lite.pl -trim_left "$left_cutoff" -trim_right "$right_cutoff" -min_len "$length_cutoff" noniupac -lc_method dust -lc_threshold 7 -fasta $INPUT -out_good $INPUT.preprocessed
fi
num_preprocessed_reads=`prinseq-lite.pl -stats_info -fasta $INPUT.preprocessed.fasta | grep "reads" | awk '{print$3}'`
fi
chmod 777 *.*
num_preprocessed_reads=`prinseq-lite.pl -stats_info -fastq $INPUT.preprocessed.fastq | grep "reads" | awk '{print$3}'`
low_quality_reads=$(( $num_total_reads - $num_preprocessed_reads))
#sudo sh -c 'echo -e "Total_sequences\t$num_total_reads" > $inputfile.reads_distribution'
echo -e "Total_sequences\t$num_total_reads" > $INPUT.reads_distribution
echo -e "Low_quality_reads\t$low_quality_reads" >> $INPUT.reads_distribution
#sudo sh -c 'echo -e "Low_quality_reads\t$low_quality_reads" >> $inputfile.reads_distribution'
#sudo rm -f