forked from abdelrahmanMA/ngs1_project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdated_codes.txt
211 lines (163 loc) · 7.47 KB
/
updated_codes.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#Install packages required
source activate ngs1
conda install -c bioconda sra-tools
conda install -c bioconda seqkit
conda install r
conda install -y bioconductor-deseq r-gplots
#Create a new file directory for the assignment
mkdir assign && cd assign
#Download dataset SRA file
wget -c ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR879/SRR8797509/SRR8797509.sra
# Converting SRA to FASTQ files
mkdir fastq_files && cd fastq_files
fastq-dump --split-files -X 5000000 ~/assign/SRR8797509.sra
# Splitting unshuffled samples
cd..
mkdir unshuffled_samples && cd unshuffled_samples
seqkit split2 -1 SRR8797509_1.fastq -2 SRR8797509_2.fastq -p 5 -O out -f
#QC for the unshuffled sample one
mkdir fastqc_reports && cd fastqc_reports
for f in ~/assign/unshuffled_samples/out/SRR8797509_*.part_001.fastq;do
fastqc -t 1 -f fastq -noextract -o ~/assign/unshuffled_samples/fastqc_reports $f;done
multiqc -z -o . .
#Shuffling 5M reads and splitting to 5 samples
cd..
mkdir shuffled_samples && cd shuffled_samples
seqkit shuffle ~/assign/fastq_files/SRR8797509_1.fastq >shuffled_1
seqkit shuffle ~/assign/fastq_files/SRR8797509_2.fastq >shuffled_2
seqkit split2 -1 shuffled_1.fastq -2 shuffled_2.fastq -p 5 -O out -f
#QC for shuffled sample one
mkdir fastqc_reports && cd fastqc_reports
for f in ~/assign/shuffled_samples/out/shuffled_*.part_001.fastq;do
fastqc -t 1 -f fastq -noextract -o ~/assign/shuffled_samples/fastqc_reports $f;done
multiqc -z -o . .
#Trimming unshuffled reads:
cd ..
cd unshuffled_samples
mkdir mild_trim && cd mild_trim
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.fastq"
R2="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.fastq"
newf1="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.pe.trim.fastq"
newf2="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.pe.trim.fastq"
newf1U="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.se.trim.fastq"
newf2U="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.se.trim.fastq"
adap="/home/nehal/miniconda3/envs/ngs1/share/trimmomatic-0.39-1/adapters"
trimmomatic PE -threads 1 -phred33 -trimlog trimLogFile -summary statsSummaryFile $R1 $R2 $newf1 $newf1U $newf2 $newf2U \
ILLUMINACLIP:$adap/TruSeq3-PE.fa:2:30:10:1 SLIDINGWINDOW:4:15 MINLEN:36
done
#Trimming shuffled reads
#I increased the minimum length, widened the sliding window and raised the required quality for aggressive trimming
cd
cd assign/shuffled_samples
mkdir agg_trim && cd agg_trim
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.fastq"
R2="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.fastq"
newf1="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.pe.trim.fastq"
newf2="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.pe.trim.fastq"
newf1U="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.se.trim.fastq"
newf2U="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.se.trim.fastq"
adap="/home/nehal/miniconda3/envs/ngs1/share/trimmomatic-0.39-1/adapters"
trimmomatic PE -threads 1 -phred33 -trimlog trimLogFile -summary statsSummaryFile $R1 $R2 $newf1 $newf1U $newf2 $newf2U \
ILLUMINACLIP:$adap/TruSeq3-PE.fa:2:30:10:1 SLIDINGWINDOW:6:18 MINLEN:38
done
#BWA alignment for unshuffled samples
#Indexing
cd
mkdir -p ~/assign/bwa_align/bwaIndex && cd ~/assign/bwa_align/bwaIndex
ln -s ~/workdir/sample_data/chr22_with_ERCC92.fa .
bwa index -a bwtsw chr22_with_ERCC92.fa
#Alignment
cd ..
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.pe.trim.fastq"
R2="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.pe.trim.fastq"
/usr/bin/time -v bwa mem bwaIndex/chr22_with_ERCC92.fa $R1 $R2 > SRR8797509_part_00${SAMPLE}.sam
done
#Hisat alignment for shuffled reads
#Indexing (Already done before)
cd ..
mkdir -p ~/assign/hisat_align && cd ~/assign/hisat_align
ln -s ~/workdir/hisat_align/hisatIndex
#Alignment
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.pe.trim.fastq"
R2="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.pe.trim.fastq"
hisat2 -p 1 -x hisatIndex/chr22_with_ERCC92 --dta --rna-strandness RF -1 $R1 -2 $R2 -S shuff_SRR8797509_part_00${SAMPLE}.sam
done
#Assembly_unshuffled samples
cd ..
cd bwa_align
for SAMPLE in 1 2 3 4 5
do
R="/home/nehal/assign/bwa_align/SRR8797509_part_00${SAMPLE}.sam"
samtools view -bS $R > SRR8797509_part_00${SAMPLE}.bam
done
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/bwa_align/SRR8797509_part_00${SAMPLE}.bam"
samtools sort $R -o SRR8797509_part_00${SAMPLE}.sorted.bam
done
#with previous known annotations
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/bwa_align/SRR8797509_part_00${SAMPLE}.sorted.bam"
stringtie $R --rf -l ref_sup_${SAMPLE} -G ~/workdir/sample_data/chr22_with_ERCC92.gtf -o ref_sup_${SAMPLE}.gtf
done
#without known annotation
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/bwa_align/SRR8797509_part_00${SAMPLE}.sorted.bam"
stringtie $R --rf -l ref_free_${SAMPLE} -o ref_free_${SAMPLE}.gtf
done
#Assembly_shuffled samples
cd ..
cd hisat_align
for SAMPLE in 1 2 3 4 5
do
R="/home/nehal/assign/hisat_align/shuff_SRR8797509_part_00${SAMPLE}.sam"
samtools view -bS $R > shuff_SRR8797509_part_00${SAMPLE}.bam
done
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/hisat_align/shuff_SRR8797509_part_00${SAMPLE}.bam"
samtools sort $R -o shuff_SRR8797509_part_00${SAMPLE}.sorted.bam
done
#with previous known annotations
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/hisat_align/shuff_SRR8797509_part_00${SAMPLE}.sorted.bam"
stringtie $R --rf -l ref_sup_${SAMPLE} -G ~/workdir/sample_data/chr22_with_ERCC92.gtf -o ref_sup_${SAMPLE}.gtf
done
#without known annotation
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/hisat_align/shuff_SRR8797509_part_00${SAMPLE}.sorted.bam"
stringtie $R --rf -l ref_free_${SAMPLE} -o ref_free_${SAMPLE}.gtf
done
#GTF Compare Unshuffled
#In a new terminal
source activate ngs-gtf
mkdir -p ~/assign/gtf-compare-unshuff/gtfs && cd ~/assign/gtf-compare-unshuff/gtfs
ln -s ~/assign/bwa_align/ref_sup_*.gtf .
ln -s ~/assign/bwa_align/ref_free_*.gtf .
ln -s ~/workdir/gtf-compare/comp.py .
ln -s ~/workdir/gtf-compare/stat.py .
python comp.py -r ../gtfs/ref_sup_*.gtf ../gtfs/ref_free_*.gtf
python stat.py
#GTF Compare Shuffled
mkdir -p ~/assign/gtf-compare-shuff/gtfs && cd ~/assign/gtf-compare-shuff/gtfs
ln -s ~/assign/hisat_align/ref_sup_*.gtf .
ln -s ~/assign/hisat_align/ref_free_*.gtf .
ln -s ~/workdir/gtf-compare/comp.py .
ln -s ~/workdir/gtf-compare/stat.py .
python comp.py -r ../gtfs/ref_sup_*.gtf ../gtfs/ref_free_*.gtf
python stat.py
#I stopped here as I received this error ValueError: IntervalTree: Null Interval objects not allowed in IntervalTree: Interval(10941780, 10940612, '-')
#I also had a problem downloading subread package CondaHTTPError: HTTP 000 CONNECTION FAILED for url <https://conda.anaconda.org/bioconda/linux-64/subread-1.6.4-h84994c4_1.tar.bz2>
#ThankYou