-
Notifications
You must be signed in to change notification settings - Fork 1
/
assembly_and_mapper_script.sh
152 lines (116 loc) · 5.18 KB
/
assembly_and_mapper_script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# assembly_and_mapper_script.sh
# a bash script for de-novo assembly of RAD-tags (dDocent-style) from demultiplexed, single-end fastq files and mapping of samples to the de-novo reference
# Copyright (C) 2017, ETH Zurich, Mathias Scharmann
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# If you use this code please cite:
#
# "Scharmann M, Grafe TU, Metali F, Widmer A. (2017) Sex-determination
# and sex chromosomes are shared across the radiation of dioecious
# Nepenthes pitcher plants. XXX"
# AND/OR
# "Scharmann M, Metali F, Grafe TU, Widmer A. (2017) Divergence with
# gene flow among multiple sympatric carnivorous Nepenthes pitcher
# plants is linked to trap morphology. XXX"
#
# contact: mathias.scharmann[-at-]env.ethz.ch or msph52[-at-]gmail.com
## the original dDocent comes with an MIT licence which is included here:
# The MIT License (MIT)
#
# Copyright (c) 2013 jpuritz
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# start dDocent-like assembly:
# concatenate everything
zcat *.fq.gz | mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' > concat.fasta
#Convert fasta format to just sequences
mawk '!/>/' concat.fasta > concat.seq
#Find every unique sequence and count the number of occurrences
perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' concat.seq > uniq.seqs
#Create a data file with the number of unique sequences and the number of occurrences
ONE=$(mawk '$1 >= 2' uniq.seqs | wc -l)
echo -e "2""\t""$ONE" > uniqseq.data
for ((i = 3; i <= 50; i++));
do
J=$(mawk -v x=$i '$1 >= x' uniq.seqs | wc -l)
echo -e "$i""\t""$J" >> uniqseq.data
done
#Function to convert a file of unique sequences to fasta format
uniq2fasta()
{
i=1
cat $1 | while read line
do
echo ">Contig"$i
echo $line
i=$(($i + 1))
done
}
#Main function for assembly
Assemble()
{
# main arguments: $CUTOFF, $vsearch_id, $simC
# secondary arguments: $NUMProc
mawk -v x=$CUTOFF '$1 >= x' uniq.seqs | cut -f 2 > totaluniqseq
#Convert reads to fasta
uniq2fasta totaluniqseq > uniq.fasta
# instead of rainbow which wants PE data, we cluster with vsearch like pyRAD does:
vsearch --fasta_width 0 --threads $NUMProc --id $vsearch_id --cluster_fast uniq.fasta --msaout vsearch_msaout.txt
# instead of rainbow merge and best contig selection (which do not work with single-end data
# use my python script:
python ./vsearch_msaout_global_align_muscle.v2.py --vsearch_msa vsearch_msaout.txt
#cd-hit to cluster reads based on sequence similarity
cd-hit-est -i vsearch_muscle.fasta -o referenceRC.fasta -M 0 -T $NUMProc -c $simC &>cdhit.log
#seqtk seq -r referenceRC.fasta > reference.fasta.original
#rm referenceRC.fasta
mv referenceRC.fasta reference.fasta.original
sed -e 's/^C/NC/g' -e 's/^A/NA/g' -e 's/^G/NG/g' -e 's/^T/NT/g' -e 's/T$/TN/g' -e 's/A$/AN/g' -e 's/C$/CN/g' -e 's/G$/GN/g' reference.fasta.original > reference.fasta
samtools faidx reference.fasta
bwa index reference.fasta
}
CUTOFF=10
NUMProc=12
simC=0.9
vsearch_id=0.9
Assemble
echo Done!
######
######################
# mapping with bwa and SNP calling in freebayes (ignore invariant sites!)
######################
for i in $( ls *.fq.gz | sed 's/.fq.gz//g' ) ; do
bwa mem -a reference.fasta $i.fq.gz -L 20,5 -t $NUMProc -M -T 10 -R "@RG\tID:$i\tSM:$i\tPL:Illumina" 2> bwa.$i.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@$NUMProc -q 1 -SbT reference.fasta - > $i.bam 2>$i.bam.log
samtools sort -@$NUMProc $i.bam -o $i.sorted
rm $i.bam
mv $i.sorted $i-RG.bam
samtools index $i-RG.bam
done
rm *.bam.log bwa.*.log
rm concat.fasta concat.seq uniq.seqs totaluniqseq uniq.fasta vsearch_msaout.txt vsearch_muscle.fasta