-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathCp-hap.sh
158 lines (137 loc) · 3.97 KB
/
Cp-hap.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/bin/bash
# Exit immediately if any command fails.
set -e
# Directory holding the helper Python scripts, resolved relative to the
# location of this script. "$0" is quoted so that an install path containing
# whitespace reaches dirname as a single operand. The trailing slash is kept
# so the value is unchanged for code that already uses "$SOURCE".
SOURCE="$(dirname -- "$0")/scripts/"
#######################################
# Print the help text for Cp-hap.sh.
# Arguments: none
# Outputs:   the help text on stdout, preceded and followed by a blank line
#######################################
usage() {
  # A quoted heredoc prints the text literally, so the single quotes around
  # 'lsc', 'ssc' and 'ir' are shown to the user instead of being consumed by
  # the shell as quote delimiters.
  cat <<'EOF'

[Cp-hap, chloroplast genome haplotype Detection, detect chloroplast genome structural heteroplasmy using long-reads]
Usage: bash Cp-hap.sh -r reads -g chloroplastGenome.fa -o outputDir [options]
Required:
-r the path of long-read file in fa/fq format, can be compressed(.gz).
-g the path of chloroplast genome, chloroplast genome should be in fa format, not gzip. The chloroplast genome file should only have three sequences, named as 'lsc', 'ssc' and 'ir' (see testData/Epau.format.fa as an example). It does not matter which orientation is for lsc, ssc and ir.
-o the path of outputDir.
Options:
-t number of threads. Default is 1.
-x readType, only can be map-pb (PacBio reads) or map-ont (Nanopore reads). Default is map-pb.
-d minimum distance of exceeding the first and last conjunctions (such as lsc/ir and ir/ssc). 1 means 1 bp, 1000 means 1 kb. Default is 1000.

EOF
}
# Default values for the optional settings; -t, -x and -d override them.
threads=1
readType='map-pb'
minDistance=1000
# Regular expression matching a non-negative decimal integer; used to
# validate the values given to -t and -d.
intRe='^[0-9]+$'

#######################################
# Parse and validate the command-line options.
# Globals:   intRe (read);
#            chloroplastGenome, reads, outputDir, threads, minDistance,
#            readType (written)
# Arguments: the command-line arguments of the script ("$@")
# Outputs:   error messages on stderr; the usage text on stdout for -h,
#            for an unknown option and for an option missing its argument
# Returns:   0 when every option is valid. Exits the shell with status 1 on
#            the first invalid option or value, and with status 0 after -h.
#######################################
parse_args() {
  local opt
  # Start option scanning from the first argument on every call.
  local OPTIND=1

  # The leading ':' selects silent error reporting, so unknown options and
  # missing arguments are handled by the '\?' and ':' arms below.
  while getopts ":hr:g:o:t:d:x:" opt; do
    case "$opt" in
      g)
        chloroplastGenome=$OPTARG
        if [[ ! -f "$chloroplastGenome" ]]; then
          echo "ERROR: $chloroplastGenome is not a file" >&2
          exit 1
        fi
        ;;
      r)
        reads=$OPTARG
        if [[ ! -f "$reads" ]]; then
          echo "ERROR: $reads is not a file" >&2
          exit 1
        fi
        ;;
      o)
        outputDir=$OPTARG
        ;;
      t)
        threads=$OPTARG
        if ! [[ $threads =~ $intRe ]]; then
          echo "ERROR: threads should be an integer, $threads is not an integer" >&2
          exit 1
        fi
        ;;
      d)
        minDistance=$OPTARG
        if ! [[ $minDistance =~ $intRe ]]; then
          echo "ERROR: minDistance should be an integer, $minDistance is not an integer" >&2
          exit 1
        fi
        ;;
      x)
        readType=$OPTARG
        # [[ ]] with quoted operands: an empty or multi-word value is
        # compared as a plain string and rejected, instead of making the
        # test itself fail and the bad value slip through.
        if [[ "$readType" != 'map-pb' && "$readType" != 'map-ont' ]]; then
          echo "ERROR: readType must be map-pb or map-ont." >&2
          exit 1
        fi
        ;;
      h)
        usage
        exit 0
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        usage
        exit 1
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        usage
        exit 1
        ;;
    esac
  done
}

# Get arguments.
parse_args "$@"
# Make sure an executable minimap2 can be found in PATH before doing any work.
# Diagnostics go to stderr, like the invalid-option messages of the parser.
if ! [[ -x "$(command -v minimap2)" ]]; then
  echo 'ERROR: minimap2 was not found, please add it into the path (e.g "export PATH=/path/of/script/:$PATH") before running this script' >&2
  exit 1
fi

# All three required options (-g, -r, -o) must have been given.
if [[ -z "$chloroplastGenome" || -z "$reads" || -z "$outputDir" ]]; then
  echo "ERROR: -g or -r or -o has not been set" >&2
  usage
  exit 1
fi
# Create the output directory (and any missing parent directories).
# Every expansion below is quoted so that paths containing whitespace or
# glob characters are handled as a single word.
mkdir -p -- "$outputDir"

# Report the effective settings.
echo 'Parameters:'
echo 'ChloroplastGenome:' "$chloroplastGenome"
echo 'Input long-read:' "$reads"
echo 'Read type:' "$readType"
echo 'OutputDir:' "$outputDir"
echo 'Threads' "$threads"
echo 'MinDistance' "$minDistance"

# minimap2 output: <outputDir>/<reads file name without its last extension>.paf
minimapOutput="$outputDir/$(basename -- "${reads%.*}").paf"
# Combinations of different directions of single copy; this path is passed to
# getDifferentDirectionCombine.py and then used as the minimap2 reference.
reference="$outputDir/dir_directions_$(basename -- "${chloroplastGenome%.*}")"
# Final output result, named after both the reads file and the genome file.
outputFile="$outputDir/result_$(basename -- "${reads%.*}")_$(basename -- "${chloroplastGenome%.*}")"
# Step 1: get combinations of different direction of single copy.
# Paths are quoted throughout so that whitespace or glob characters in the
# install directory, the inputs or the output directory do not split or
# expand the arguments.
echo "creating different references"
python "$SOURCE/getDifferentDirectionCombine.py" \
  "$chloroplastGenome" \
  "$reference"

# Step 2: run minimap2, mapping the long reads onto the generated reference.
# The alignment is written to stdout and captured in the PAF file.
echo "mapping long-reads to reference using minimap2"
minimap2 \
  -x "$readType" \
  --secondary=no \
  -t "$threads" \
  -L \
  -c \
  "$reference" \
  "$reads" > "$minimapOutput"

# Step 3: check orientation ratio by parsing the alignments.
echo "parsing result"
python "$SOURCE/parse.py" \
  "$chloroplastGenome" \
  "$outputFile" \
  "$minimapOutput" \
  "$minDistance"