generated from Redmar-van-den-Berg/snakemake-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
241 lines (220 loc) · 8.02 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
pepfile: config['pepfile']
config = pep.config.get('HiFi-assembly', dict())
samples = pep.sample_table['sample_name']
include: 'common.smk'
rule all:
input:
assembly = [f'{sample}/assembly/{sample}.assembly.fasta' for sample in samples],
mapped_contigs = [f'{sample}/bamfile/{sample}_contigs.bam' for sample in samples] if 'reference' in config else [],
json = [f'{sample}/blast/{sample}_contigs_blast.json' for sample in samples] if 'genes' in config else [],
ec_bam = [f'{sample}/bamfile/{sample}.ec.bam' for sample in samples] if 'hifiasm-write-ec' in config and 'reference' in config else [],
rule bam_to_fasta:
input:
get_bamfiles
output:
'{sample}/input/{sample}.fasta.gz'
log:
'log/{sample}_bam_to_fasta.txt'
container:
containers['minimap2']
shell: """
# Make sure the output folder exists
mkdir -p {wildcards.sample}
# Remove the output file if it exists, since we will be appending
rm -f {output} {log}
for bamfile in {input}; do
# Gzip files can be appended together
samtools fasta $bamfile 2>> {log} | gzip >> {output}
done
"""
rule assemble:
input:
fasta = rules.bam_to_fasta.output,
output:
# Haplotype-resolved raw unitig graph.
# This graph keeps all haplotype information.
r_utg = hifiasm.r_utg('{sample}'),
# Haplotype-resolved processed unitig graph without small bubbles.
# Small bubbles might be caused by somatic mutations or noise in data,
# which are not the real haplotype information.
p_utg = hifiasm.p_utg('{sample}'),
# Assembly graph of primary contigs. This graph includes a complete
# assembly with long stretches of phased blocks.
# NOTE: The assembly contains occasional phase switching, use with
# extreme caution
p_ctg = hifiasm.p_ctg('{sample}'),
# Alternate assembly contig graph (prefix.a_ctg.gfa). This graph
# consists of all assemblies that are discarded in primary contig graph
a_ctg = hifiasm.a_ctg('{sample}'),
# Partially phased contig graph of haplotype1
hap1 = hifiasm.hap1('{sample}'),
# Partially phased contig graph of haplotype2
hap2 = hifiasm.hap2('{sample}'),
# Error corrected input reads
ec_fasta = '{sample}/assembly/{sample}.ec.fa' if 'hifiasm-write-ec' in config else [],
params:
flags = ' '.join(config['hifiasm-flags']),
threads:
12
log:
'log/{sample}_hifiasm.txt'
container:
containers['hifiasm']
shell: """
hifiasm -o {wildcards.sample}/assembly/{wildcards.sample} \
-t {threads} \
{params.flags} \
{input.fasta} &> {log}
"""
rule assembly_to_fasta:
""" Convert the gfa assembly graph to fasta format """
input:
gfa = get_assembly(),
script = srcdir('scripts/gfa-to-fasta.py')
output:
'{sample}/assembly/{sample}.assembly.fasta'
log:
'log/{sample}_assembly_to_fasta.txt'
container:
containers['pyblast']
shell: """
python3 {input.script} {input.gfa} > {output} 2> {log}
"""
rule map_contigs:
""" Map the assembled contigs against the reference """
input:
contigs = rules.assembly_to_fasta.output,
reference = config.get('reference', '')
output:
mapped_bam = '{sample}/bamfile/{sample}_contigs.bam',
mapped_bai = '{sample}/bamfile/{sample}_contigs.bam.bai',
unmapped_bam = '{sample}/bamfile/{sample}_contigs_unmapped.bam',
unmapped_bai = '{sample}/bamfile/{sample}_contigs_unmapped.bam.bai',
log:
minimap = 'log/{sample}_contigs_minimap.txt',
samtools = 'log/{sample}_contigs_samtools.txt',
container:
containers['minimap2']
threads:
12
shell: """
minimap2 -a {input.reference} {input.contigs} -t {threads} 2> {log.minimap} \
| samtools sort \
| samtools view \
-h \
-b \
-o {output.mapped_bam} \
-U {output.unmapped_bam} \
-F 4 2>{log.samtools}
samtools index {output.mapped_bam}
samtools index {output.unmapped_bam}
"""
rule map_ec_reads:
""" Map the assembled contigs against the reference """
input:
contigs = rules.assemble.output.ec_fasta,
reference = config.get('reference', '')
output:
mapped_bam = '{sample}/bamfile/{sample}.ec.bam',
mapped_bai = '{sample}/bamfile/{sample}.ec.bam.bai',
unmapped_bam = '{sample}/bamfile/{sample}.ec.unmapped.bam',
unmapped_bai = '{sample}/bamfile/{sample}.ec.unmapped.bam.bai',
log:
minimap = 'log/{sample}_map_ec_reads.txt',
samtools = 'log/{sample}_map_ec_reads_samtools.txt',
container:
containers['minimap2']
threads:
12
shell: """
minimap2 -a {input.reference} {input.contigs} -t {threads} 2> {log.minimap} \
| samtools sort \
| samtools view \
-h \
-b \
-o {output.mapped_bam} \
-U {output.unmapped_bam} \
-F 4 2>{log.samtools}
samtools index {output.mapped_bam}
samtools index {output.unmapped_bam}
"""
rule make_blast_db:
""" Create a blast database from the assembled contigs """
input:
contigs = rules.assembly_to_fasta.output,
params:
dbname = lambda wildcards, output: output[0][:-4]
output:
nhr = '{sample}/blast/{sample}.blastdb.nhr',
nin = '{sample}/blast/{sample}.blastdb.nin',
nsq = '{sample}/blast/{sample}.blastdb.nsq',
folder = directory('{sample}/blast'),
log:
'log/{sample}_make_blast_db.txt'
container:
containers['pyblast']
shell: """
mkdir -p {output.folder} 2> {log}
# If the assembly is empty, we need to create a dummy file or
# makeblastdb will crash
input={input}
if [ ! -s $input ]; then
input={wildcards.sample}/blast/dummy.fasta
echo ">dummy" > $input
echo "A" >> $input
fi
makeblastdb \
-input_type fasta \
-dbtype nucl \
-in $input \
-out {params.dbname} 2>> {log}
"""
rule blast_contigs:
""" Run blast against the blast database of the contigs """
input:
blastdb = rules.make_blast_db.output.nhr,
genes = config.get('genes', ''),
params:
dbname = lambda wildcards, input: input[0][:-4]
output:
xml = '{sample}/blast/{sample}_blast.xml'
log:
'log/{sample}_blast_contigs.txt'
container:
containers['pyblast']
shell: """
blastn \
-outfmt 5 \
-query {input.genes} \
-db {params.dbname} \
-out {output} 2> {log}
"""
rule parse_blast_results:
""" Parse the blast results of the genes against the assembled contigs """
input:
blast_results = rules.blast_contigs.output,
contigs = rules.assembly_to_fasta.output,
genes = config.get('genes', ''),
folder = rules.make_blast_db.output.folder,
script = srcdir('scripts/parse-blast.py')
params:
blast_output = config['blast-output']
output:
json = '{sample}/blast/{sample}_contigs_blast.json',
log:
'log/{sample}_parse_blast_result.txt'
container:
containers['pyblast']
shell: """
# Make sure there are no fasta files, since script appends to output files
rm -f {input.folder}/*.fasta &> {log}
python3 {input.script} \
--database {input.blast_results} \
--query {input.genes} \
--json {output.json} \
--genes {input.folder} \
--gene-prefix {wildcards.sample} \
--contigs {input.contigs} \
--blast-output {params.blast_output} \
&>> {log}
"""