# ko_annotation.snake
import sys
import traceback
import yaml
import glob
import os.path
import tempfile
from itertools import chain
from functools import partial
from collections import defaultdict
from psutil import virtual_memory
from subprocess import Popen, PIPE
from snakemake.exceptions import WorkflowError
from os.path import basename, dirname, abspath, realpath, getsize
from scripts.common import detect_reads, fill_default_values, extended_glob, replace_extensions, get_extension, get_resource_real
# taken from enrichm
# https://github.com/geronimp/enrichM/blob/master/enrichm/genome.py
ROOT = config["ROOT"]
KO_HMM = config["KO_HMM"]
KO_HMM_CUTOFFS = config["KO_HMM_CUTOFFS"]
SCRIPTS = config["scripts"]
CONFIG_PATH = config["CONFIG_PATH"]
config2 = yaml.full_load(open(CONFIG_PATH))
# ------- Cluster resources -------
# config gives memory in GB, but Snakemake compares mem_mb in MB, hence the *1000
SLURM_PARTITIONS = [[specs["name"], 1000*int(specs["min_mem"]), 1000*int(specs["max_mem"]),
                     int(specs["min_threads"]), int(specs["max_threads"])]
                    for name, specs in config2["slurm_partitions"].items() if name]
# rules take a function for resources, so bind the static arguments with partial
def get_resource(mode, **kwargs):
    return partial(get_resource_real, SLURM_PARTITIONS=SLURM_PARTITIONS, mode=mode, **kwargs)
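
# A minimal sketch of the slurm_partitions layout this expects in the config
# file at CONFIG_PATH (illustrative only: the partition name and bounds are
# assumptions, not the project's real cluster settings; memory is in GB):
#
#   slurm_partitions:
#     compute:
#       name: "compute"
#       min_mem: 4
#       max_mem: 128
#       min_threads: 1
#       max_threads: 32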
# example snakemake invocation:
# snakemake -s /mnt/gpfs/seb/Applications/Snakescripts/faster_koannotation.snake --cores 500 CD_TREAT/profile/cov_KEGG.tsv
rule results:
    input: "%s/contigs_KEGG_best_hits.tsv" % ROOT

rule koannotation:
    input: "{path}.faa"
    output: "{path}_ko.out"
    threads: 5
    resources:
        slurm_partition = get_resource("partition"),
        mem_mb = get_resource("mem"),
    singularity: "docker://quay.io/annacprice/gtdbtk:1.4.0"
    shell: """
        if [ -s {input} ]
        then
            hmmsearch --cpu {threads} -o /dev/null --noali --domtblout {output} {KO_HMM} {input}
        fi
        touch {output}
        """
rule parse_results:
    input: file = "{path}_ko.out"
    output: out = "{path}_ko.tsv"
    log: "{path}_ko.log"
    resources:
        slurm_partition = get_resource("partition"),
        mem_mb = get_resource("mem"),
    run:
        try:
            orf_to_ann = generate_orf_annotation(input["file"])
            with open(output["out"], "w") as handle:
                handle.writelines("%s\t%s\n" % (orf, "\t".join(ann)) for orf, ann in orf_to_ann.items())
        except Exception:
            # keep the full traceback in the log so failed batches can be debugged
            with open(log[0], 'w') as handle:
                handle.write(traceback.format_exc())
            raise
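
# Each per-batch *_ko.tsv line is the orf id followed by one column per
# retained KO, e.g. (hypothetical ids):
#   contig_1_orf_3	K00001	K00161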
# use metahood's standard splitting of orfs to parallelise annotation
rule annotation_kofamscan:
    input: expand("{{path}}/temp_splits/Batch_{nb}_ko.tsv", nb=range(100))
    output: out = "{path}/contigs_KEGG_best_hits.tsv"
    log: "{path}/KEGG_best_hits.log"
    resources:
        slurm_partition = get_resource("partition"),
        mem_mb = get_resource("mem"),
    run:
        try:
            # get KO definitions, keyed by KO identifier
            ko_to_def = {line.rstrip().split("\t")[0]: line.rstrip().split("\t")[-1] for line in open(KO_HMM_CUTOFFS)}
            # parse and write results at the same time
            with open(output["out"], "w") as handle:
                handle.write("%s\n" % '\t'.join(['orf', "KO", "KO definition"]))
                for file in input:
                    for line in open(file):
                        sline = line.rstrip().split("\t")
                        orf = sline[0]
                        handle.writelines("%s\t%s\t%s\n" % (orf, KO, ko_to_def[KO]) for KO in sline[1:])
        except Exception:
            with open(log[0], 'w') as handle:
                handle.write(traceback.format_exc())
            raise
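
# Final contigs_KEGG_best_hits.tsv rows look like (hypothetical orf id;
# K00001 is KEGG's alcohol dehydrogenase entry):
#   orf	KO	KO definition
#   contig_1_orf_3	K00001	alcohol dehydrogenase [EC:1.1.1.1]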
def parse_ko_cutoffs():
    cut_ko = {}
    with open(KO_HMM_CUTOFFS) as out_io:
        _ = out_io.readline()  # skip the header line
        for line in out_io:
            sline = line.strip().split('\t')
            if sline[1] == '-':
                # no curated threshold for this KO
                cut_ko[sline[0]] = [0.0, "NA"]
            else:
                cut_ko[sline[0]] = [float(sline[1]), sline[2]]
    return cut_ko
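
# parse_ko_cutoffs assumes KO_HMM_CUTOFFS follows KOfam's ko_list layout:
# a header line, then tab-separated rows with the KO id, score threshold and
# score type ('full' or 'domain') in the first three columns and the KO
# definition last. Illustrative rows (middle columns elided, threshold
# values made up):
#   K00001	339.26	full	...	alcohol dehydrogenase [EC:1.1.1.1]
#   K00005	-	-	...	glycerol dehydrogenase [EC:1.1.1.6]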
def from_hmmsearch_results(hmmsearch_output_path,
                           evalue_cutoff=1e-05,
                           bitscore_cutoff=0,
                           percent_aln_query_cutoff=0,
                           percent_aln_reference_cutoff=0,
                           acc=True):
    '''
    Parse input hmmsearch file

    Parameters
    ----------
    hmmsearch_output_path - String. Path to domtblout file containing
                            hmmsearch results.
    evalue_cutoff - Float. E-value threshold for annotations.
    bitscore_cutoff - Float. Bit score threshold for annotations.
    percent_aln_query_cutoff - Float. Threshold for the percent of the query
                               that must be aligned to consider the annotation.
    percent_aln_reference_cutoff - Float. Threshold for the percent of the
                                   reference that must be aligned to consider
                                   the annotation.
    acc - Bool. If True, report the KO HMM name instead of the accession field.

    Yields
    ------
    A sequence name, accession, E-value and region hit for every annotation
    result in hmmsearch_output_path that passes the cutoffs.
    '''
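    # Column layout of a HMMER3 --domtblout line (22 fixed fields, then the
    # free-text description that the [:22] slice below discards):
    #   1 target name, 2 target accession, 3 tlen, 4 query name,
    #   5 query accession, 6 qlen, 7 full E-value, 8 full score, 9 full bias,
    #   10 dom #, 11 dom of, 12 c-Evalue, 13 i-Evalue, 14 dom score,
    #   15 dom bias, 16-17 hmm from/to, 18-19 ali from/to, 20-21 env from/to,
    #   22 acc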
    # extract KO-specific cutoffs:
    specific_cutoffs = parse_ko_cutoffs()
    for line in open(hmmsearch_output_path):
        # Skip headers
        if line.startswith('#'):
            continue
        # Parse hmmsearch line. '_'s represent unimportant entries. Line
        # is trimmed using [:22] to remove the sequence description.
        try:
            seqname, _, tlen, ko_hmm, accession, qlen, _, score, \
                _, _, _, _, i_evalue, dom_score, _, _, \
                _, seq_from, seq_to, _, _, _ = line.strip().split()[:22]
        except ValueError:
            print("issue with formatting for line\n%s" % line, file=sys.stderr)
            continue
        # Determine sequence and HMM spans
        seq_list = [int(seq_from), int(seq_to)]
        # Calculate the percent of the query and reference aligned to each other.
        perc_seq_aln = (max(seq_list) - min(seq_list)) / float(tlen)
        perc_hmm_aln = (max(seq_list) - min(seq_list)) / float(qlen)
        if acc:
            accession = ko_hmm
        # Apply the KO-specific cutoff on the full-sequence or domain score,
        # depending on the score type recorded for that KO
        if specific_cutoffs:
            if ko_hmm in specific_cutoffs:
                if specific_cutoffs[ko_hmm][1] == 'full':
                    if float(score) < specific_cutoffs[ko_hmm][0]:
                        continue
                elif specific_cutoffs[ko_hmm][1] == 'domain':
                    if float(dom_score) < specific_cutoffs[ko_hmm][0]:
                        continue
        # Yield the annotation if it also passes the generic cutoffs
        if (float(i_evalue) <= evalue_cutoff and
                float(score) >= bitscore_cutoff and
                perc_seq_aln >= percent_aln_query_cutoff and
                perc_hmm_aln >= percent_aln_reference_cutoff):
            yield seqname, [accession], float(i_evalue), range(min(seq_list), max(seq_list))
def generate_orf_annotation(file):
    # collect the hits passing the cutoffs
    orf_to_annotations = defaultdict(list)
    for orf, [accession], evalue, rng in from_hmmsearch_results(file):
        orf_to_annotations[orf].append([accession, evalue, rng])
    # only keep orfs with a unique hit as a first pass
    orf_to_ann = {orf: [val[0][0]] for orf, val in orf_to_annotations.items() if len(val) == 1}
    # for the rest, sort hits by e-value, keep the best and any non-overlapping ones
    issues = set(orf_to_annotations.keys()) - set(orf_to_ann.keys())
    for orf in issues:
        accessions = sorted(orf_to_annotations[orf], key=lambda x: x[1])
        real_acc = []
        to_del = set()
        for index, (acc, evalue, range_best) in enumerate(accessions):
            if acc in to_del:
                continue
            real_acc.append(acc)
            # discard any later (worse e-value) hit overlapping the one just kept
            for (acc_cand, _evalue, range_candidate) in accessions[index + 1:]:
                if set(range_best).intersection(range_candidate):
                    to_del.add(acc_cand)
        orf_to_ann[orf] = real_acc
    return orf_to_ann
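
# Minimal sketch of the overlap resolution above on toy hits (hypothetical
# KO ids and coordinates):
#   hits for orf_1, already sorted by e-value:
#     ["K00001", 1e-50, range(0, 100)]
#     ["K00002", 1e-10, range(50, 150)]   # overlaps K00001 -> dropped
#     ["K00003", 1e-05, range(200, 300)]  # disjoint -> kept
#   result: orf_1 is annotated [K00001, K00003]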