-
Notifications
You must be signed in to change notification settings - Fork 0
/
pfam_scan.pl
executable file
·350 lines (248 loc) · 11 KB
/
pfam_scan.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#!/usr/bin/env perl
# $Id: pfam_scan.pl 9592 2017-02-23 09:32:06Z jaina $
use strict;
use warnings;
use File::Basename; # add by Irene Jie, 200313, read the path of local lib that need.
BEGIN{push(@INC, dirname($0))}; # add by Irene Jie
use Bio::Pfam::Scan::PfamScan;
use Getopt::Long;
my $VERSION = "1.6";
#-------------------------------------------------------------------------------
# get the user options
my ( $outfile, $e_seq, $e_dom, $b_seq, $b_dom, $dir,
$clan_overlap, $fasta, $align, $help, $as, $pfamB,
$json, $only_pfamB, $cpu, $translate );
GetOptions( 'help' => \$help,
'outfile=s' => \$outfile,
'e_seq=f' => \$e_seq,
'e_dom=f' => \$e_dom,
'b_seq=f' => \$b_seq,
'b_dom=f' => \$b_dom,
'dir=s' => \$dir,
'clan_overlap' => \$clan_overlap,
'fasta=s' => \$fasta,
'align' => \$align,
'h' => \$help,
'as' => \$as,
'pfamB' => \$pfamB,
'only_pfamB' => \$only_pfamB,
'json:s' => \$json,
'cpu=i' => \$cpu,
'translate:s' => \$translate
);
help() if $help;
help() unless ( $dir and $fasta ); # required options
my $pfamA;
if ( $only_pfamB or $pfamB ) {
die qq(FATAL: As of release 28.0, Pfam no longer produces Pfam-B. The -pfamB and -only_pfamB options are now obsolete.\n);
$pfamB=1;
}
else {
$pfamA=1;
}
my @hmmlib;
push @hmmlib, 'Pfam-A.hmm' if $pfamA;
push @hmmlib, 'Pfam-B.hmm' if $pfamB;
#-------------------------------------------------------------------------------
# check the input parameters
die qq(FATAL: must specify both "-dir" and "-fasta")
unless ( defined $dir and defined $fasta );
die qq(FATAL: can't find directory "$dir")
unless -d $dir;
die qq(FATAL: can't find file "$fasta")
unless -s $fasta;
foreach my $hmmlib ( @hmmlib ) {
die qq(FATAL: can't find "$hmmlib" and/or "$hmmlib" binaries and/or "$hmmlib.dat" file in "$dir")
unless ( -s "$dir/$hmmlib" and
-s "$dir/$hmmlib.h3f" and
-s "$dir/$hmmlib.h3i" and
-s "$dir/$hmmlib.h3m" and
-s "$dir/$hmmlib.h3p" and
-s "$dir/$hmmlib.dat" );
}
die qq(FATAL: can't use E-value or bit score threshold with Pfam-B searches; Pfam-B searches use a default cut_off of 0.001)
if ( ( $e_seq or $e_dom or $b_seq or $b_dom ) and not $pfamA );
die qq(FATAL: can't use E-value and bit score threshold together)
if ( ( $e_seq and ( $b_seq or $b_dom ) ) or
( $b_seq and ( $e_seq or $e_dom ) ) or
( $b_dom and $e_dom ) );
die qq(FATAL: output file "$outfile" already exists)
if ( $outfile and -s $outfile );
if ( $as ) {
die qq(FATAL: "-as" option only works on Pfam-A families)
unless $pfamA;
die qq(FATAL: can't find "active_site.dat" in "$dir")
unless -s "$dir/active_site.dat";
}
if ( defined $translate ) {
if ( $translate eq "" ) {
# no argument to "-translate" was given, so make "orf" the default
$translate = 'orf';
}
else {
# there was an argument to "-translate", so make sure it's valid
unless ( $translate eq "all" or $translate eq "orf" ) {
die qq(FATAL: "-translate" option accepts only "all" and "orf");
}
}
}
#-------------------------------------------------------------------------------
# build the object
my $ps = Bio::Pfam::Scan::PfamScan->new(
-e_seq => $e_seq,
-e_dom => $e_dom,
-b_seq => $b_seq,
-b_dom => $b_dom,
-dir => $dir,
-clan_overlap => $clan_overlap,
-fasta => $fasta,
-align => $align,
-as => $as,
-hmmlib => \@hmmlib,
-version => $VERSION,
-cpu => $cpu,
-translate => $translate
);
# run the search
$ps->search;
# print the results
if ( defined $json ) {
my $json_object;
eval {
require JSON;
$json_object = new JSON;
};
if ( $@ ) {
die qq(FATAL: can't load JSON module; can't write JSON-format output);
}
if ( $json eq 'pretty' ) {
$json_object->pretty( 1 ) ;
}
print $json_object->encode( $ps->results );
}
else {
$ps->write_results( $outfile, $e_seq, $e_dom, $b_seq, $b_dom );
}
exit;
#-------------------------------------------------------------------------------
sub help {
print STDERR <<EOF;
pfam_scan.pl: search a FASTA file against a library of Pfam HMMs
Usage: pfam_scan.pl -fasta <fasta_file> -dir <directory location of Pfam files>
Additonal options:
-h : show this help
-outfile <file> : output file, otherwise send to STDOUT
-clan_overlap : show overlapping hits within clan member families (applies to Pfam-A families only)
-align : show the HMM-sequence alignment for each match
-e_seq <n> : specify hmmscan evalue sequence cutoff for Pfam-A searches (default Pfam defined)
-e_dom <n> : specify hmmscan evalue domain cutoff for Pfam-A searches (default Pfam defined)
-b_seq <n> : specify hmmscan bit score sequence cutoff for Pfam-A searches (default Pfam defined)
-b_dom <n> : specify hmmscan bit score domain cutoff for Pfam-A searches (default Pfam defined)
-as : predict active site residues for Pfam-A matches
-json [pretty] : write results in JSON format. If the optional value "pretty" is given,
the JSON output will be formatted using the "pretty" option in the JSON
module
-cpu <n> : number of parallel CPU workers to use for multithreads (default all)
-translate [mode] : treat sequence as DNA and perform six-frame translation before searching. If the
optional value "mode" is given it must be either "all", to translate everything
and produce no individual ORFs, or "orf", to report only ORFs with length greater
than 20. If "-translate" is used without a "mode" value, the default is to
report ORFs (default no translation)
For more help, check the perldoc:
shell\% perldoc pfam_scan.pl
EOF
exit;
}
#-------------------------------------------------------------------------------
=head1 NAME
pfam_scan.pl -- Search protein sequences against the Pfam HMM library
=head1 SYNOPSIS
pfam_scan.pl [options] -fasta <fasta_file> -dir <Pfam_data_file_dir>
=head1 OPTIONS
=over
=item B<-dir> I<Pfam_data_file_dir>
Directory containing Pfam data files [required]
=item B<-fasta> I<fasta_file>
Filename of input file containing sequence(s) [required]
=item B<-outfile> I<output_file>
Write output to C<output_file> [default: STDOUT]
=item B<-e_seq>
Sequence E-value cut-off [default: use Pfam GA cutoff]
=item B<-e_dom>
Domain E-value cut-off [default: use Pfam GA cutoff]
=item B<-b_seq>
Sequence bits score cut-off [default: use Pfam GA cutoff]
=item B<-b_dom>
Domain bits score cut-off [default: use Pfam GA cutoff]
=item B<-clan_overlap>
Allow sequences in different clans to overlap [default: false]
=item B<-align>
Show alignment snippets in results [default: false]
=item B<-as>
Search for active sites on Pfam-A matches [default: false]
=item B<-json> [I<pretty>]
Write the results in JSON format [default: false]
=item B<-cpu>
Number of parallel CPU workers to use for multithreads [default: all]
=item B<-translate> [I<mode>]
Treat the input sequence as DNA and perform a six-frame translation before
searching, using the "translate" program from the HMMER v2.3.2 package. If the
optional value I<mode> is given, it must be either "all" or "orf": "all" means
translate in full, with stops, and produce no individual ORFs; "orf" means
translate and report only ORFs of length greater than 20. If B<translate> is
used but I<mode> is omitted, the default is to translate using the "orf"
method [default: off (no translation)]
=item B<-h>
Display help message
=back
The input must be a FASTA-format file. The C<-fasta> and C<-dir> options are
mandatory. You cannot specify both an E-value and bits score threshold.
=head1 OVERVIEW
C<pfam_scan.pl> is a script for searching one or more protein sequences against the
library of HMMs from Pfam. It requires a local copy of the Pfam data files, which
can be obtained from the Pfam FTP area:
ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/
You must also have the HMMER3 binaries installed and their locations given by your
C<PATH> environment variable. You can download the HMMER3 package at:
http://hmmer.org/download.html
=head1 OUTPUT
The output format is:
<seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan> <predicted_active_site_residues>
Example output (-as option):
O65039.1 38 93 38 93 PF08246 Inhibitor_I29 Domain 1 58 58 45.9 2.8e-12 1 No_clan
O65039.1 126 342 126 342 PF00112 Peptidase_C1 Domain 1 216 216 296.0 1.1e-88 1 CL0125 predicted_active_site[150,285,307]
Most of these values are derived from the output of I<hmmscan> (see HMMER3
documentation for details). The significance value is 1 if the bit score for a
hit is greater than or equal to the curated gathering threshold for the
matching family, 0 otherwise.
=head1 REFERENCES
Active site residues are predicted using the method described in the publication:
Mistry J., Bateman A., Finn R.D. "Predicting active site residue annotations in
the Pfam database." BMC Bioinformatics. 2007;8:298. PMID:17688688.
In pfam_scan.pl version 1.6 onwards, the hmm positions of known active site residues
are stored for each family. The residues at these hmm positions are compared
to the other sequences in the same family that do not have active site annotations
(rather than comparing the residues at the same column positions in the aligment
as stated in the above publication). This gives a significant speed increase.
Also note that step 6 in Figure 1 of the publication above is no longer implemented;
where a sequence matches multiple active patterns, all patterns (but not subpatterns)
are reported, eg: predicted_active_site[150,285,307][151, 280]
=head1 AUTHORS
Jaina Mistry ([email protected]), Rob Finn ([email protected])
=cut
=head1 COPYRIGHT
Copyright (c) 2009: Genome Research Ltd.
Authors: Jaina Mistry ([email protected]), rdf ([email protected])
This is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
=cut