-
Notifications
You must be signed in to change notification settings - Fork 1
/
domain_tool.pl
130 lines (120 loc) · 3.65 KB
/
domain_tool.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
use FileHandle;
use Getopt::Long;
use List::Util qw[min max];
#use warnings;
my $junk = "extraction.tmp";
my $usage = "
This program requires the following flags:
-seqs <file path to the FASTA file>
-hmm <file path to the HMM>
-domain <domain score cutoff>
-protein <protein score cutoff>
-full <print full sequences (default is no)>
-extend <amount to extend domain match regions by>
Optional flag:
-tmp <override the default location for the HMM results>
";
my %domains = ();
GetOptions ( 'seqs=s' => \$seqs,
'hmm=s' => \$hmm,
'domain=s' => \$domain_cutoff,
'protein=s' => \$protein_cutoff,
'tmp=s' => \$junk,
'extend=i' => \$extend,
'full' => \$full);
if (!$hmm || !$domain_cutoff || !$protein_cutoff) {die "$usage\n";}
if (!$seqs) {die "$usage\n";}
`hmmsearch -T $protein_cutoff --domtblout $junk $hmm $seqs`;
#`hmm3search -T $protein_cutoff --domtblout $junk $hmm $seqs`; #<- JCVI alias
read_domtblout($junk);
search_fasta($seqs);
#########################################################################################################################################################################
sub read_domtblout
{
my $source = shift;
open (TABLE, "$source") || die "cannot read domtblout file\n";
while ($line = <TABLE>)
{
if (!($line =~ /^#/)) # ignore the header line
{
@split_line = split(/\s+/, $line); # splits on tabs - some entries contain whitespace
if ($split_line[13] >= $domain_cutoff)
{
push(@{$domains{$split_line[0]}[0]}, $split_line[19]); # adds "env from" coordinate to array
push(@{$domains{$split_line[0]}[1]}, $split_line[20]); # adds "env to" coordinate to array
# %domains is a hash, but $domains{identifier}[0] and $domains{$identifier}[1] are both arrays
# this way, all domains from one sequence are stored with the same hash key, but can easily be processed iteratively
}
}
}
close TABLE;
}
##########################################################################################################################################################################
sub search_fasta
{
my $fasta = shift;
open (FASTA, "$fasta") || die "cannot read sequence file\n";
my $tmp_seq = '';
while ($line = <FASTA>)
{
if ($next_one == 1)
{
$next_one = 0;
$line = $skipped_line;
}
if ($line =~ /^>(.*?)\s/)
{
if ($domains{$1})
{
$identifier = $1;
$tmp_seq = '';
if ($line =~ /(.*?)\r?\n/)
{
$header = $1;
}
else
{
$header = $line;
chomp($header);
}
while ($seq_line = <FASTA>)
{
if (!($seq_line =~ />/))
{
local $/ = "\r\n";
chomp($seq_line);
local $/ = "\n";
chomp($seq_line);
$tmp_seq = ($tmp_seq . $seq_line);
}
else
{
$next_one = 1;
$skipped_line = $seq_line;
last;
}
}
for ($i = 0; $i <= $#{$domains{$identifier}[0]}; $i++)
{
$from = ($domains{$identifier}[0][$i]-1);
$from = max(0, $from-$extend);
$to = ($domains{$identifier}[1][$i]-1);
$to = min($to+$extend, length($tmp_seq));
$length = ($to - $from + 1);
$to_1 = ($to + 1);
$from_1 = ($from + 1);
# $tmp_seq =~ /.{$from}(.{$length})/;
$sub_seq = substr $tmp_seq, $from, $length;
$sub_seq =~ s/(.{1,80})/$1\n/gs;
$tmp_seq =~ s/(.{1,80})/$1\n/gs;
if ($tmp_seq && $full) {chomp($header); print("$header\n$tmp_seq\n\n");}
elsif ($sub_seq && $length) {print("$header\/$from_1\-$to_1\n$sub_seq\n\n");}
}
}
else
{
next;
}
}
}
}