-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgffcmp_flt
executable file
·152 lines (142 loc) · 4.57 KB
/
gffcmp_flt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/perl
use strict;
use Getopt::Std;
my $usage = q/Usage:
gffcmp_flt [-n <min_samples>] [-M] [-x xloc.tab] [-c '<class_codes>'] [-S] \
combined.gtf
Filter a combined.gtf file produced by gffcompare by various criteria.
By default tranfrags with repeats (class 'r') or contained (class code 'c')
are discarded.
Options:
-n <min_samples> transcript has to be present in at least
this number of samples (num_samples attribute)
-M never filter out reference matching transcripts,
i.e. class_code "=" (to be used with -n option)
-c '<class_codes>' transfrags must have one of the given class codes
(e.g. -c jmnkou to look for novel transcripts)
-S do not discard single-exon transcripts
-x <xloc.out.tab> write a table with gathered XLOC information
xlocID, chr, strand, start, end, numTs, numRefTs,
lstRefOvlGeneNames, codes_summary, lstRefTs
/;
umask 0002;
getopts('SMho:x:n:c:') || die($usage."\n");
die($usage."\n") if $Getopt::Std::opt_h;
my $outfile=$Getopt::Std::opt_o;
my $codes=$Getopt::Std::opt_c;
my $snum=$Getopt::Std::opt_n;
my $xloc=$Getopt::Std::opt_x;
my $kS=$Getopt::Std::opt_S;
my $kM=$Getopt::Std::opt_M;
if ($outfile) {
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
select(OUTF);
}
# 0 1 2 3 4 5 6 7
my %xh; # xlocID => [chr, strand, start, end, num_tids, \%refids, \%refgenes, \%codes]
# the %codes hash is simply code=>count, $refids, %refgenes are just string sets
# --
my ($maxnum, $tcount, $ttotal);
my ($t, @td, $ne);
while (<>) {
next unless m/^chr/;
my ($tid)=(m/transcript_id "([^"]+)/);
next unless $tid;
if ($tid ne $t) { #new transcript, flush the previous one
#this MUST be a transcript line
tflush() if $t;
@td=();$ne=0;
$t=$tid;
$ttotal++;
if ($xloc) {
my @t=split(/\t/);
my ($x)=(m/\b(XLOC_\d+)/);
my ($r)=(m/\bcmp_ref "([^"]+)/);
my ($g)=(m/\bgene_name "([^"]+)/); #overlapping, otherwise it's cmp_ref_gene
my ($c)=(m/\bclass_code "([^"]+)/);
my $xd=$xh{$x};
if (!$xd) {
$xd=[$t[0], $t[6], $t[3], $t[4], 0, {}, {}, {}];
$xh{$x}=$xd;
}
$$xd[4]++;
die("Error at line: ${_}Locus $x was on ".$$xd[0]."\n") if $$xd[0] ne $t[0];
$$xd[1] = $t[6] if $t[6] ne '.' && $$xd[1] eq '.';
$$xd[2]=$t[3] if $t[3]<$$xd[2];
$$xd[3]=$t[4] if $t[4]>$$xd[3];
my ($rh, $gh, $ch)=@$xd[5..7];
$$ch{$c}++ if $c;
$$rh{$r}=1 if $r;
$$gh{$g}=1 if $g;
}
} else { #exon
#if (m/\texon\t/) {
$ne++;
s/\s*exon_number "\d+";//;
s/\s*gene_id "[^"]+";//;
}
push(@td, $_);
}
tflush() if $t;
my @class=('=','c', 'k', 'm', 'n', 'j', 'e', 'o', 's', 'x', 'i', 'y', 'p', 'r', 'u');
if ($xloc) {
open(FX, ">$xloc") || die("Error creating file $xloc !\n");
print FX join("\t", qw(xloc chr strand start end numTx numrefTx genes codes refTxs))."\n";
my @xk=sort(keys(%xh));
foreach my $k (@xk) {
my $xd=$xh{$k};
my @rs=keys(%{$$xd[5]});
my @gs=keys(%{$$xd[6]});
my $ch=$$xd[7];
my $ct=''; #table codes here
foreach my $c (@class) {
my $n=$$ch{$c};
next unless $n;
$ct.=($n>1 ? $n : '').$c;
}
$ct='.' unless $ct;
print FX join("\t", $k, @$xd[0..4], @rs>0 ? scalar(@rs):'.' ,
@gs>0? join(',', @gs) : '.', $ct, @rs>0 ? join(',', @rs) : '.')."\n";
}
close(FX);
}
print STDERR "Memory usage ".qx{ fgrep VmRSS /proc/$$/status };
print STDERR "Wrote $tcount transcripts out of $ttotal\n";
print STDERR "Maximum number of samples found for a transcript: $maxnum\n" if $maxnum;
# -- epilogue
if ($outfile) {
select(STDOUT);
close(OUTF);
}
#************ Subroutines **************
sub tflush {
if ($kM) {
my ($c)=($td[0]=~m/class_code "([^"]+)"/);
if ($c eq '=') {
if ($td[0] !~ m/\bnum_exons\b/) {
$td[0] =~ s/(num_samples "\d+";)/$1 num_exons "$ne";/;
}
$tcount++;
print join('',@td);
return; # print '=' codes if -M
}
}
return if !$kS && $ne<2; #skip single-exon unless -S
if ($snum) {
my ($n)=($td[0]=~m/num_samples "(\d+)"/);
return if $n<$snum;
$maxnum=$n if $n>$maxnum;
}
if ($codes) {
my ($c)=($td[0]=~m/class_code "([^"]+)"/);
return if index($codes, $c)<0;
} else { #always discard 'r'
my ($c)=($td[0]=~m/class_code "([^"]+)"/);
return if $c eq 'r';
}
if ($td[0] !~ m/\bnum_exons\b/) {
$td[0] =~ s/(num_samples "\d+";)/$1 num_exons "$ne";/;
}
$tcount++;
print join('',@td);
}