gffmanip

#!/usr/bin/perl
use strict;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Data::Dumper;
use Getopt::Long;
Getopt::Long::Configure ("gnu_getopt", "no_ignore_case");
# Help text printed on option errors. Inside q/.../ every literal slash has
# to be written as \/ . The option descriptions are kept in sync with the
# GetOptions() specification and with the parsing code (e.g. the -a
# conditions are split on ';' for both the OR and the AND (-A) mode).
my $usage = q/Usage:
 gffmanip [-v] [-o <outfile>] [-i [<attr>:]<tids.lst>] [--refs <refs.lst>] \
 [-K [-E]] [-t <track>] [-f <feature>,..] [-x <no-feature>[,..]] [-r <region> \
 [-R][-X]] [-a [feat:]<attr>=<value>[;..]] [-A] [-k <attr1>[,..]] [-C|--cds]\
 [--table <attrlist..>] [-m <IDmap.tab>] [-d <regdata>] \
 [[-y|-Y] <CDSdata.tab>] [-T] <gff\/gtf-input-files..>

Filtering options:
 -Q assume input is sorted or at least grouped by genomic sequence
 -S sort GFF records by location
 -C output only those transcripts having CDS features (protein coding)
 --refs : only let pass records that are found on reference sequences
          (chromosomes) listed in the given file refs.lst
 --cds like -C but output only the CDS features of the coding transcripts
    (discarding exon features)
 -r show only transcripts overlapping any of the region intervals given in 
    <region> which has the format:
       [<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
 -X for -r option, only print records whose exon\/CDS segments overlap the given
    interval(s) (i.e. intron-only overlaps are not considered)
 -R for -r option, only print transcripts fully contained in any of the given 
    range interval(s)
 -i only shows transcripts whose IDs match any of the entries in the
    given file <ids.lst>; another attribute instead of ID can be used for 
    matching the list values if given as a <attr>: prefix to the file path;
    special "geneID:" prefix will apply gene ID filtering
 -a only shows transcripts which have the <attr> attribute with the value 
    <value>; multiple attribute\/value pairs can be given (semicolon delimited)
    and a record is printed if there is at least one attribute match (OR list);
    a <feature>: prefix can be used to specify which feature should be targeted
    (default: transcripts are targeted; the other option is "gene:")
 -A for the -a option, when multiple <attr>=<value> conditions are given 
    (semicolon delimited), all of them have to be present and matching (AND list)
 -v invert the filters above(like grep's -v): print records that do NOT match
    any of the above transcript filtering options

 -f output only the specified GFF features, given as a comma-delimited
    list of strings (e.g. -f 'mRNA,ncRNA,exon,CDS') (doesn't work with -v)
 -x discard\/ignore the specified GFF features (inverse of -f, ignores -v)
 -F output only transcript-like records having exon or CDS features
    (doesn't work with -v)

Data modification options:
 -m replace contig\/chromosome IDs with different IDs, as specified in the given
    ID mapping table (a two column table mapping the old ID to the new ID)
    NOTE: renaming takes place when records are printed, *after* all the
    other filters have been applied
 -y assign\/modify the CDS and transcription strand of transcripts 
    found in the given tab delimited file of this format:
      transcriptID	new_strand	CDSinfo
    ..where CDSinfo can be either CDSstart:CDSstop or a comma delimited
    list of CDS features, e.g.: 70139455-70139671,70140524-70140552
 -Y same as -y, but only add CDS to records not having CDS info, never 
    change existing CDS!
Genomic deletion processing:
 -d contig\/chromosome region(s) deletion: discard the transcripts overlapping
    the given genomic region(s), shifting the coordinates of all the 
    features downstream from the deleted region(s). Example deletion:
    -d NC_029267.1.R:18816715-18860512

Output options:
 --nbg discard "bare" gene records that do not parent any other features
 --tfix attempt to fix malformed transcripts with whole-transcript exons
 -j <outfname.tab> - write junction info to <outfname.tab>
 -K keep all GFF\/GTF attributes, preserve the input format
 -E for -K, discard exon\/CDS attributes (only keep transcript\/gene attrs)
 -Z merge close exons into a single exon (for intron size<4)
 -M adjust transcript boundaries to match min\/max coordinates of their exons
 -G adjust gene boundaries to match min\/max coordinates of their transcripts
 -k keep only the specified non-core GFF attributes in the output records
    (expects a comma delimited list of attribute names)
 -P preserve comments (lines starting with #)
 -T print GTF format (default is GFF3)
 -U decode url-encoded characters in attribute values and use safe replacements
 --table : instead of GTF\/GFF, output a tab delimited file with the values of the
    specified attribute(s) for each record; special pseudo-attributes (prefixed
    by @) are recognized: 
     @chr, @start, @end, @strand, @numexons, @exons, @cds, @covlen, @cdslen
    The prefix '^' can be used to specify parent attribute names to be printed
    for the current record.
    Hint: use with -F option to only show transcript-like records
 -t replace the string in the 2nd column with the given <track> text
 -V (verbose): show warnings or other potentially useful info
/;
umask 0002;
# Keep the full command line: GetOptions() removes the options from @ARGV
my @cmdline=($0, @ARGV);
# Option values. These are file-scoped lexicals that the subroutines below
# read directly, so every name declared here must stay available.
my ($outfile, $verbose, $keepCmts, $inSorted, $sort_gff, $Udecode, $CDSdatafile, 
  $CDSaddonly, $trOnly, $fltinvert, $printGTF, $out_track, $adjGeneSpan, $adjTSpan,
  $only_ifCDS, $discard_exons, $discardBareGenes, $idfile, $avflt, $featlist, $xflist, 
  $oattrlist, $minIntron, $keepAll, $discardExonAttrs, $avflt_AND, $check_range, 
  $strict_ovl, $exon_ovl, $del_range, $chrmapping, $tab_attrs, $exlen2score, 
  $exlen2attr, $minEndExon, $tFix, $jOut, $frefs
  );
GetOptions('o:s' => \$outfile, 'V'=>\$verbose, 'P'=>\$keepCmts, 'Q'=>\$inSorted, 'S'=>\$sort_gff, 
   'U'=>\$Udecode,'y:s'=>\$CDSdatafile, 'Y:s'=>\$CDSaddonly, 'F'=> \$trOnly, 'f:s'=>\$featlist, 
   'v'=>\$fltinvert, 'T'=>\$printGTF, 't:s'=>\$out_track, 'G'=>\$adjGeneSpan, 'M'=>\$adjTSpan,
   'C'=>\$only_ifCDS, 'cds'=>\$discard_exons, 'i:s'=>\$idfile, 'refs:s'=>\$frefs, 'a:s'=>\$avflt, 
   'x:s'=>\$xflist, 'k:s'=>\$oattrlist, 'Z'=>\$minIntron, 'K'=>\$keepAll, 'E'=>\$discardExonAttrs,
   'A'=>\$avflt_AND, 'r:s'=>\$check_range, 'R'=>\$strict_ovl, 'X'=>\$exon_ovl, 'tfix'=>\$tFix,
   'd:s'=>\$del_range, 'm:s'=>\$chrmapping, 'table:s'=>\$tab_attrs, 'nbg'=>\$discardBareGenes,
   'exonlen2score'=>\$exlen2score, 'exonlen2attr'=>\$exlen2attr, 'trim-exons:6'=>\$minEndExon, 
   'j:s'=>\$jOut
   ) || die($usage."\n");

die("${usage}Error: no input file(s) given (use '-' for stdin)\n") unless @ARGV>0;

print STDERR "Command line: ".join(' ',@cmdline)."\n" if $verbose;
die("Error: use only one of -y or -Y!\n") if ($CDSdatafile && $CDSaddonly);
# -Y uses the same data file slot as -y; $CDSaddonly stays set as the mode flag
$CDSdatafile=$CDSaddonly if ($CDSaddonly);
my $geneflt=0; #if specific gene filtering was used with -i or -a
my %ignoredFeatures;
@ignoredFeatures{qw(intron region contig scaffold)}=(); #let's ignore these useless features
my %oattrs; #hash with attrs to print
@oattrs{qw(ID transcript_id Parent gene_id gene geneID geneName gene_name)}=();
my @tFixAttrs=qw(Name product protein_id Note);
if ($tFix) {
  @oattrs{@tFixAttrs}=();
}
my %fltrefs; # hash with chr IDs from $frefs file
my %attrflt; # attribute=>value filter
my %idflt;   # hash with IDs to keep
my %atab;    # hash with attribute names whose values will be listed as tab delimited
my %chrmap; #if -m option was provided, maps chrID => new_chrID
my %jData; # 'chr:start~end:strand => [tID1, tID2,..]
my @atab_cols; 
if ($outfile) {
  # 3-argument open: the file name is never parsed for mode or pipe characters.
  # '-' keeps meaning "standard output", as it did with the 2-argument form.
  if ($outfile eq '-') {
    open(OUTF, '>&', \*STDOUT) || die("Error creating output file $outfile\n");
  }
  else {
    open(OUTF, '>', $outfile) || die("Error creating output file $outfile\n");
  }
  # all plain print() calls from here on go to the output file
  select(OUTF);
}
my %assignCDS; # tid =>  [ strand, CDSlo, CDShi, [[cdsegstart, cdsegend],..]]
               #              0      1      2            3
# Load the -y/-Y table: transcriptID <tab> strand <tab> CDSinfo, where CDSinfo
# is either "lo:hi" or a comma delimited list of "start-end" CDS segments.
if ($CDSdatafile) {
 # 3-argument open with a lexical handle; '-' still reads the table from stdin
 my @fcds_src=($CDSdatafile eq '-') ? ('<&', \*STDIN) : ('<', $CDSdatafile);
 open(my $fcds, $fcds_src[0], $fcds_src[1])
    || die("Error opening CDS data file $CDSdatafile\n");
 while (<$fcds>) {
   next if m/^#/;
   chomp;
   next unless m/\S/; # tolerate blank lines (e.g. at the end of the file)
   my ($tid, $strand, $cdsdata)=split(/\t/);
   die ("Error: invalid format encountered for CDS data:$_\n")
     unless ($cdsdata && ($strand eq '-' || $strand eq '+'));
   my ($cds_lo, $cds_hi);
   my @cds;
   if ($cdsdata=~m/^(\d+)\:(\d+)$/) {
      # only the CDS span is given; the segments are derived from the exons later
      ($cds_lo, $cds_hi)=($1, $2);
   }
   else {
     # explicit segment list: order it by start coordinate so that the span
     # below (and the later per-segment comparisons) do not depend on the
     # order used in the input file
     @cds=sort { $a->[0] <=> $b->[0] }
          map { [(split(/\-/))] } (split(/\,/, $cdsdata));
     ($cds_lo, $cds_hi)=($cds[0]->[0], $cds[-1]->[1]);
   }
   if (exists($assignCDS{$tid})) {
     print STDERR "Warning: multiple CDS (re)assignments given for $tid, only the 1st is used.\n";
     next;
   }
   $assignCDS{$tid}=[$strand, $cds_lo, $cds_hi, [@cds]];
 }
 close($fcds);
}

# -- option post-processing and consistency checks

# --cds implies -C
if ($discard_exons) {
  $only_ifCDS=1;
}
# the feature-level filters cannot be combined with -v
if ($fltinvert) {
  die("Sorry, -v option does not work with -f !\n") if $featlist;
  die("Sorry, -v option does not work with -x !\n") if $xflist;
  die("Sorry, -v option does not work with -F !\n") if $trOnly;
}

# turn the -Z flag into the exon distance threshold used by cleanupExons()
if ($minIntron) { $minIntron=5; }
           else { $minIntron=1; }
# "-k a" and "-k all" are the same as -K
$keepAll=1 if grep { lc($oattrlist) eq $_ } qw(a all);
die("Error: -E option requires -K !\n") if ($discardExonAttrs && !$keepAll);
my $avflt_feature; #feature targeted by attr=value,.. filter
die("Error: -v option cannot be used with -m or -d options!\n")
   if ($fltinvert && ($del_range || $chrmapping));

# -r : region filter (sequence name, strand and sorted intervals)
my ($flt_chr, $flt_strand);
my @flt_intv;
($flt_chr, $flt_strand)=parseLocations($check_range, \@flt_intv) if $check_range;

# -d : region(s) to delete; $del_total is the total number of deleted bases
my ($del_chr, $del_strand, $del_total);
my @del_intv;
if ($del_range) {
  ($del_chr, $del_strand)=parseLocations($del_range, \@del_intv);
  die("Error: invalid deletion location provided ($del_range)\n") if @del_intv==0;
  $del_total=0;
  $del_total += $_->[1] - $_->[0] + 1 foreach @del_intv;
}

# when all attributes are kept there is no attribute list to apply
$oattrlist='' if $keepAll;
# --table : remember the requested columns, both as a set and in order
if ($tab_attrs) {
  @atab_cols=split(/\,/, $tab_attrs);
  $atab{$_}=undef foreach @atab_cols;
}
my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id

# -i : load the list of IDs (or attribute values) to filter by
if ($idfile) {
  unless (-f $idfile) {
     # not an existing file, so expect "<attr>:<path>"; split at the first
     # ':' only, so that the path itself may contain colons
     my ($attr, $path)=split(/\:/, $idfile, 2);
     ($idfile, $idfileattr)=($path, $attr) if $path;
     if (defined($idfileattr) && $idfileattr eq 'geneID') {
       # special prefix: filter by gene ID
       $idfileattr='.';
       $geneflt=1;
     }
  }
  die("Error: no such id list file: $idfile\n") unless $idfile && -f $idfile;
  open(my $idf, '<', $idfile) || die("Error: cannot open $idfile $!\n");
  while (<$idf>) {
    # the ID is the first non-blank token of each line
    my ($id)=(m/(\S+)/);
    $idflt{$id}=1 if defined($id) && length($id)>0;
  }
  close($idf);
}

# --refs : load the list of reference sequence names to keep
if ($frefs) {
  die("Error: no such ref id list file: $frefs\n") unless -f $frefs;
  open(my $reff, '<', $frefs) || die("Error: cannot open $frefs $!\n");
  while (<$reff>) {
    # only a token starting in the first column counts as a sequence name
    my ($id)=(m/^(\S+)/);
    $fltrefs{$id}=1 if defined($id) && length($id)>0;
  }
  close($reff);
}

# -a : parse the "[feat:]attr=value[;..]" conditions into %attrflt
if ($avflt) {
  my @avl=split(/\;/, $avflt);
  foreach my $avpair (@avl) {
    my ($attr, $value)=split(/\s*=\s*/,$avpair,2);
    # an optional "<feature>:" prefix selects the targeted feature type;
    # only the first prefix found is used
    if ($attr=~s/^(\w+)\:(\w+)/$2/) {
      $avflt_feature=$1 unless $avflt_feature;
    }
    # strip surrounding quotes and spaces from the value
    $value=~s/[" ]+$//;$value=~s/^[" ]+//;
    push(@{$attrflt{$attr}},$value);
    }
  $geneflt=1 if $avflt_feature eq 'gene';
  }
my %featflt; # feature list filter -- if $featlist was given
if ($featlist)  {
 #add to the list of features to be kept
 my @fl=split(/\,/, $featlist);
 @featflt{@fl}=();
}
if ($xflist)  {
 #add to the list of features to be ignored (discarded)
 my @fl=split(/\,/, $xflist);
 @ignoredFeatures{@fl}=();
}
if ($oattrlist) {
  #add to the list of attributes to be kept
  my @al=split(/\,/, $oattrlist);
  @oattrs{@al}=();
}

# -m : load the old ID => new ID table for contig/chromosome renaming
if ($chrmapping) {
 # 3-argument open with a lexical handle; '-' still reads the table from stdin
 my @cmap_src=($chrmapping eq '-') ? ('<&', \*STDIN) : ('<', $chrmapping);
 open(my $cmapf, $cmap_src[0], $cmap_src[1])
    || die("Error: cannot open file $chrmapping \n");
 while (<$cmapf>) {
   chomp;
   next if length($_)<2 || m/^#/;
   # columns may be separated by whitespace, ',' or '='
   my @m=split(/[\s\,\=]+/);
   next if @m<2;
   $chrmap{$m[0]}=$m[1];
 }
 close($cmapf);
}

##--- gene tracking:
my %genes; #gene ID => [$gffrecs_Data, [$t_ID, ...], $filtered_out)
           #                0               1           2

##--- keeping track of all gene and transcript records :
my %gffrecs; # ID => [ chr, strand, feat_type,  \%attrs, fstart, fend, [@exons], [@cds], track,  geneID, fscore, xflags ]
          #             0      1        2           3      4       5       6        7        8      9       10      11
## xflags is a bit field: bit 1 = exonsProcessed; bit 2 = only CDS segments given, bit 3 = createdByExon

my @gff_IDs; #IDs of each primary records (transcripts and genes) as they are encountered
my $input_GFF3; #set to 1 if input was detected as GFF3
# Load all the input; in -Q mode loadGff() already processed every sequence
# except the last one, whose records are still in %gffrecs at this point.
my $lChr=loadGff(\%gffrecs);
if ($inSorted && $lChr && keys(%gffrecs)>0) {
       print STDERR "contig $lChr loaded, now processing it..\n";
}
my $mtgs; #multi-transcript genes
if ($sort_gff) {
   my @sorted_recs=sort sortByLoc keys(%gffrecs);
   $mtgs=processGffRecs(\%gffrecs, \@sorted_recs);
}
 else {
   # keep the input order
   $mtgs=processGffRecs(\%gffrecs, \@gff_IDs);
}

print STDERR "  $mtgs multi-transcript genes found.\n" if $mtgs>0;
# -j : write the junction table, one "chr start end strand IDs" row per junction
if ($jOut) {
  # 3-argument open with a lexical handle; '-' still means standard output
  my @jout_dst=($jOut eq '-') ? ('>&', \*STDOUT) : ('>', $jOut);
  open(my $jfh, $jout_dst[0], $jout_dst[1]) || die("Error creating file $jOut\n");
  my @sorted_js = sort sortJByLoc keys(%jData);
  foreach my $k (@sorted_js) {
    my $jd=$jData{$k};
    my @loc=split(/[\:~]/, $k);
    my @ts=sort { $a cmp $b } @$jd;
    print $jfh join("\t",@loc)."\t".join(',',@ts)."\n";
  }
  # buffered write errors (e.g. disk full) only show up when closing
  close($jfh) || die("Error closing file $jOut: $!\n");
}

# --
if ($outfile) {
 select(STDOUT);
 close(OUTF) || die("Error closing output file $outfile: $!\n");
}

#************ Subroutines **************
sub sortByLoc {
 # sort() comparator for %gffrecs keys: orders the records by sequence name
 # (string comparison), then by start and finally by end coordinate
 my ($ra, $rb)=@gffrecs{$a, $b};
 return ($ra->[0] cmp $rb->[0]) if $ra->[0] ne $rb->[0];
 return ($ra->[4] <=> $rb->[4]) if $ra->[4] != $rb->[4];
 return ($ra->[5] <=> $rb->[5]);
}

sub sortJByLoc {
 # sort() comparator for junction keys of the form chr:start~end:strand;
 # orders by sequence name, start, end and finally by the strand character
 my ($chrA, $startA, $endA, $strandA)=split(/[\:~]/,$a);
 my ($chrB, $startB, $endB, $strandB)=split(/[\:~]/,$b);
 return ($chrA cmp $chrB)
     || ($startA <=> $startB)
     || ($endA <=> $endB)
     || (ord($strandA) <=> ord($strandB));
}

sub parseLocations {
  # Parses a location string: [<strand>]<chr>[<strand>][:<intervals>[<strand>]]
  #   $s_range - the location text; the strand (+ or -) can be a prefix or
  #              a suffix of the sequence name, or a suffix of the interval list
  #   $r_intv  - ref to an array receiving the [start, end] intervals, sorted
  #              by start (left untouched if no intervals were given)
  # Returns ($chr, $strand); the strand is '' for '+', undef if none was given
  my ($s_range, $r_intv)=@_;
  my ($l_chr, $rlst)=split(/\:/,$s_range,2);
  my $l_strand;
  if ($l_chr=~s/^([\-\+])//) { #strand given as a prefix
     $l_strand=$1;
  }
  elsif ($l_chr=~s/([\-\+])\z//) { #strand as a suffix of the sequence name
     $l_strand=$1;
  }
  elsif ($rlst && $rlst=~s/([\-\+])\z//) { #strand as a suffix of the intervals
     $l_strand=$1;
  }
  $l_strand='' if $l_strand eq '+';
  if ($rlst) {
    my @parsed;
    foreach my $spec (split(/[\,\;\s]+/,$rlst)) {
      my @ends=split(/[\-\.]+/, $spec);
      #open-ended interval: extend it to a very large coordinate
      $ends[1]=1999999999 if length($ends[1])==0;
      #make sure start<=end
      @ends[0,1]=@ends[1,0] if $ends[0]>$ends[1];
      push(@parsed, \@ends);
    }
    @$r_intv = sort { $a->[0] <=> $b->[0] } @parsed;
  }
  if ($verbose) {
    print STDERR "parsed regions: ${l_strand}$l_chr: ".
      join(', ', (map { $_->[0].'-'.$_->[1] } @$r_intv) )."\n";
  }
  return ($l_chr, $l_strand);
}


sub checkOvlSegs {
 # Tests if the interval $lo-$hi overlaps any of the segments in @$rx
 #   $lo, $hi - interval coordinates (inclusive)
 #   $rx      - ref to a list of [start, end, ..] segments, which must be
 #              SORTED by start coordinate
 # Returns 1 if an overlap was found, 0 otherwise (also for an empty list)
 my ($lo, $hi, $rx)=@_;
 # an empty list cannot overlap anything; $$rx[-1] on an empty array would
 # also be a fatal error, so this has to be tested first
 return 0 unless ($rx && @$rx);
 return 0 if ($lo>$$rx[-1]->[1] || $hi<$$rx[0]->[0]); # not overlapping the whole list of segs
 foreach my $x (@$rx) {
   return 1 if ($lo<=$$x[1] && $hi>=$$x[0]);
   return 0 if $hi<$$x[0]; # the following segments start even further away
   }
 return 0; # always return a defined value
}

sub checkOvlDels {
 # Checks a segment against the deleted regions and shifts it if needed
 #   $rseg - ref to a coordinate pair [$start, $end]; the values are updated
 #           in place when deletions are found upstream of the segment
 #   $rx   - ref to the list of deleted [start, end] intervals, SORTED
 # Returns: 1 if the segment overlaps a deletion (segment left unchanged)
 #          2 if the segment was shifted by the preceding deletions
 #          0 if the segment is not affected
 my ($rseg, $rx)=@_;
 my ($s_lo, $s_hi)=($$rseg[0], $$rseg[1]);
 return 0 if ($s_hi<$$rx[0]->[0]); # segment ends before any deletion
 my $shift=0; #total length of the deletions found before the segment
 foreach my $del (@$rx) {
   return 1 unless ($s_lo>$del->[1] || $s_hi<$del->[0]); #overlap with $del
   last if ($del->[0]>$s_hi); #cannot overlap anymore
   $shift+=$del->[1]-$del->[0]+1;
 }
 return 0 unless $shift;
 $rseg->[0]-=$shift;
 $rseg->[1]-=$shift;
 return 2;
}

sub checkWithinExons {
 # Tests if the interval $lo-$hi is fully contained in one of the @$rx segments
 #   $lo, $hi - interval coordinates (inclusive)
 #   $rx      - ref to a list of [start, end, ..] exon segments, which must
 #              be SORTED by start coordinate
 # Returns 1 if a containing segment was found, 0 otherwise (also for an
 # empty list)
 my ($lo, $hi, $rx)=@_;
 # nothing can be contained in an empty chain; $$rx[-1] on an empty array
 # would also be a fatal error, so this has to be tested first
 return 0 unless ($rx && @$rx);
 return 0 if ($lo>$$rx[-1]->[1] || $hi<$$rx[0]->[0]); # not overlapping the whole exon chain
 foreach my $x (@$rx) {
   return 1 if ($lo>=$$x[0] && $hi<=$$x[1]);
   return 0 if $hi<$$x[0]; # the following segments start even further away
 }
 # reached when the interval overlaps the last segment(s) without being
 # contained; return a defined value instead of falling off the loop
 return 0;
}


sub cleanupExons {
 # Builds clean, sorted exon and CDS chains for a transcript
 #   $er  - ref to the input list of exon segments
 #   $rex - ref to the array receiving the cleaned exon chain
 #   $cr  - ref to the input list of CDS (and codon) segments
 #   $rcd - ref to the array receiving the cleaned CDS chain
 #   $tid - transcript ID, only used in the verbose messages
 # segment format: [start, end, score, phase, feature_name, attributes]
 # The exon chain is the sorted union of the exon and CDS segments (in case
 # only UTRs were given instead of full exons), with exons closer than
 # $minIntron merged; in the CDS chain only adjacent/overlapping segments
 # involving a start/stop codon feature are merged.
 my ($er, $rex, $cr, $rcd, $tid)=@_;
 # Work on copies of the segments: the merging below changes segment ends in
 # place, and a CDS segment shared between the two chains (or with the input
 # lists) would otherwise be stretched when it is merged with an adjacent
 # UTR in the exon chain.
 my @exon_chain=map { [@$_] } (@$er, @$cr);
 my @cds_chain=map { [@$_] } @$cr;
 @$rex=sort { $a->[0] <=> $b->[0] } @exon_chain;
 @$rcd=sort { $a->[0] <=> $b->[0] } @cds_chain;
 my $i=0;
 while ($i+1<@$rex) {
   # $$rex[$i] vs $$rex[$i+1]
   my $exdist=$$rex[$i+1]->[0]-$$rex[$i]->[1]; #inter-exon distance
   if ($exdist<=$minIntron) { #intron too small or overlapping exons!
       if ($verbose && $$rex[$i]->[4] eq $$rex[$i+1]->[4]) { #same type of exon
          my $CDstatus=(@$cr>0) ?' (CDS unchanged)' : '';
          $exdist--;
          if ($exdist<0) {
             print STDERR "Warning: merging overlapping ($exdist) exons $$rex[$i]->[0]-$$rex[$i]->[1], ".
               "$$rex[$i+1]->[0]-$$rex[$i+1]->[1] of ${tid}$CDstatus\n";
          } else {
             print STDERR "Warning: merging too close ($exdist) exons $$rex[$i]->[0]-$$rex[$i]->[1], ".
               "$$rex[$i+1]->[0]-$$rex[$i+1]->[1] of ${tid}$CDstatus\n";
          }
       }
       #extend the current exon and drop the next one
       $$rex[$i]->[1]=$$rex[$i+1]->[1] if $$rex[$i+1]->[1]>$$rex[$i]->[1];
       splice(@$rex, $i+1, 1);
   }
   else { ++$i }
 }
 $i=0;
 while ($i+1<@$rcd) { #only adjacent/overlapping CDS and codons will be merged
   # $$rcd[$i] vs $$rcd[$i+1]
   if ($$rcd[$i+1]->[0]-$$rcd[$i]->[1]<2 &&
       ($$rcd[$i+1]->[4]=~m/codon/i  || $$rcd[$i]->[4]=~m/codon/i ) ) { 
       #adjacent or overlapping CDS with start/stop_codon feature
       if ($verbose) {
         print STDERR "Warning: merging CDS features $$rcd[$i]->[4]($$rcd[$i]->[0]-$$rcd[$i]->[1]) and ".
           "$$rcd[$i+1]->[4]($$rcd[$i+1]->[0]-$$rcd[$i+1]->[1]) of $tid\n";
       }
       $$rcd[$i]->[1]=$$rcd[$i+1]->[1] if $$rcd[$i+1]->[1]>$$rcd[$i]->[1];
       splice(@$rcd, $i+1, 1);
   }
   else { ++$i }
 }
}

#
sub rmTinyIntrons {
 # Merges, in place, consecutive exons of @$rex that overlap or whose start
 # is less than 4 bases after the end of the previous exon
 #   $rex - ref to a list of [start, end, ..] exons sorted by start
 my ($rex)=@_;
 my $cur=0;
 while ($cur<$#$rex) {
   my ($left, $right)=@$rex[$cur, $cur+1];
   if ($right->[0]-$left->[1]>=4) { #regular intron, move on
       ++$cur;
       next;
   }
   #intron too small or overlapping exons: absorb the next exon
   $left->[1]=$right->[1] if $right->[1]>$left->[1];
   splice(@$rex, $cur+1, 1);
 }
}


sub loadGff {
 # Reads all the GFF3/GTF input files named in @ARGV and stores the gene and
 # transcript records found, with their exon/CDS segments, in the given hash.
 #   $recs - ref to the hash of records to populate (%gffrecs), see the
 #           record layout documented at the %gffrecs declaration
 # Also updates these file-level variables: %genes (gene => transcripts
 # tracking), @gff_IDs (record IDs in input order, plus the comment lines if
 # $keepCmts is set and $sort_gff is not) and $input_GFF3.
 # If $inSorted is set, every time the sequence name (1st column) changes the
 # records loaded so far are passed to processGffRecs() and then discarded.
 # Returns the name of the last sequence encountered.
 #assumes the input file names are in @ARGV
 my ($recs)=@_; #hash of record IDs to populate (%gffrecs)
 my $lastChr;
 foreach my $fname (@ARGV) {
   my $isGFF3;
   $input_GFF3=0;
   my ($ifh, $fclose);
   # '-' means stdin, *.gz (also .gzi, .gzip) files are decompressed on the fly
   if ($fname eq '-') {
     $ifh=\*STDIN;
   }
   elsif ($fname =~m/\.gzi?p?$/) {
     # NOTE(review): $fclose is not set here, so this handle is not closed
     # explicitly at the end of the loop
     $ifh=new IO::Uncompress::Gunzip $fname 
            or die "Error opening gzipped file $fname : $GunzipError\n";
   }
   else {
     open($ifh, '<', $fname) || die "Error opening $fname : $!\n";
     $fclose=1;
   }
   while (<$ifh>) {
     if (m/^\s*#/) {
        # comment lines are kept in the ID list (with their newline) so they
        # can be printed in place later
        push(@gff_IDs, $_) if $keepCmts && !$sort_gff;
        next;
     }
     chomp;
     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $atr)=split(/\t/);
     next if ($frefs && !exists($fltrefs{$chr}));
     if ($chr ne $lastChr) {
       # sequence change: in -Q mode process and flush the records loaded so far
       # (the global %gffrecs is used here; the caller visible in this file
       # passes a reference to the same hash as $recs)
       if ($inSorted && $lastChr && keys(%gffrecs)>0) {
         print STDERR "contig $lastChr loaded, now processing it..\n";
         my $mtg;
         if ($sort_gff) {
           my @sorted_recs=sort sortByLoc keys(%gffrecs);
           $mtg=processGffRecs(\%gffrecs, \@sorted_recs);
         }
         else {
           $mtg=processGffRecs(\%gffrecs, \@gff_IDs);
         }
         print STDERR "  $mtg multi-transcript genes found.\n" if $mtg>0;
        %genes=();
        %gffrecs=();
        @gff_IDs=();
       }
       $lastChr=$chr;
     }
     next if $trOnly && $f eq 'locus';
     my $line=$_;
     # skip lines without a valid start coordinate or without attributes
     next unless $fstart>0 && $atr;
     # feature filters: -f (keep only these), then the ignored feature list
     # (built-in names plus the -x list), matched on the lowercased name
     my $reqfeat=exists($featflt{$f});
     next if $featlist && !$reqfeat;
     next if !$reqfeat && exists($ignoredFeatures{lc($f)});
     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
     $track=$out_track if $out_track;
     # classify the subfeatures: exon and UTR features go to the exon list,
     # CDS and start/stop codon features go to the CDS list
     my $xf;
     my ($isExon, $isCDS);
     if ($f=~m/exon/i || $f=~m/utr/i) {
        $xf='exon';
        $isExon=1;
     }
     if ($f=~m/^cds$/i || $f=~m/codon/i) {
       $xf='CDS';
       $isExon=1; # still an exon-like feature
       $isCDS=1;
     }
     #my $exonFeature=($xf eq 'exon' || $xf eq 'CDS');
     my $gff3_ID;
     my $gff3_Parent;
     my %attrs; # attribute => value associations
     #$atr=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; in string between quotes?
     my $alstref=[]; # list of attribute names, in their original order (except 'ID')
     $attrs{'.'}=$alstref; #key '.' -> [attribute names in order] 
     my @av=split(/\s*\;\s*/, $atr);
     ($gff3_ID)=($atr=~m/\bID=([^;]+)/);
     ($gff3_Parent)=($atr=~m/\bParent=([^;]+)/);
     my $isGene=($f=~m/gene$/i && $gff3_ID); 
     # the line is treated as GFF3 if it has an ID= or a Parent= attribute
     $isGFF3 = ($gff3_ID || $gff3_Parent);
     my @noteIDs; #for $tFix, this has MONAX IDs in the first Note= attribute found
     if ($isGFF3) { # GFF format
        $input_GFF3=1;
        #parse GFF3 attributes into %attrs
        foreach my $a (@av) {
           my ($attr, $value)=split(/\s*=\s*/,$a,2);
           # unless -K was given only the attributes listed in %oattrs are kept
           next if !$keepAll && !exists($oattrs{$attr});
           $value=~s/[" ]+$//;$value=~s/^[" ]+//;
           $attrs{$attr}=$value;
           next if $tFix && $attr eq 'Note';
           push(@$alstref, $attr) unless $attr eq 'ID';
        }
        if ($tFix) {
            #very specific fix for the horribly broken MONAX annotation GFF
            # the IDs are taken from the url-encoded Note attribute, which is
            # then removed from the attributes
            my $note=$attrs{'Note'};
            if ($note) {
              $note=~s/\w+ variant%3B~//g;
              $note=~s/%3B~[a-z]+//g;
              @noteIDs = grep(/^MONAX_/, (split(/%3B~/, $note)) );
              delete $attrs{'Note'};
            }
        }
        if ($gff3_ID && !$isExon) { #top level feature (e.g. gene, transcript)
           my @recIDs;
           if ($tFix) {
               if (@noteIDs>0) { #only transcripts should have Note ID
                 die("Error: unexpected Note ID found for gene $gff3_ID!\n") if $isGene;
                 @recIDs=@noteIDs;
                 $gff3_Parent=~s/^gene\-// ;
               } else { #must be a gene
                 die("Error: no Note ID found for non-gene $f $gff3_ID!\n") if !$isGene;
                 $gff3_ID=~s/^gene\-// ;
                 @recIDs=($gff3_ID);
               }
           } else {
             @recIDs = ($gff3_ID);
           }
           foreach my $rID (@recIDs) {
             if ($tFix) {
              $attrs{'ID'}=$rID;
              $attrs{'Parent'}=$gff3_Parent if !$isGene;
             }
             my $recData=$recs->{$rID};
             if ($recData) {
                # an existing record is only accepted if it was created from an
                # exon line (xflags value 4), anything else is a duplicate ID
                die("Error: duplicate feature $rID\n") if ($recData->[11] & 4 )== 0 ;
                #feature created by exon (should only happen for basic GTF)
                $recData->[2]=$f;
                #add each exon attribute, if novel
                #print STDERR "combining attributes for $f $rID:\n";
                #print STDERR ">recData->3: ", Dumper $recData->[3];
                #print STDERR ">attrs: ", Dumper \%attrs;
                my $xattrs=$recData->[3];
                foreach my $a (@{$xattrs->{'.'}}) {
                  #add each exon attr to the parent, if new
                  next if $a eq 'ID' || $a eq 'Parent' || exists $attrs{$a};
                  $attrs{$a}=$xattrs->{$a};
                  push(@$alstref, $a);
                }
                $recData->[3] = {%attrs};
                #print STDERR ">after combining: ", Dumper $recData->[3];
                
                
                # the coordinates and score of the parent line replace the
                # values taken from the exon line
                $recData->[4] = $fstart;
                $recData->[5] = $fend;
                $recData->[10] = $fscore;
             }  else {
                push(@gff_IDs, $rID);
                #           0      1      2       3        4      5    6    7    8     9          10   11
                $recData=[$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $track, 
                                                              ($isGene ? $rID : $gff3_Parent), $fscore, 0];
                $recs->{$rID} = $recData;
             }
             if ($isGene) {
               die("Error: gene record already created for $rID\n") if exists($genes{$rID});
               $genes{$rID}=[$recData, [], 0];
             } elsif ($gff3_Parent) { #must be a transcript parented by a gene
               my $gData=$genes{$gff3_Parent};
               if ($gData) { #update existing gene entry
                 push(@{$gData->[1]}, $rID);
               }
               else { #create new %genes entry (gene record missing)
                 $genes{$gff3_Parent}=[ undef, [$rID], 0 ];
               }
             }
           } #for each transcript ID (or 1 gene ID)
           next;
        } # parent/top-level feature
     } #GFF3
     else { ## GTF
        my $gene_id;
        foreach my $a (@av) { #parse GTF attributes
           my ($attr, $value)=split(/\s+"/,$a,2); #"
           next if !$keepAll && !exists($oattrs{$attr});
           $value=~s/[" ]+$//;
           # transcript_id is stored under the 'ID' key
           if ($attr eq 'transcript_id') {
              $attr='ID';
           } else {
             $gene_id=$value if $attr eq 'gene_id';
             push(@$alstref, $attr);
           }
           $attrs{$attr}=$value;
        }
        if ($f eq 'transcript') { # GTF with parent 'transcript' feature
          my $gffID=$attrs{'ID'};
          die("Error: cannot find transcript_id for GTF 'transcript' line:\n$line\n") unless $gffID;
          die("Error: duplicate feature $gffID\n") if (exists($recs->{$gffID}));
          push(@gff_IDs, $gffID);
          #                    0        1    2      3        4        5   6   7    8        9         10     11
          $recs->{$gffID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $track, $gene_id, $fscore, 0 ];
          if ($gene_id) {
             my $gData=$genes{$gene_id};
             if ($gData) { #update existing %genes entry
               push(@{$gData->[1]}, $gffID);
             }
             else { #create new %genes entry
              $genes{$gene_id} = [ undef, [$gffID], 0 ];
             }
          }
          next;
        } # parent 'transcript' feature in GTF
     } #GTF
     next unless $isExon;
     # ---------- only exon/CDS features from here on ------
     my @IDs; # a GFF3 exon can have multiple parents
     my $recID; #ID for the parent record 
     if ($isGFF3) {
        if ($tFix) {
          die("Error: Note IDs not found for $f $gff3_ID of $gff3_Parent !") unless @noteIDs>0;
          @IDs=@noteIDs;
        }
        else { #regular GFF3, possible multiple parents for this exon
          @IDs=split(/\s*,\s*/,$gff3_Parent);
        }
     }
     elsif ($atr=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
       $recID=$1;
       $recID=~tr/"//d; #"
       @IDs=($recID);
     }
     else {
       die("Error: cannot parse parent ID from input line:\n$line\n");
     }
     my $exattrs=''; #exon/CDS attributes
     # exon/CDS attributes are only kept as text with -K and without -E
     if ($keepAll && !$discardExonAttrs) {
       #erase Parent, gene_id, transcript_id attributes
       if ($isGFF3) { 
         #$exattrs=~s/\bParent=[^;]+;?//; }
         foreach my $a (@$alstref) {
           next if $a eq 'Parent';
           my $v=$attrs{$a};
           $exattrs.="$a=$v;";
         }
       }
       else { ## -- GTF
         foreach my $a (@$alstref) {
           next if $a eq 'transcript_id' || $a eq 'gene_id';
           my $v=$attrs{$a};
           $exattrs.="$a \"$v\";";
         }
       }
     }
     foreach $recID (@IDs) {
       my $ld = $recs->{$recID};
       if ($ld) { ## existing parent (transcript/gene) entry, add this exon
         if ($tFix) { #add exon attributes to parent transcript, if they are new
           my $tattrs=$ld->[3];
           $attrs{'Parent'}=$recID if $tFix;
           #print STDERR "attempt to add exon attrs:", Dumper \%attrs;
           #print STDERR "    to transcript attrs:", Dumper $tattrs;
           foreach my $a (@$alstref) {
              #add each exon attr to the parent, if new
              next if $a eq 'ID' || $a eq 'Parent' || exists $tattrs->{$a};
              $tattrs->{$a}=$attrs{$a};
              push(@{$tattrs->{'.'}}, $a);
           }
           #print STDERR " =>  result: ", Dumper $tattrs;
         }
         # slot 7 holds the CDS segments, slot 6 the exon segments
         my $i=($xf eq 'CDS') ? 7 : 6;
         my ($lstart, $lend)=($$ld[4], $$ld[5]);
         # -M : extend the parent span to include this segment
         if ($adjTSpan) {
           if ($fstart<$lstart) {
              print STDERR "Warning: adjusting $recID start $$ld[4] to $fstart due to exon $fstart-$fend\n"
                 if $verbose;
              $$ld[4]=$fstart;
           }
           if ($fend>$lend) {
              print STDERR "Warning: adjusting $recID end $$ld[5] to $fend due to exon $fstart-$fend\n"
                 if $verbose;
              $$ld[5]=$fend;
           }
         }
         # --- exon feature storage format:  start, end, score, CDSphase, feature_name, GFF_attributes
         push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame, $f, $exattrs]);
         #                       0      1       2       3     4   5
       }
       else { #create transcript entry by exon
         # no existing parent transcript entry for this exon, first time seeing this parent ID
         # (shouldn't really happen for a proper GFF3)
         if ($isGFF3) { #this should NOT happen usually, parent should already be there
           #print STDERR "Warning: exon feature found before/without parent $recID:\n$line\n";
           $attrs{'ID'}=$recID;
           $attrs{'Parent'}=$recID if $tFix;
         }
         #else { # GTF input
         #  #die("Error: found exon/CDS entry without parent!\n$line\n") if ($isGFF3);
         #}
         
         push(@gff_IDs, $recID);
         # the new 'transcript' record gets xflags=4 (created by exon) and
         # this segment as its first CDS or exon entry
         # NOTE(review): the test below is on $f, not on $xf, so codon and
         # lowercase 'cds' features are stored in the exon slot here - confirm
         # NOTE(review): slot 9 (documented as geneID) receives the feature
         # name $f in the exon case and '' in the CDS case - confirm intent
         # NOTE(review): unlike the segments pushed above, these entries do
         # not carry the $exattrs text
         $recs->{$recID} = ($f eq 'CDS') ? 
               [$chr, $strand, 'transcript', {%attrs}, $fstart, $fend,          [],      [[$fstart, $fend, $fscore, $frame, $f]], $track, '', '.', 4 ] :
               [$chr, $strand, 'transcript', {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame, $f]], [],               $track, $f, '.', 4 ] ;
             #   0       1      2               3        4       5          6(exons)                          7 (CDS)              8      9   10   11
       }
    } #for each parent ID 
   } #while readline
   close($ifh) if $fclose;
  } #for (@ARGV)
  return $lastChr;
}
sub assignCDS {
 # Applies a user provided strand/CDS assignment (an %assignCDS entry) to a
 # transcript.
 #   $tid     - transcript ID, used in the messages
 #   $rstrand - ref to the strand of the transcript (may be changed)
 #   $exr     - ref to the exon list of the transcript, sorted by start
 #   $cdr     - ref to the CDS segment list of the transcript (may be replaced)
 #   $tcds    - [strand, CDSlo, CDShi, [[segstart, segend],..]]; the segment
 #              list is filled in from the exons when empty, and every
 #              segment gets score, phase and feature name appended
 # In add-only mode ($CDSaddonly) an existing CDS and its strand are never
 # changed, the differences are only reported (if $verbose).
 my ($tid, $rstrand, $exr, $cdr, $tcds)=@_;
 my ($cstrand, $cds_lo, $cds_hi, $cdsegs)=@$tcds;
 my $ex_lo=$exr->[0]->[0];
 my $ex_hi=$exr->[-1]->[1];
 my $hadCDS=(scalar(@$cdr)>0) ? 1 : 0;
 my @info; #messages shown in verbose mode
 unless ($$rstrand eq $cstrand) {
   if ($CDSaddonly && $hadCDS) {
      push(@info, 'strand would have changed to '.$cstrand);
   }
   else {
      $$rstrand=$cstrand;
      push(@info, 'strand changed to '.$cstrand);
   }
 }
 #check if the intervals are compatible (only reported, not fatal here)
 print STDERR "Error: new CDS lower boundary ($cds_lo) outside the region of transcript $tid ($ex_lo-$ex_hi)!\n"
   unless ($cds_lo>=$ex_lo && $cds_lo<=$ex_hi);
 print STDERR "Error: new CDS higher boundary ($cds_hi) outside the region of transcript $tid ($ex_lo-$ex_hi)!\n"
   unless ($cds_hi>=$ex_lo && $cds_hi<=$ex_hi);
 unless (@$cdsegs) { #rebuild cdsegs from cds_lo:cds_hi
   my $inCDS=0;
   foreach my $e (@$exr) {
      my ($e_lo, $e_hi)=($$e[0], $$e[1]);
      my ($seg_lo, $seg_hi)=($e_lo, $e_hi);
      if (!$inCDS && $cds_lo>=$e_lo && $cds_lo<=$e_hi) { #CDS starts in this exon
         $seg_lo=$cds_lo;
         $inCDS=1;
      }
      next unless $inCDS;
      my $isLast=($cds_hi>=$e_lo && $cds_hi<=$e_hi); #CDS ends in this exon
      $seg_hi=$cds_hi if $isLast;
      push(@$cdsegs, [$seg_lo, $seg_hi]);
      last if $isLast;
   }
   die("Error mapping new CDS boundaries ($cds_lo:$cds_hi) to $tid exons!\n") unless @$cdsegs>0;
 }
 #add phase to CDS (assuming starts with 0), in the direction of translation
 my @ordered=($cstrand eq '-') ? reverse(@$cdsegs) : @$cdsegs;
 my $ph=0;
 my $aclen=0; #accumulated length of the previous CDS segments
 foreach my $cs (@ordered) {
   $ph=((3-$aclen) % 3)%3 if $aclen;
   $aclen+=$$cs[1]-$$cs[0]+1;
   die("Error: unexpected CDS segment data for $tid CDS assignment!\n") if (@$cs!=2);
   #complete the segment data: score, phase, feature name
   push(@$cs, '.', $ph, 'CDS');
 }
 if (!$hadCDS) {
   push(@info, "CDS added: $cds_lo:$cds_hi");
   @$cdr=@$cdsegs;
 }
 else { # re-assigned
   #compare the new segments (coordinates and phase) with the existing ones
   my $differs=(scalar(@$cdr) != scalar(@$cdsegs));
   unless ($differs) {
      foreach my $i (0 .. $#$cdr) {
        my ($old, $new)=($$cdr[$i], $$cdsegs[$i]);
        next if ($old->[0]==$new->[0] && $old->[1]==$new->[1] && $old->[3] eq $new->[3]);
        $differs=1;
        last;
      }
   }
   my $changed='preserved';
   if ($differs) {
     if ($CDSaddonly) {
       $changed='would have changed';
     }
     else {
       @$cdr=@$cdsegs;
       $changed='changed';
     }
   }
   push(@info, "CDS $changed: $cds_lo:$cds_hi");
 }
 if ($verbose && @info) {
   print STDERR "INFO: CDS assign for $tid: ".join('; ',@info)."\n";
 }
}

# processGffRecs($recs, $rlist)
# Goes through the stored records, applies every active filter and prints the
# records that pass (plain print, i.e. to the currently selected output handle)
# as GTF, GFF3 or, when $tab_attrs is set, as a tab delimited attribute table.
#   $recs  : hash ref, record ID => record array with the layout
#            [chr, strand, feature, attrs hash, start, end, exons, CDS segments,
#             track, geneID, score, xflags]
#   $rlist : optional array ref with the record IDs to process, in order;
#            when missing, all keys of %$recs are processed (hash order).
#            With $keepCmts set, entries starting with '#' are printed verbatim.
# Returns the count of gene records having more than one transcript
# (undef when no such gene was seen).
# Side effects visible in this sub: may set $printGTF, flags filtered-out genes
# in $genes{..}->[2], removes the 'ID' key from the attrs hash of each printed
# record, and collects intron junctions in %jData when $jOut is set.
sub processGffRecs {
 #return if keys(%recs)==0;
 #GTF output is forced whenever $input_GFF3 is not set
 $printGTF=1 if (!$printGTF && !$input_GFF3);
 my ($recs, $rlist)=@_;
 my @recs_keys;
 my $mtG; #multi-transcript gene count
 unless ($rlist) {
   #no explicit ID list given: process every stored record
   @recs_keys=keys(%$recs);
   $rlist=\@recs_keys;
 }
 my $lastchr; #to detect chr change
 my $dshift=0;
 foreach my $recid (@$rlist) {
   if ($keepCmts && $recid=~m/^\s*#/) {
     print $recid; #comment lines are printed as is
     next;
   }
   my $td=$$recs{$recid};
   die("Error: cannot find GFF record for $recid\n") unless $td;
   #refine the exon/CDS segments of this record (done only once per record)
   processExons($recid, $td) unless $tFix;
   #     0       1       2        3        4      5       6    7      8       9      10         11
   my ($chr, $strand, $feature, $attrs, $fstart, $fend,  $er, $cr,  $track, $geneID, $fscore, $xflags) = @$td;
   my $newchr=($chr ne $lastchr);
   if ($newchr) {
      $dshift=0;
      $lastchr=$chr;
   }
   #next if ($rej && !$fltinvert);
   #next if ($fltinvert && $featlist && !$rej);
   #a record is treated as a gene when its geneID field holds its own ID
   my $isGene=($geneID eq $recid);
   my $CDSonly = ($xflags & 2); # set to true if only CDS segments were given
   my $hasCDS=(@$cr>0);
   my @ex=@$er; #refined exon segments (in case only UTRs are given, or non-CDS exon fragments)
   my @cds=@$cr; #refined CDS segments (in case start/stop_codons are given)
   if (scalar(@cds)>0 && $attrs->{'TYPE'} eq 'non_coding') {
       #make sure that TYPE is changed from "non_coding" to "protein_coding_uncertain"
       $attrs->{'TYPE'}='protein_coding_uncertain';
       print STDERR "INFO: $recid TYPE changed to 'protein_coding_uncertain'\n" if $verbose;
   }
   # --------------
   # get the more accurate version of the start-end coords for the feature
   my $covlen=0;
   #my $gffid=$recid;
   #substr($gffid, 0, length($chr)+1)='';
   die("Error: GFF ID ($recid) not matching attrs\{ID\}(".$attrs->{'ID'}.")!\n")
       unless ($recid eq $attrs->{'ID'});
   ##---check/fix gene boundaries first!
   if ($isGene) {
     #$genes{geneID}->[1] is used below as the list of transcript IDs of the gene
     my $gdata=$genes{$recid};
     if ($gdata) { #should be there!
        my ($min, $max); #span covered by the transcripts of this gene
        my $numt=scalar(@{$gdata->[1]});
        #if ($numt>1 && @ex>0) {
        #    print STDERR ("Warning: gene $recid has exons and multiple transcripts ($numt)\n");
        #}
        foreach my $tid (@{$gdata->[1]}) {
          my $tdata=$$recs{$tid};
          if ($tdata) {
             #transcript coordinates must be refined before they are used here
             processExons($tid, $tdata) unless $tFix;
             if ($tdata->[0] eq $chr) {
                #$min is still unset (false) for the first transcript
                $min=$tdata->[4] if $tdata->[4]<$min || !$min;
                $max=$tdata->[5] if $tdata->[5]>$max;
             }
             else {
               die("Error: gene $recid has different chromosome ($chr) than its transcript $tid ($tdata->[0])!\n");
             }
             #if ($numt>1 && @ex>0 && $tFix) {
             #  print STDERR ("Warning:    not fixing transcript $tid of gene $recid !\n");
             #  next;
             #}
             if ($tFix) {
               #for a transcript with several exon entries, drop the first entry
               #that spans the whole transcript (start-end equal to the transcript's)
               my @tex=@{$tdata->[6]};
               my ($tstart, $tend)=($tdata->[4], $tdata->[5]);
               if (@tex>1) {
                  for (my $i=0;$i<@tex; $i++) {
                    if ($tex[$i]->[0]==$tstart && $tex[$i]->[1]==$tend) {
                      splice(@{$tdata->[6]}, $i, 1);
                      last;
                    }
                  }
               }
             }
             
             #NOTE: this branch is disabled (its condition starts with 0)
             if (0 && $tFix && (@ex>0 || @cds>0) ) { #parent gene also parents exons directly!
               my @tex=@{$tdata->[6]};
               my @tcd=@{$tdata->[7]};
               if (@tex == 1) { #single exon transcript
                   if (@cds>0 && @tcd==0) {
                     #just transfer the CDS to the exon
                     $tdata->[7]=[@cds];
                     @cds=();
                     $td->[7]=[]; #remove the CDS data from the gene
                  }
                  if (@ex>0) { #if gene has exons
                     if (@ex>1) { #completely replace transcript exons with the gene's
                        $tdata->[6]=[@ex];
                     }
                     else { #single exon transcript & gene
                      die("Warning: single exon transcript $tid not matching the single exon of gene $recid\n")
                        unless $tex[0]->[0]==$ex[0]->[0] && $tex[0]->[1]==$ex[0]->[1];
                     }
                     @ex=();
                     $td->[6]=[];
                  } #if gene has exons
               }
               elsif (@tex > 1) { #ex>0 so it's strange anyway
                 if (@tex != @ex) {
                    print STDERR "Warning: transcript $tid has a different exon count than parent gene $recid\n";
                 }
               }
             } # $tFix
          }
          else {
            die("Error: could not retrieve transcript data for $tid of gene $recid!\n")
              unless $tdata;
          }
        }#for each transcript of this gene
        #optionally replace the gene span with the span of its transcripts
        if ($adjGeneSpan && $min && $max && ($min!=$fstart || $max!=$fend)) {
          print STDERR "Warning: gene $recid original span ($fstart-$fend) adjusted to ($min-$max)\n"
             if $verbose;
          ($fstart, $fend)=($min, $max);
        }
     #genes without transcripts and without own exon/CDS segments can be dropped
     next if ($discardBareGenes && @{$gdata->[1]}==0 && @ex==0 && @cds==0);
     $mtG++ if @{$gdata->[1]}>1;
     }
     else {
       print STDERR "Warning: no %genes entry found for $recid !\n";
     }
   } else { # not a gene - so likely a transcript
     #should be skipped if parent gene was already filtered out
     my $gdata=$genes{$geneID};
     if ($geneflt && $gdata && $gdata->[2]==1) { #gene was specifically filtered out!
       #$doprint=0;
       #that gene filter has already been inverted!
       next;
     }
   }
   next if ($trOnly && @ex==0 && @cds==0);
   my $doprint=1;
   #check all the filters
   $doprint=0 if ($only_ifCDS && $hasCDS==0);
   if ($doprint && $idfile) {
     #ID list filter: match on the given attribute value, or on the record ID
     if ($idfileattr) {
        $doprint=0 unless exists($idflt{$attrs->{$idfileattr}});
      }
      else {
        $doprint=0 unless exists($idflt{$recid});
      }
   }
   #the attribute=value filter targets non-gene records by default, or only
   #the records whose feature type equals $avflt_feature when that is set
   my $check_avflt=$avflt && ((!$avflt_feature && !$isGene) || ($avflt_feature eq $feature));
   if ($doprint && $check_avflt) {
      my $avfound=0;
      if ($avflt_AND) {
        #all attributes must be present and match
        $avfound=1;
        foreach my $a (keys(%attrflt)) {
          my $v=$attrs->{$a};
          my $fvs=$attrflt{$a};
          if (!$v || !$fvs) { $avfound=0; last; }
          #every value listed for this attribute has to equal the record's value
          foreach my $fv (@$fvs) {
            if ($fv ne $v) {
              $avfound=0;
              last;
            }
          }
          last if $avfound==0;
        }
      } else {
        #OR mode: a single matching attribute value is enough
        foreach my $a (keys(%attrflt)) {
          foreach my $fv (@{$attrflt{$a}}) {
            if ($fv eq $attrs->{$a}) {
              $avfound=1;
              last;
            }
            last if $avfound;
          }
        }
      }
      $doprint=0 if !$avfound;
   }
  
   if ($doprint && $flt_chr && $flt_chr ne $chr) {
     $doprint=0;
   }
   if ($doprint) {
     if ($flt_strand && $flt_strand ne $strand) {
       $doprint=0;
     }
   }
   if ($doprint && @flt_intv>0) {
    #region filter, tested against the intervals stored in @flt_intv
    my $ovlmatch=0;
    if ($exon_ovl) { #at least one exon segment must pass checkOvlSegs
       for my $ed (@ex) {
         if (checkOvlSegs($$ed[0], $$ed[1], \@flt_intv)) {
            $ovlmatch=1;
            last;
            }
         }
    } elsif ($strict_ovl) {
       #whole record span is tested with checkOvlWithin
       $ovlmatch=checkOvlWithin($fstart, $fend, \@flt_intv);
    } else {
       #whole record span is tested with checkOvlSegs
       $ovlmatch=checkOvlSegs($fstart, $fend, \@flt_intv);
    }
    $doprint=$ovlmatch;
  }
  #inverting the filter?
  $doprint = !$doprint if $fltinvert;
  if (!$doprint) {
     if ($isGene && $geneflt) { #specific gene filtering
       #flag the gene so that its transcripts are skipped when they come up
       my $gdata=$genes{$recid};
       $gdata->[2]=1 if $gdata;
     }
     next;
  }
  ## --check for chr sequence deletion
  if ($del_chr && $del_chr eq $chr) {
    #checkOvlDels() result as used here: 0 = not affected, 1 = record must be
    #discarded, any other value = keep the record; the coordinates are read
    #back from the array passed in (presumably adjusted in place by
    #checkOvlDels -- TODO confirm)
    my $fspan=[$fstart, $fend];
    my $fovldel=checkOvlDels($fspan, \@del_intv);
    if ($fovldel) {
     if ($fovldel==1) {
        print STDERR "INFO: record $recid discarded due to overlapping deleted region.\n"
          if $verbose;
        if ($isGene) {
          my $gdata=$genes{$recid};
          $gdata->[2]=1 if $gdata;
        }
        next;
     }
     ($fstart, $fend)=@$fspan;
     my $exovl; #set if exon overlap detected
     for my $ed (@ex) {
        my $xovl=checkOvlDels($ed, \@del_intv); 
        $exovl+=$xovl;
        if ($xovl==1) {
            $exovl=1;
            last;
        }
     }
     if ($exovl) {
        if ($exovl==1) {
          print STDERR "Warning: transcript $recid discarded due to exon overlaps with deleted region.\n"
            if $verbose;
          next;
        }
     }
     if (@cds>0) { #have to adjust CDS coordinates too
       for my $cd (@cds) {
          my $cdovl=checkOvlDels($cd, \@del_intv);
          print STDERR "Warning: ($recid) CDS affected by deleted regions, even though exons aren't?!\n"
              if ($cdovl && !$exovl);
          }
     }
    } #if $fovldel
  }
  #filter passed, print output
  #if ($delTinyIntrons) {
  #  rmTinyIntrons(\@ex) if @ex>1;
  #  #-- do NOT do it for CDS - because of "ribosomal slippage exception" (programmed frameshift)
  #  #rmTinyIntrons(\@cds) if @cds>1;
  #}
  
  if ($tab_attrs) {
     #table output: one tab delimited line per record, one field per entry
     #of @atab_cols. Column name forms handled below:
     #   ID (any letter case) : the record ID
     #   @name                : computed field (chr, start, end, strand, numexons,
     #                          exons, cds, covlen, cdslen); an unknown @name adds no field
     #   ^name                : attribute 'name' of the record given by this record's
     #                          Parent attribute ('.' if not available)
     #   anything else        : attribute value of this record ('.' if empty)
     my @od;
     my $pck; #parent checked?
     my $pd;  # if $pcheck and it has a parent, $pd=$$recs{$parentID}
     foreach my $a (@atab_cols) {
       if (uc($a) eq 'ID') {
          push(@od, $recid);
          next;
       }
       my $fc=substr($a,0, 1);
       if ($fc eq '@') {
         my $f=substr($a,1);
         if ($f eq 'chr') {
           push(@od, $chr);
         }
         elsif ($f eq 'start') {
           push(@od, $fstart);
         }
         elsif ($f eq 'end') {
           push(@od, $fend);
         }
         elsif ($f eq 'strand') {
           push(@od, $strand);
         }
         elsif ($f eq 'numexons') {
           push(@od, scalar(@ex));
         }
         elsif ($f eq 'exons') {
          push(@od, join(',', (map { "$$_[0]-$$_[1]" } @ex)));
         }
         elsif ($f eq 'cds') {
          push(@od, @cds>0 ? join(',', (map { "$$_[0]-$$_[1]" } @cds)) : '.');
         }
         elsif ($f eq 'covlen') {
          #total length of the exon segments (or of the CDS segments if there are no exons)
          my $cov=0;
          my $r=@ex>0?\@ex : \@cds;
          map { $cov+=($$_[1]-$$_[0]+1) } @$r;
          push(@od, $cov);
         }
         elsif ($f eq 'cdslen') {
          my $cov=0;
          map { $cov+=($$_[1]-$$_[0]+1) } @cds;
          push(@od, $cov);
         }
         next;
       }
       if ($fc eq '^') {
         my $f=substr($a,1);
         my $av='.';
         if (!$pck) {
            #the parent record is looked up only once per record
            my $pID=$attrs->{'Parent'};
            if (length($pID)>0) {
               $pd=$$recs{$pID};
            }
            $pck=1;
         }
         if ($pd) {
           $av=${$pd->[3]}{$f};
           $av='.' if length($av)==0;
         }
         push(@od, $av);
         next;
       }
       my $av=$attrs->{$a};
       push(@od, length($av)>0 ? $av : '.');
     }
     print join("\t",@od)."\n";
     next;
  }
  #chromosome mapping (renaming) requested?
  if ($chrmapping) {
    my $cnew=$chrmap{$chr};
    $chr=$cnew if ($cnew);
  }
  
  #NOTE: this removes the ID entry from the stored attribute hash
  my $tid=delete($attrs->{'ID'});
  my $tattrs; #attribute prefix shared by all exon/CDS lines of this record
  #$attrs->{'.'} is used below as the list of attribute names to print
  #(presumably kept in input order by the parser -- verify there)
  if ($printGTF) {
     my $pattrs;
     #do not print gene features or features without exons/CDS!
     next unless (@ex>0 || @cds>0);
     my $gene_id; #for GTF printing
     #gene_id is taken, in this order, from: attribute 'gene_id', the first
     #attribute named like gene_id/geneid (case insensitive), an attribute
     #named 'gene', or the first attribute whose name starts with 'gene'
     $gene_id=delete($attrs->{'gene_id'});
     if (!$gene_id) {
       my @gattrs=grep { /^gene_?id/i } @{$attrs->{'.'}};
       $gene_id=delete($attrs->{$gattrs[0]}) if @gattrs>0
     }
     if (!$gene_id) {
       my @gattrs=grep { /^gene$/i } @{$attrs->{'.'}};
       @gattrs=grep { /^gene/i } @{$attrs->{'.'}} if (@gattrs==0);
       $gene_id=delete($attrs->{$gattrs[0]}) if @gattrs>0
     }
     $pattrs='transcript_id "'.$tid.'";';
     $pattrs.=' gene_id "'.$gene_id.'";' if $gene_id;
     $tattrs=$pattrs; #exon/CDS lines only get transcript_id and gene_id
     if ($keepAll) {
        foreach my $attr (@{$attrs->{'.'}}) {
           my $val=$attrs->{$attr};
           if ($Udecode) {
             $val=gff3_decode($val);
             $val=gff3_fixspecial($val, 1);
           }
           $pattrs.=' '.$attr.' "'.$val.'";' if $val;
        }
     }
     else { # only a subset of attributes will be shown
        foreach my $attr (@{$attrs->{'.'}}) {
           next unless exists($oattrs{$attr});
           my $val=$attrs->{$attr};
           if ($Udecode) {
             $val=gff3_decode($val);
             $val=gff3_fixspecial($val, 1);
           }
           $pattrs.=' '.$attr.' "'.$val.'";' if $val;
        }
     }
    #in GTF mode the parent line is always written with the 'transcript' feature type
    print join("\t",$chr, $track, 'transcript', $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n";
  } else { #print GFF3
     my $pattrs="ID=$tid";
     #$pattrs.='gene_name='.$gene_name.';' if $gene_name;
     #$pattrs.='gene='.$gene.';' if $gene;
     #$pattrs.='locus='.$locus.';' if $locus;
     if ($keepAll) {
        foreach my $attr (@{$attrs->{'.'}}) {
           my $val=$attrs->{$attr};
           if ($Udecode) {
             $val=gff3_decode($val);
             $val=gff3_fixspecial($val);
           }
           $pattrs.=";$attr=$val" if $val;
           }
     }
     else { # only a subset of attributes will be shown
        foreach my $attr (@{$attrs->{'.'}}) {
           next unless exists($oattrs{$attr});
           my $val=$attrs->{$attr};
           if ($Udecode) {
             $val=gff3_decode($val);
             $val=gff3_fixspecial($val);
           }
           $pattrs.=";$attr=$val" if $val;
        }
     }
     print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n";
     $tattrs='Parent='.$tid;
  }
  #remove a trailing ';' (if any) since more attributes may be appended with ';'
  { local $/=';'; chomp($tattrs); }
  if ($jOut) {
     #collect the introns (gaps between consecutive exons) of this record:
     #%jData maps "chr:start~end:strand" to the list of record IDs having it
     my ($is, $ie)=(0,0);
     foreach my $ed (@ex) {
       $ie=$$ed[0]-1;
       if ($is) {
         #store this intron (junction) info
         my $jn=join(':',$chr, $is.'~'.$ie, $strand);
         my $ts=$jData{$jn};
         if (!$ts) {
            $ts=[];
            $jData{$jn}=$ts;
         }
         push(@$ts, $tid);
       }
       $is=$$ed[1]+1;
     }
  }
  if ($CDSonly==0) { #write exons only when they're also found in the input
        foreach my $ed (@ex) {
            my $xattrs=$tattrs;
            #element [5] of a segment, when set, is appended as extra attribute text
            $xattrs.=';'.$$ed[5] if $$ed[5];
            $xattrs.=';'.sprintf( ($printGTF?'exonlen "%d"':'exonlen=%d'), $$ed[1]-$$ed[0]+1) if $exlen2attr;
            my $exscore=$$ed[2];
            if ($exlen2score) {
              #score column becomes the segment length, or score% of it when
              #the original score is above 10 and at most 100
              $exscore= ($exscore>10 && $exscore<=100) ? sprintf('%.1f', $exscore*($$ed[1]-$$ed[0]+1)/100.0) : ($$ed[1]-$$ed[0]+1) ;
            }
            print join("\t",$chr, $track, 'exon', $$ed[0], $$ed[1], $exscore, $strand, $$ed[3], $xattrs)."\n";
        }
  }
  #CDS segments are written with the same rules as the exon segments above
  foreach my $cd (@cds) {
        my $xattrs=$tattrs;
        $xattrs.=';'.$$cd[5] if $$cd[5];
        $xattrs.=';'.sprintf( ($printGTF?'exonlen "%d"':'exonlen=%d'), $$cd[1]-$$cd[0]+1) if $exlen2attr;
        my $exscore=$$cd[2];
        if ($exlen2score) {
          $exscore= ($exscore>10 && $exscore<=100) ? sprintf('%.1f', $exscore*($$cd[1]-$$cd[0]+1)/100.0) : ($$cd[1]-$$cd[0]+1) ;
        }
        print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $exscore, $strand, $$cd[3], $xattrs)."\n";
  }
 } #for each stored transcript
 return $mtG;
}

# processExons($tid, $td)
# One-time refinement of the exon/CDS segments of a stored record.
#   $tid : record ID (used for messages and for the %assignCDS lookup)
#   $td  : record array ref, rewritten in place; layout:
#          [chr, strand, feature, attrs, start, end, exons, CDS, track,
#           geneID, score, xflags]
# xflags bit 1 marks the record as processed (further calls return at once),
# bit 2 is set when the record only has CDS segments to be printed.
# Steps: exon/CDS cleanup (or CDS-only mode with $discard_exons), optional
# removal of short terminal exons ($minEndExon), optional CDS/strand
# re-assignment ($CDSdatafile) and optional adjustment of the record span to
# the exon boundaries ($adjTSpan).
sub processExons {
   my ($tid, $td)=@_;
   return if ($$td[11] & 1); #already processed
   #     0       1       2        3        4      5       6    7      8       9      10        11
   my ($chr, $strand, $feature, $attrs, $fstart, $fend,  $er, $cr,  $track, $geneID, $fscore, $xflags) = @$td;
   if (@$er==0 && @$cr==0) {
     #no segments to refine, only flag the record as processed
     $td->[11]|=1;
     return;
   }
   my $isGene=($geneID eq $tid);
   my @ex; #refined exon segments (in case only UTRs are given, or non-CDS exon fragments)
   my @cds; #refined CDS segments (in case start/stop_codons are given)
   my $CDSonly=0;
   my $numExons=scalar(@$er); #initial exon count
   $CDSonly=1 if (@$er==0 && @$cr>0); #no exon entries, just CDS
   if ($discard_exons) {
     @cds=sort { $a->[0] <=> $b->[0] } @$cr;
     $CDSonly=1;
     #-- also silently adjust transcript boundaries to just the CDS region
     # Only when there is a CDS: dereferencing $cds[0] on an empty list would
     # autovivify a bogus empty segment (making the record look protein coding)
     # and would reset the record span to undefined values.
     ($fstart, $fend)=($cds[0]->[0], $cds[-1]->[1]) if @cds>0;
   } else {
     cleanupExons($er, \@ex, $cr, \@cds, $tid); #check for merging exon/UTR/CDS/start_codon/stop_codon into exon/CDS segments
   }
   #a changed size of the input exon list is reported for genes
   if ($isGene && $numExons!=scalar(@$er)) {
       print STDERR "Warning: gene $tid had problematic exons.\n";
   }
   if ($minEndExon) {
     #remove short terminal exons!
     my $nexons=scalar(@ex);
     shift(@ex) while (@ex>0 && $ex[0]->[1]-$ex[0]->[0]<=$minEndExon);
     pop(@ex) while (@ex>0 && $ex[-1]->[1]-$ex[-1]->[0]<=$minEndExon);
     print STDERR "Warning: short terminal exon(s) removed for $tid\n"
               if $verbose && @ex<$nexons;
   }
   #---- for transcripts, check for new CDS/strand assignment
   if ($CDSdatafile && @ex>0) { # $discard_exons is incompatible with -y option
      my $tcds=$assignCDS{$tid};
      assignCDS($tid, \$strand, \@ex, \@cds, $tcds) #modifies $strand and @cds as needed!
         if ($tcds);
   }
   if ($adjTSpan && @ex>0) {
     #make the record span agree with the first/last exon coordinates
     if ($fstart!=$ex[0]->[0]) {
       $fstart=$ex[0]->[0];
       print STDERR "Warning: start coordinate adjusted to exon boundary for $tid\n"
         if $verbose;
     }
     if ($fend!=$ex[-1]->[1]) {
       $fend=$ex[-1]->[1];
       print STDERR "Warning: end coordinate adjusted to exon boundary for $tid\n"
         if $verbose;
     }
   }
   $xflags|=1; #processed
   $xflags|=2 if $CDSonly;
   #       0      1         2        3       4       5      6      7       8        9        10       11
   @$td=($chr, $strand, $feature, $attrs, $fstart, $fend,  \@ex, \@cds,  $track, $geneID, $fscore, $xflags );
}


# gff3_encode($v)
# Returns a copy of $v in which every character other than an ASCII letter
# or digit is replaced by '%' followed by its code as (at least) two
# uppercase hex digits.
sub gff3_encode {
 my ($raw) = @_;
 (my $encoded = $raw) =~ s/([^A-Za-z0-9])/sprintf("%%%2.2X", ord($1))/ge;
 return $encoded;
}

# gff3_fixspecial($v, $GTF)
# Returns a copy of $v with the characters that would break an attribute
# value replaced:
#   control characters (0x00-0x1F, 0x7F) -> space
#   ';' -> '|'    ',' -> space    '&' -> '_'
#   '=' (with surrounding whitespace)    -> ':'
#   '%' (with leading whitespace)        -> ' prc.'
#   '"' -> "'"   (only when $GTF is true)
sub gff3_fixspecial {
 my ($text, $forGTF) = @_;
 for ($text) { #all the replacements below operate on $text through $_
   tr/\x00-\x1F\x7f/ /;
   tr/;,&/| _/;
   s/\s*=\s*/:/g;
   s/\s*\%/ prc./g;
   tr/"/'/ if $forGTF; #"protect for GTF
 }
 return $text;
}

# gff3_decode($v)
# Returns a copy of $v with every %XX escape (XX = two hex digits, any case)
# replaced by the character having that code.
# A '%' that is not followed by two hex digits is left untouched, so a
# literal percent sign in a value (e.g. "50% of") is not turned into a NUL
# character that swallows the two characters after it.
sub gff3_decode {
 #actually decodes any % encoded characters
 my ($v) = @_;
 #$v =~ s/\+/ /g; #url decoding would also need this
 $v =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
 return $v;
}