gff2tbl

#!/usr/bin/perl
use strict;
use Getopt::Std;
#use FindBin;use lib $FindBin::Bin;
##
##

use Fcntl qw(SEEK_SET SEEK_CUR SEEK_END);

# Help text shown whenever the command line cannot be parsed.
my $usage = <<'END_USAGE';
Usage:
 gff2tbl -f <fasta_file> [-m <id2acc_map>] [-t <track>] [-o <out_prefix>]
    [-c <lab_prefix> ] [-b dbname] [-L] [-x <exclude.lst>] [-G] [-n <organism_name>] <gff_file>

 Convert GFF to .fsa and .tbl files to be used by tbl2asn

 For -m option, <id2acc_map> is a 2-column file mapping the genomic sequence IDs
 from the input gff and fasta_file (1st column) to GenBank accessions (2nd column)
 Other options:
    -L   :  use locus attribute for transcripts in order to identify a "gene"
            and its boundaries
    -G   : also output <out_prefix>.gff for table2asn
END_USAGE

# Files created by this script are group-writable.
umask 0002;

# Option values are read further down through $Getopt::Std::opt_<letter>.
getopts('LGDn:b:c:x:m:f:t:o:') or die($usage."\n");

# Codons accepted as a terminator on regular sequences (lookup sets).
my %stop = map { ($_ => 1) } qw(TAA TAG TGA TAR TRA);

# Codons accepted as a terminator when the sequence name matches "mito".
my %mitostop = map { ($_ => 1) } qw(TAA TAG AGA AGG);

# biotype values that are written out verbatim as the ncRNA_class qualifier.
my %ncRNAclass = map { ($_ => 1) } qw(
    antisense_RNA
    RNase_P_RNA
    telomerase_RNA
    lncRNA
    RNase_MRP_RNA
    scRNA
    siRNA
    miRNA
    snoRNA
    snRNA
    SRP_RNA
    vault_RNA
    pre_miRNA
);
# ---- Option values -------------------------------------------------------
# getopts() stored each option in $Getopt::Std::opt_<letter>.
my $fastafile=$Getopt::Std::opt_f || die("$usage A fasta file is required!\n");
my $gffout=$Getopt::Std::opt_G; 
# Default output prefix: basename of the FASTA file without its extension.
my $defpre;
($defpre)=($fastafile=~m{([^/]+)$});
$defpre=~s/\.\w+$//;
my @tbl;
my $orgcode=$Getopt::Std::opt_c || 'jhhlp';      # locus_tag prefix
my $dbname=$Getopt::Std::opt_b || $orgcode.'CGS'; # db name used in gnl|...| IDs
my $debug=$Getopt::Std::opt_D;
my $useLocus=$Getopt::Std::opt_L;
my $idmapfile=$Getopt::Std::opt_m;
my $xfile=$Getopt::Std::opt_x;

# ---- Exclusion list (-x) -------------------------------------------------
# One entry per line; only the text after the last '|' is used as the key.
# Keys are compared against feature IDs, Parent values and generated locus tags.
my %xclude;
if ($xfile) {
 die("Error: file $xfile not found!\n") unless -f $xfile;
 # three-argument open: the file name can never be taken as a mode or a pipe
 open(my $xfh, '<', $xfile) || die("Error opening file $xfile\n");
 while (my $line=<$xfh>) {
   $line=~s/[\r\n]+$//;          # tolerate CRLF line endings
   next unless $line=~m/\S/;     # a blank line must not add an empty key
   my @fields=split(/\|/, $line);
   next unless @fields;
   $xclude{$fields[-1]}=1;
 }
 close($xfh);
}
my $fileprefix=$Getopt::Std::opt_o || $defpre;
my $orgname=$Getopt::Std::opt_n;
#my $track=$Getopt::Std::opt_t || 'jigsaw';
# die("$usage A feature track must be specified!\n");
my $gff=shift(@ARGV) || die($usage."Error:An input gff3 file must be given!\n");
die("$gff: nothing to do.") if (-e $gff && ((-s $gff)<10));

# ---- Sequence ID => GenBank ID map (-m) ----------------------------------
# Each "<id> <accession>" line becomes
#   id => "gnl|WGS:<first 4 chars of accession>|<id>|gb|<accession>"
my %idmap;
if ($idmapfile) {
 open(my $mfh, '<', $idmapfile) || die ("Error opening $idmapfile!\n");
 while (my $line=<$mfh>) {
   my ($id, $acc)=split(' ', $line);
   # blank or single-column lines would otherwise yield a broken
   # "gnl|WGS:|id|gb|" replacement ID
   next unless defined($acc) && length($acc);
   $idmap{$id}="gnl|WGS:".substr($acc, 0,4)."|$id|gb|$acc";
 }
 close($mfh);
}
my @seqlst; # genomic sequence IDs, collected while reading the FASTA input
my %seqtbl; # seqid => [ file_offset, seq_len, file_len, [@genetbl for seq] ]
            #   file_offset / file_len locate the formatted sequence text in
            #   <out_prefix>.fsa (see writeFastaRecord); the last slot is the
            #   list of gene records on that sequence (filled by addToGene)

my %gn; #unique gene names (only referenced from commented-out code in the visible file)
             #             0       1      2     3   4                  5
my %genetbl; # geneid => [chr, strand, start, end, [@ttbl for gene], $gid]

          # One record per transcript feature read from the GFF:
          #                    0      1       2     3    4        5
my %ttbl; # transcript_id => [chr, strand, start, end, geneID, [product, simacc, psim, qcov, eval],
          #                               [@exons], [@CDS],   $t_id, $partialness, RNA_Type,  biotype]
          #                                   6       7         8        9           10        11
          # exons / CDS are lists of [start, end] pairs added by storeFeature
my ($seqid, $ntseq, $ntlen); # current genomic sequence: ID, nucleotide string and its length
                             # (shared with the subroutines below as globals)

# ---- Pass 1: copy the genomic FASTA into <out_prefix>.fsa ------------------
# Every non-empty record is re-written by writeFastaRecord(), which also
# registers it in %seqtbl; the (possibly remapped) ID is appended to @seqlst.
open(FSA, '>', $fileprefix.'.fsa') 
  || die ("Error creating file $fileprefix.fsa !\n");

open(FA, '<', $fastafile)||die("Error opening $fastafile !\n");

{
  my $fsa_offset=0; # running offset of the next byte to be written to FSA
  $ntlen=0;
  $ntseq='';
  # Write out the record accumulated so far (if any) and reset the buffers.
  # A header without sequence is reported and NOT added to @seqlst; it would
  # have no %seqtbl entry and abort the output stage later on.
  my $flushRecord = sub {
    if (defined($seqid) && length($seqid)>0) {
      if ($ntlen>0) {
        #basic consistency check
        if (exists($seqtbl{$seqid})) {
           die("Error: duplicate genomic sequence $seqid encountered!\n");
        }
        $fsa_offset=writeFastaRecord($seqid, \$ntseq, $ntlen, $fsa_offset); #write to FSA
        push(@seqlst, $seqid);
      }
      else {
        print STDERR "Warning: skipping empty genomic sequence $seqid\n";
      }
    }
    # always reset, so stray text can never leak into the next record
    $ntseq='';
    $ntlen=0;
    $seqid='';
  };
  while (<FA>) {
    if (m/^>(\S+)/) { # FASTA header (new seqid)
      my $sid=$1;
      $flushRecord->();
      $seqid=$sid;
      if ($idmapfile) {
        # use the GenBank-style ID when the map has one, else keep the original
        my $newid=$idmap{$seqid};
        $seqid=$newid if $newid;
      }
      next;
    }
    #sequence line: drop all whitespace, store upper-cased
    tr/\n\r\t //d;
    my $llen=length($_);
    next if $llen==0; #skip empty lines
    $ntseq.=uc($_);
    $ntlen+=$llen;
  } #while <FA>
  $flushRecord->(); # last record
}
close(FA);
# buffered write errors only surface when the handle is closed
close(FSA) || die("Error writing file $fileprefix.fsa !\n");

# ---- Pass 2: read the GFF ---------------------------------------------------
# Transcript features (type ending in "RNA") are stored in %ttbl and attached
# to their gene through addToGene(); exon and CDS features are attached to
# their parent transcript by storeFeature(). Everything else is ignored.
open(GFF, '<', $gff) || die("Error opening the input gff file: $gff !\n");
#print FTBL join("\t", '', '', '','note', $credits)."\n";

my %nomap; # contigs already reported as missing from the ID map
while (<GFF>) {
  # an embedded FASTA section ends the annotation part of a GFF3 file
  last if m/^##FASTA\b/ || m/^>/;
  next if m/^\s*#/;
  s/[\r\n]+$//;       # also removes the CR of CRLF line endings
  next unless m/\S/;  # blank line
  my ($chr, $ftrack, $ftype, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/);
  if ($idmapfile) {
     my $xchr=$idmap{$chr};
     if ($xchr) {
       $chr=$xchr;
     }
     else {
       # report each unmapped contig once instead of once per feature line
       print STDERR "Warning: no GB ID found for contig $chr!\n" unless $nomap{$chr}++;
     }
     #$chr="gnl|WGS:".substr($xchr, 0,4)."|$chr|gb|$xchr" if $xchr;
  }
  ##next unless lc($track) eq lc($ftrack); #"maker" ?
  ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
  #$f='exon' if ($f =~m/exon$/);
  my ($fid)=($attrs=~m/\bID\s*=\s*"?([^;"]+)/);
  my ($fp)=($attrs=~m/Parent\s*=\s*"?([^;"]+)/);
  unless ($fid || $fp) {
    print STDERR "Warning: feature line $_ has neither ID nor Parent! skipping..\n";
    next;
  }
  next if $fid && $ftype =~m/gene$/; # skip genes for now! (no pseudogenes etc.)
  if ($xfile) {
     next if ($fid && exists($xclude{$fid}));
     next if ($fp && exists($xclude{$fp}));
  }
  # $ftype='mRNA' if $ftype =~m/transcript$/i;
  if ($ftype=~m/RNA$/i) { # transcript feature
    my $geneID;
    # product description: protSim, else product, else description
    my ($simprod)=($attrs=~m/protSim\s*=\s*"?([^;"]+)/);
    if (!$simprod) {
      ($simprod)=($attrs=~m/\bproduct\s*=\s*"?([^;"]+)/i);
      if (!$simprod) {
        ($simprod)=($attrs=~m/\bdescription\s*=\s*"?([^;"]+)/);
      }
    }
    my ($biotype)=($attrs=~m/\bbiotype\s*=\s*"?([^;"]+)/);
    if ($biotype && $ftype ne 'mRNA' && exists($ncRNAclass{$biotype})) {
      $ftype='ncRNA';
    }
    if ($simprod) {
      # keep only the first of several %3B (";") separated descriptions
      my @multi=split(/\%3B/, $simprod);
      $simprod=$multi[0];
      $simprod=gff_decode($simprod);
      $simprod=lcfirst($simprod) if $simprod=~m/^[A-Z][a-z]/;
    }
    my ($simacc)=($attrs=~m/protSimAcc\s*=\s*"?([^;"]+)/);
    my ($simdata)=($attrs=~m/protSimData\s*=\s*"?([^;"]+)/);
    my ($partialness)=($attrs=~m/partialness\s*=\s*"?([^;"]+)/);
    my ($eval, $psim, $qcov);
    if ($simdata) {
       ($eval)=($simdata=~m/val:([\d\.\-\+]+)/);
       ($psim)=($simdata=~m/psim:([\d\.\%]+)/);
       ($qcov)=($simdata=~m/cov:([\d\.\%]+)/);
    }
    if (!$psim) {
       ($psim)=($attrs=~m/identity\s*=\s*([\d\.\%]+)/);
    }
    if (!$qcov) {
      ($qcov)=($attrs=~m/coverage\s*=\s*([\d\.\%]+)/);
      if (!$qcov) {
        ($qcov)=($attrs=~m/prr=(\w+)/);
        if ($qcov eq 'low') { $qcov=45; }
        elsif ($qcov eq 'poor') { $qcov=25; }
      }
    }
    # with -L the locus attribute replaces Parent as the gene identifier
    if ($useLocus && $attrs=~m/locus\s*=\s*([^;]+)/) {
      $fp=$1;
    }
    
    if ($fp) {
      $geneID=$fp;
    } else {
      ($geneID)=($attrs=~m/geneID\s*=\s*"?([^;"]+)/);
    }
    #$geneID=$fid if (!$geneID && $ftype !~ '/RNA$/');
    die("Error: could not find a gene ID for transcript $fid\n") unless $geneID;
    #            0       1        2      3      4          
    my $tdata=[$chr, $strand, $fstart, $fend, $geneID, 
                  [$simprod, $simacc, $psim, $qcov, $eval], [], [], $fid, 0, $ftype, $biotype];
                  #  5                                      6   7    8    9    10      11
    ## store the transcript
    die("Error: transcript $fid already added!\n")
     if (exists($ttbl{$fid}));
    $ttbl{$fid}=$tdata;
    addToGene($geneID, $fid, $tdata, $chr, $strand, $fstart, $fend);
    next;
  } #transcript feature
  #--- non-transcript features
  if ($ftype eq 'exon' || $ftype eq 'CDS') { #only these are recognized! 
    storeFeature($ftype, $fp, $chr, $strand, $fstart, $fend);
  }
} #while <GFF>
close(GFF);

my %gdup;

#my $credits='Predicted annotation generated by mapping Oryza sativa RNAs with GMAP '.
#  ' by Geo Pertea at the Center for '.
#  ' Computational Biology, Johns Hopkins University';

# ---- Pass 3: write the feature table ---------------------------------------
open(FTBL, '>', $fileprefix.'.tbl') 
  || die("Error creating $fileprefix.tbl file!\n");
if ($gffout) {
  # NOTE(review): nothing in the visible code writes to GFFOUT, so -G currently
  # only creates an empty file.
  open(GFFOUT, '>', $fileprefix.'.gff') 
  || die("Error creating $fileprefix.gff file!\n");
}
#open(GFFOUT, ">$fileprefix.tbl.gff3") 
#   || die("Error creating $fileprefix.tbl.gff3 file!\n");
my $re;
$re = qr/\s*\(([^\)\(]+|(??{$re}))+\)/x; # matches (possibly nested) parenthesised text
my $gcounter; #global gene counter
# The .fsa written in pass 1 is opened once; every sequence carrying genes is
# then loaded through the offset/length recorded in %seqtbl.
open(FA, '<', $fileprefix.'.fsa') || die("Error opening $fileprefix.fsa!\n");
foreach $seqid (@seqlst) {
  my ($falen, $gtbl, $fsa_offset);
  my $sd=$seqtbl{$seqid};
  die("Error: could not retrieve seqtbl entry for $seqid!\n") unless $sd;
  # $ntlen and $ntseq are the globals read by setupTranscripts() and friends
  ($fsa_offset, $ntlen, $falen, $gtbl)=@$sd;
  next unless @$gtbl>0; # no genes on this sequence
  seek(FA, $fsa_offset, SEEK_SET)
    || die("Error: cannot seek to $seqid in $fileprefix.fsa!\n");
  my $nread=read(FA, $ntseq, $falen); #read contig in memory
  die("Error reading $seqid from $fileprefix.fsa!\n") unless defined($nread);
  $ntseq=~tr/\n\r//d; #clean it up
  my $len=length($ntseq);
  die("Error: $seqid seq len mismatch: $len vs $ntlen\n") if $len!=$ntlen;
  print FTBL ">Feature\t$seqid\n";
  # genes are written in coordinate order: by start, then by end
  my @srtd=sort { $main::a->[2]<=>$main::b->[2] || $main::a->[3]<=>$main::b->[3] } @$gtbl;
  writeGenes(\@srtd, \$gcounter);
}
close(FA);
# buffered write errors only surface when the handle is closed
close(FTBL) || die("Error writing $fileprefix.tbl file!\n");
close(GFFOUT) if $gffout;

###================ subroutines =================###
# writeFastaRecord($id, \$seq, $seqlen, $file_offset)
# Writes one record to the FSA handle: a defline carrying the tbl2asn source
# modifiers, followed by the sequence formatted by fastafmt().
# Also registers the record in %seqtbl as
#   $id => [offset of the sequence text in the file, $seqlen,
#           length of the formatted text, [] (genes, filled later)].
# Returns the file offset right after the record.
sub writeFastaRecord {
  my ($id, $rseq, $slen, $foffset)=@_;
  my $fahead='>'.$id;
  my ($cnum)=($id=~m/chr_?(\d+)/);
  $fahead.=" [organism=$orgname]" if $orgname;
  # report the chromosome number found in the ID (it used to be a literal 1)
  $fahead.=" [chromosome=$cnum]" if $cnum;
  $fahead.=" [moltype=DNA] [tech=wgs]";
  #$fahead.=" [gcode=1] [primary=$seqid]";
  #fahead.= " $rest" if $rest;
  $fahead.="\n";
  print FSA $fahead;
  $foffset+=length($fahead);
  my $faseq=fastafmt($rseq, $slen);
  # keyed by the ID passed in, not by whatever the global $seqid holds
  $seqtbl{$id}=[$foffset, $slen, length($faseq), [] ];
  print FSA $faseq;
  $foffset+=length($faseq);
  return $foffset;
}


# fastafmt(\$seq, $seqlen)
# Returns the sequence as text wrapped at 100 characters per line, every line
# (including the last) terminated by a newline. $seqlen determines how many
# lines are produced.
sub fastafmt {
 my ($seqref, $seqlen)=@_;
 my $nlines=int(($seqlen-1)/100)+1;
 my $template='A100' x $nlines;
 my $wrapped=join("\n", unpack($template, $$seqref));
 return $wrapped."\n";
}

# storeFeature($ftype, $parent, $chr, $strand, $start, $end)
# Adds an exon or CDS interval to its already stored parent transcript(s):
# CDS intervals go to slot 7 of the %ttbl record, anything else to slot 6.
# $parent is the raw Parent attribute value; when it is not a known
# transcript ID it is treated as a GFF3 comma-separated list of parents.
# Dies when no parent is given, when a parent is unknown, or when the
# chromosome/strand differ from the transcript's.
sub storeFeature { #add exon or CDS feature to a stored transcript
  my ($ftype, $tid, $chr, $strand, $start, $end)=@_;
  my $idx= ($ftype eq 'CDS') ? 7 : 6;
  #$gn{$gene_name}++ if $gene_name;
  die("Error: $ftype feature $chr:$start-$end has no Parent transcript!\n")
       unless defined($tid) && length($tid);
  # exact match first, so IDs that happen to contain a comma keep working
  my @parents= exists($ttbl{$tid}) ? ($tid)
                                   : grep { length($_) } split(/\s*,\s*/, $tid);
  foreach my $ptid (@parents) {
    next if exists($xclude{$ptid}); # that parent was excluded on purpose
    my $tdata=$ttbl{$ptid};
    die("Error: transcript $ptid not stored in ttbl hash!\n") unless $tdata;
    die("Error: chr/strand of $ftype $start-$end mismatch for $ptid!\n") 
         unless $chr eq $$tdata[0] && $strand eq $$tdata[1];
    # a fresh pair per parent: the intervals are edited in place later on
    push(@{$tdata->[$idx]}, [$start, $end]);
  }
}

# addToGene($gene_id, $transcript_id, $tdata, $chr, $strand, $tstart, $tend)
# Attaches a transcript record to its gene in %genetbl and widens the gene
# boundaries to include it. A gene seen for the first time is also appended
# to the gene list of its genomic sequence ($seqtbl{$chr}->[3]).
# Dies on a chromosome/strand conflict with the stored gene, or when the
# sequence is not in %seqtbl.
sub addToGene {
 my ($gid, $tid, $tdata, $chr, $strand, $tstart, $tend)=@_;
 my $gd=$genetbl{$gid};
 if ($gd) {
   my ($gchr, $gstrand, $gstart, $gend, $gts, $gene_id)=@$gd;
   die("Error: mismatch for chr and/or strand: tid=$tid ($chr $strand), geneid=$gid ($gchr $gstrand)\n")
       if ($gchr ne $chr || $gstrand ne $strand);
   die("Error: no matching geneID when retrieving gene data for tid=$tid (geneid=$gid vs $gene_id)\n")
       if ($gid ne $gene_id);
   #push(@$gts, $tid);
   push(@$gts, $tdata);
   # Update the existing record in place: the very same array is referenced
   # from the per-sequence gene list, which is later sorted by these
   # coordinates. Storing a new array in %genetbl left that list stale.
   $gd->[2]=$tstart if $tstart<$gstart;
   $gd->[3]=$tend if $tend>$gend;
 }
 else { #first time seeing this geneID
   my $gdata=[$chr, $strand, $tstart, $tend, [ $tdata ], $gid ];
   $genetbl{$gid}=$gdata;
   ## also add to the table of genes for this genomic seq
   my $sd=$seqtbl{$chr};
   die("Error: genomic sequence $chr not provided?!?\n") unless $sd && @$sd==4;
   my $gtbl=$$sd[3];
   push(@$gtbl, $gdata);
 }
}

# setupTranscripts(\@transcripts, $locus_id)
# Prepares all transcripts of one gene for output. For each transcript record
# (layout as in %ttbl) it sorts the exon and CDS intervals, validates the CDS,
# and stores the transcript partialness flags in slot 9.
# Reads the globals $ntseq / $ntlen (current genomic sequence and its length).
# Returns ($gpp, $minl, $maxr, $hasCoding):
#   $gpp       - OR of the partialness flags of all transcripts
#   $minl      - smallest first-exon start over all transcripts
#   $maxr      - largest last-exon end over all transcripts
#   $hasCoding - set to 1 when at least one transcript keeps its CDS
sub setupTranscripts {
 my ($rtbl, $locid)=@_; 
 my $gpp=0; #gene partialness: 0b01: lacking STOP codon | 0b10: lacking start codon;
 my ($minl, $maxr, $hasCoding);
 ## must return: ($gpp, $minl, $maxr, $hasCoding)
 foreach my $tbl (@$rtbl) {
    #     0       1        2     3     4         5        6      7      8    9    10         11
    my ($chr, $strand, $start, $end, $geneID, $simdata, $exons, $CDS, $tid, $pp, $rnaType, $biotype)=@$tbl;
    #make sure exon and CDS segments are properly sorted (by start, then end)
    my @exsrt=sort { $main::a->[0]<=>$main::b->[0] || $main::a->[1]<=>$main::b->[1] } @$exons;
    $exons=[@exsrt];
    # widen the gene span with this transcript's outermost exon coordinates
    $minl=$$exons[0]->[0] if ($minl==0 || $$exons[0]->[0]<$minl);
    $maxr=$$exons[-1]->[1] if ($maxr==0 || $$exons[-1]->[1]>$maxr);
    $tbl->[6]=$exons; # store the sorted list back into the record
    if ($CDS && @$CDS>0) {
      my @cdsrt=sort { $main::a->[0]<=>$main::b->[0] || $main::a->[1]<=>$main::b->[1] } @$CDS;
      $CDS=[@cdsrt];
      $tbl->[7]=$CDS;
      my $okCDS=1;
      #NCBI requires introns at least 10 nt long
      # (start of a segment minus end of the previous one must be 11 or more)
      for (my $i=1;$i<@$CDS;$i++) {
          if ($$CDS[$i]->[0]-$$CDS[$i-1]->[1]<11) {
            $okCDS=0;
            last;
          }
      }
      my $s; # spliced CDS sequence, in genomic (forward) orientation for now
      if ($okCDS) {
        map { $s.=substr($ntseq, $_->[0]-1, $_->[1]-$_->[0]+1) } @$CDS;
        $okCDS=0 if length($s)<15; #too short, we want at least 5aa
      }
      
      if ($okCDS) { #Note: we're dropping any CDS shorter than 5aa !
        $s=reverseComplement($s) if ($strand eq '-');
        # split into codons; a trailing partial codon stays shorter than 3 nt
        my @codons=unpack('(A3)*',$s);
        ###special case fix:
        if ($chr=~m/\bmito/) {
          ## mitochondrial STOP codons are different
          #if ($locid eq 'jhhlp_008894' || $locid eq 'jhhlp_008894') {
          #  print STDERR "$chr : $locid : ".$codons[-1]."\n";
          #}
          if (!exists($mitostop{$codons[-1]})) {
               $pp|=1;
               print STDERR "Warning: stop codon not found for $locid\n";
            }
        }
        else {
          # last codon is not in %stop: flag as lacking a STOP codon
          $pp|=1 if !exists($stop{$codons[-1]});
        }
        # first codon is not ATG: flag as lacking a start codon
        $pp|=2 if $codons[0] ne 'ATG';
        $gpp|=$pp;
        $tbl->[9]=$pp;
      } else {
        #aminoacid too short! (or an intron was too short)
        # the CDS is discarded, the transcript is written without one
        $CDS=[];
        $tbl->[7]=$CDS;
      }
      $hasCoding=1 if $okCDS;
    }
    else { #non-coding RNAs
     # A transcript starting within the first 9 bases of the sequence is
     # stretched to position 1 and flagged partial at that end. Only the first
     # exon and $minl are updated; $start is a local copy.
     if ($start<10) {
       $start=1;
       $$exons[0]->[0]=1;
       $minl=1 if $minl>1;
       if ($strand eq '-') {$pp|=1;}
       else { $pp|=2; }
     }
     # Same treatment for a transcript ending less than 10 bases before the
     # end of the sequence ($ntlen).
     if ($ntlen-$end<10) {
       $end=$ntlen;
       $$exons[-1]->[1]=$ntlen;
       $maxr=$ntlen if $maxr<$ntlen;
       if ($strand eq '-') {$pp|=2;}
       else { $pp|=1; }
     }
     $gpp|=$pp;
     $tbl->[9]=$pp;
   }
 }
 return ($gpp, $minl, $maxr, $hasCoding);
}

# writeGenes(\@genes, \$counter)
# Writes the gene feature, its locus_tag and all its transcripts to FTBL for
# every gene record in the list. The counter behind $rcounter is advanced for
# each gene, including the ones whose generated locus tag is in %xclude.
sub writeGenes {
  my ($gtbl, $rcounter)=@_;
  for my $gene (@$gtbl) {
    my ($strand, $transcripts)=@{$gene}[1, 4];
    my $locid=sprintf('%s_%06d', $orgcode, ++$$rcounter);
    next if exists($xclude{$locid});
    #updates partialness, product info
    my ($gpp, $minl, $maxr, $hasCoding)=setupTranscripts($transcripts, $locid);
    # features on the minus strand are listed from their right-most coordinate
    my ($from, $to)= ($strand eq '-') ? ($maxr, $minl) : ($minl, $maxr);
    $from="<$from" if ($gpp & 2);
    $to=">$to" if ($gpp & 1);
    print FTBL "$from\t$to\tgene\n";
    print FTBL "\t\t\tlocus_tag\t$locid\n";
    writeTranscripts($transcripts, $locid, $hasCoding);
  }
}

# writeExons($strand, \@segments, $partialness, $feature_type)
# Prints the interval lines of one feature to FTBL: the first line carries the
# feature type, the following ones only the coordinates. On the minus strand
# the segments are written in reverse order with start and end swapped.
# Partialness bit 2 puts '<' on the first printed coordinate, bit 1 puts '>'
# on the last one.
# Returns the summed length of the segments. The caller's segment list is
# never modified; an empty list prints nothing and returns 0.
sub writeExons {
 my ($strand, $rsegs, $pp, $ftype)=@_;
 my $len=0;
 my @segs;
 foreach my $v (@$rsegs) {
   $len += $v->[1]-$v->[0]+1;
   # work on copies: the '<' / '>' markers must not leak into shared data
   push(@segs, ($strand eq '-') ? [$v->[1], $v->[0]] : [$v->[0], $v->[1]]);
 }
 return $len unless @segs;
 @segs=reverse(@segs) if ($strand eq '-');
 $segs[0]->[0] = '<'.$segs[0]->[0] if ($pp & 2); #no START codon
 $segs[-1]->[1] = '>'.$segs[-1]->[1] if ($pp & 1); #no STOP codon
 my $first=1;
 foreach my $s (@segs) {
  if ($first) {
    print FTBL join("\t",$s->[0], $s->[1], $ftype)."\n";
    $first=0;
  }
  else {
    print FTBL join("\t",$s->[0], $s->[1])."\n";
  }
 }
 return $len;
}

# gff_decode($text)
# Returns $text with every %XX escape (XX = two hexadecimal digits) replaced
# by the character with that code. A '%' that is not followed by two hex
# digits is left untouched. undef is returned unchanged.
sub gff_decode {
 my ($v) = @_;
 return $v unless defined($v);
 #$v =~ s/\+/ /g; #url decoding would also need this
 $v =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
 return $v;
}

# writeTranscripts(\@transcripts, $locus_id, $gene_has_coding)
# Writes every transcript of one gene to FTBL: the RNA feature with its
# intervals and qualifiers (product, ncRNA_class, protein_id, transcript_id,
# note, db_xref, inference) and, when the transcript has a CDS, a CDS feature
# with product, protein_id, transcript_id and the same note.
# Transcript records have the %ttbl layout. $gene_has_coding is the flag
# returned by setupTranscripts(); a CDS-less transcript of such a gene gets
# the note "unknown coding status".
# Reads the globals $dbname and %ncRNAclass.
sub writeTranscripts {
  my ($rtbl, $locid, $hasCoding)=@_;
  my $tnum=0; # 1-based transcript number inside the gene, used in the IDs
  my $tcount=scalar(@$rtbl);
  foreach my $tbl (@$rtbl) {
    $tnum++;
    my ($chr, $strand, $start, $end, $geneID, $simdata, $exons, $CDS, $tid, $pp, $rnaType, $biotype)=@$tbl;
    my ($simprod, $simacc, $psim, $qcov, $eval)=@$simdata;
    ($start, $end)=($end, $start) if $strand eq '-';
    my $hasCDS= (@$CDS>0);
    my ($ncRNA_class, $lnc);
    if ($rnaType=~m/lnc_?RNA/) {
      $rnaType = 'ncRNA';
      $lnc=1;
    }

    if ($rnaType eq 'mRNA' && !$hasCDS) {
        $rnaType='misc_RNA'; # FIXME: should it be 'transcript' if $hasCoding ?
    }

    my $tlen=writeExons($strand, $exons, $pp, $rnaType);
    # $simid: ID used in the note/db_xref/inference lines; the transcript ID,
    # unless the product text ends in a parenthesised identifier
    my $simid=$tid;
    if ($simprod) {
        if ($simprod=~s/\.?\s*\(([\w\-\.]+)\)\s*$//) {
          $simid=$1;
        }
        # drop a trailing "(EC n.n.n.n)" annotation and whatever follows it
        $simprod=~s/\s*\(EC[ :]*\d+\.[\-\d]+\.[\-\d]+\.[\-\d]+\).*$//;
        if ($simprod=~s/^\s*similar to\s+//i) {
          $simprod.=' homolog' unless $simprod=~m/homolog/i;
        }
        $simprod=~tr/ / /s; # squeeze runs of spaces
    }
    if ($hasCDS) {
      print FTBL "\t\t\tproduct\thypothetical protein\n";
    }
    else { # lacks CDS
      my $prod="putative $rnaType";
      if ($simprod && $prod !~ m/protein/) {
         $prod=$simprod;
         $prod=~s/^(predicted|putative)\s*//;
         if ($biotype eq 'tRNA') {
           $prod=~s/^(tRNA\-\w\w\w) for anticodon \w\w\w/$1/;
         }
         #if ($prod =~m/^hypothetical / && $prod =~m/protein/) {
         #  $prod="putative $rnaType";
         #}
         #$prod="putative $simprod";
         #$prod=$simprod if ($simprod=~m/putative|predict/i);
      }
      if ($rnaType eq 'rRNA') {
          $prod=~s/\brRNA/ribosomal RNA/;
          $prod=~s/SSU/small subunit/;
          $prod=~s/LSU/large subunit/;
          $prod=~s/^\s*eukaryotic\s+//i;
      }
      # The last comparison used to read ' misc_RNA' (with a leading space)
      # and so never matched, which left this branch unreachable.
      if ($rnaType eq 'mRNA' || $rnaType eq 'transcript' || $rnaType eq 'misc_RNA') {
         $prod='putative transcript';
      }
      else {
       if ($rnaType eq 'ncRNA') {
            # class: a recognised biotype, else telomerase, else lncRNA for
            # lnc transcripts longer than 200 nt, else 'other'
            if (exists($ncRNAclass{$biotype})) {
               $ncRNA_class=$biotype;
            }
            elsif ($prod=~m/telomerase/ || $biotype=~m/telomerase/) {
               $ncRNA_class='telomerase_RNA';
            }
            elsif ($lnc && $tlen>200) {
                $ncRNA_class='lncRNA';
            }
            #print FTBL ">>>DEBUG: $tid : biotype=$biotype : class=$ncRNA_class\n";
            if (length($ncRNA_class)==0) {
               $ncRNA_class='other';
            }
            print FTBL "\t\t\tncRNA_class\t$ncRNA_class\n";
       }
      }
      if ($prod) {
        $prod=~s/^\s*similar to\s+//i;
        $prod=~s/ homolog$//i;
        print FTBL "\t\t\tproduct\t$prod\n";
      }
    } # <- does not have CDS
    if ($hasCDS) {
      print FTBL "\t\t\tprotein_id\tgnl\|$dbname|p$tnum.$locid\n";
    }
    print FTBL "\t\t\ttranscript_id\tgnl\|$dbname|t$tnum.$locid\n";
    my $note = $tcount>1 ? 'alternatively spliced' : '';
    
    if ($hasCoding && !$hasCDS) {
      #a non-coding transcript in a coding gene ! NCBI doesn't like this
      $note.= '; ' if $note;
      $note.= 'unknown coding status';
    }

    #FIXME: just for our particular case here (GMAP alignment-based annotation)
    my $simdb='RAP-DB'; #FIXME: this should be changed accordingly
    if ($simid =~ m/^ENS/) {
      $simdb='ENSEMBL';
    }
    #elsif ($simid=~m/^EPlOS/) {
    #  $db='IRGSP';
    #}

    ### -----FIXME: if the evidence is based on protein similarity, should better move this "note" block
    ### after the CDS and its protein_id and transcript_id are written below
    my $simto;
    if ($simid) {
      $simto = $simdb ? "$simdb:$simid" : $simid ;
      $simto .= ", $simprod" if $simprod;
    }
    if ($simto) {
     # $note.="similar to $simid, $simprod";
     $note.= '; ' if $note;
     $note.="similar to $simto";
    }
    #else { 
    #   #FIXME: this is NOT usually the case! (the transcript ID may not be a known protein/transcript ID)
    #   $note.="similar to $simid";
    #}
    print FTBL "\t\t\tnote\t$note\n" if $note;
    ##FIXME: these should be changed, now hard-coded for rice CG annotation
    #print FTBL "\t\t\tinference\tsimilar to RNA sequence, mRNA:$simdb:$simid\n";
    if ($simdb && $simid) { #FIXME: this just applies to rice transplanted annotation:
      print FTBL "\t\t\tdb_xref\t$simdb:$simid\n";
    }
    #print FTBL "\t\t\tinference\tsimilar to RNA sequence\n"; #FIXME: change this as needed!
    print FTBL "\t\t\tinference\talignment:GMAP:2019-03-04:$simdb:$simid\n";
    if ($hasCDS) {
      # the CDS feature repeats the IDs and the note of its mRNA
      writeExons($strand, $CDS, $pp, 'CDS');
      print FTBL "\t\t\tproduct\thypothetical protein\n";
      print FTBL "\t\t\tprotein_id\tgnl\|$dbname|p$tnum.$locid\n";
      print FTBL "\t\t\ttranscript_id\tgnl\|$dbname|t$tnum.$locid\n";
      print FTBL "\t\t\tnote\t$note\n" if $note;
    }
    # if our evidence is protein similarity, we should write the 'similar to' 
    # note here, instead of the above
  } #for each transcript
}

# checkFrame(\@segments, $reverse, \$last_codon)
# Looks for a reading frame (0, 1 or 2 skipped bases) in which the spliced
# sequence of the segments has no in-frame stop codon before its last codon.
# Segments are [start, end, frame] triples; with $reverse set each pair is
# read as [end, start] and the sequence is reverse-complemented.
# Frames are tried in this order: the frame that makes the sequence end on a
# codon boundary, the frame declared on the first segment, then the remaining
# frames in ascending order.
# On success stores the chosen frame in the first segment, stores the last
# codon of that frame through $last_codon (when given) and returns 1.
# Returns 0 when every frame contains a stop codon.
# Reads the globals $ntseq and %stop.
sub checkFrame {
 #return 1;
 my ($rx, $rev, $rlastc)=@_; # $rx = [ [$exonstart, $exonend, $frame], ... ]
 my ($s, $f);
 if ($rev) {
   foreach my $x (reverse(@$rx)) {
     $s.=substr($ntseq, $x->[1]-1, $x->[0]-$x->[1]+1);
   }
   $s=reverseComplement($s);
   }
  else {
   foreach my $x (@$rx) {
     $s.=substr($ntseq, $x->[0]-1, $x->[1]-$x->[0]+1);
   }
   }
 my %fr=( 0=>1, 1=>1, 2=>1 ); # frames not yet scheduled
 my @frames; # frames to try, in the order of priority:
 # 1st the frame that would cover the end precisely
 push(@frames,length($s)%3);
 delete $fr{$frames[0]};
 
 # 2nd - is the originally declared frame 
 $f=$$rx[0]->[2];
 $f=0 unless $f>0;
 if ($f!=$frames[0]) {
    push(@frames, $f);
    delete $fr{$f};
    }
 #add the rest of the frames to check; sorted, because hash key order is not
 #stable and would make the chosen frame vary from run to run
 push(@frames, sort { $a <=> $b } keys(%fr));

 my ($stopfound, $newf);
 my %lastcodon; # last codons in each frame
 foreach my $frame (@frames) {
   # skip $frame bases, then cut into codons
   my @c=unpack('A'.$frame.'(A3)*',$s);
   shift(@c);$lastcodon{$frame}=pop(@c);
   $stopfound=0;
   foreach my $codon (@c) { 
     if ($stop{$codon}) {
         $stopfound=1;    
         last;
         }
       }
   $newf=$frame;
   last unless $stopfound;
   }
 return 0 if ($stopfound); # Error: cannot find a "clean" frame!
 #-- new frame suggested
 $$rx[0]->[2]=$newf;
 $$rlastc=$lastcodon{$newf} if $rlastc;
 return 1;
}

# checkNs($gene_start, $gene_end)
# Returns the list of "from-to" coordinate strings of every run of two or more
# N characters found in the global $ntseq between the two 1-based positions.
# Swapped coordinates are reported on STDERR and put in order first.
sub checkNs {
 my ($gs, $ge)=@_;
 my $encname='NOPE';
 if ($ge<$gs) {
  print STDERR "Warning ($defpre, $encname): genestart>geneend coordinates!\n";
  ($gs, $ge)=($ge, $gs);
  }
 my $region=substr($ntseq, $gs-1, $ge-$gs+1);
 my @gaps;
 # @- / @+ hold the 0-based start and the end offset of the current match
 while ($region=~m/N{2,}/g) {
   push(@gaps, ($gs+$-[0]).'-'.($gs+$+[0]-1));
   }
 return @gaps;
}

# getSequencingGaps()
# Returns a list of [start, end] pairs (1-based, inclusive), one for every run
# of two or more N characters in the global $ntseq.
sub getSequencingGaps {
 my @gaps;
 # @- / @+ hold the 0-based start and the end offset of the current match
 while ($ntseq=~m/N{2,}/g) {
   push(@gaps, [$-[0]+1, $+[0]]);
   }
 return @gaps;
}

# checkStop(\@segments, $reverse, \$last_codon)
# Builds the spliced sequence of the segments ([start, end, frame] triples;
# with $reverse set each pair is read as [end, start] and the sequence is
# reverse-complemented) and scans it, in the frame declared on the first
# segment, for a stop codon located before the last codon.
# Stores the last codon through $last_codon when that reference is given.
# Returns 1 when such a premature stop codon exists, undef otherwise.
# Reads the globals $ntseq and %stop.
sub checkStop {
 my ($rx, $rev, $rlastc)=@_; # ref to list of [$exonstart, $exonend, $frame]
 my $s;
 if ($rev) {
   foreach my $x (reverse(@$rx)) {
     $s.=substr($ntseq, $x->[1]-1, $x->[0]-$x->[1]+1);
   }
   $s=reverseComplement($s);
   }
  else {
   foreach my $x (@$rx) {
     $s.=substr($ntseq, $x->[0]-1, $x->[1]-$x->[0]+1);
   }
   }
 if (length($s)<=3) {
   # report real coordinates; the message used to print array references
   my $from= @$rx ? $$rx[0]->[0] : '?';
   my $to= @$rx ? $$rx[-1]->[1] : '?';
   print STDERR "ERROR: invalid sequence length at $from-$to\n";
 }
 # frame = number of bases to skip before the first full codon
 my $f=$$rx[0]->[2];
 $f=0 unless $f>0;
 my @c=unpack('A'.$f.'(A3)*',$s);
 shift(@c); # the skipped bases
 my $lastcodon=pop(@c);
 $$rlastc=$lastcodon if $rlastc;
 my $stopfound;
 foreach my $codon (@c) {
   if ($stop{$codon}) {
     $stopfound=1;
     last;
   }
 }
 return $stopfound;
 }

# reverseComplement($seq)
# Returns the reverse complement of a nucleotide string. Case is preserved,
# U is complemented to A, and the IUPAC codes M R W S Y K V H D B are mapped
# to their complements; any other character is only reversed.
sub reverseComplement {
  my ($seq)=@_;
  my $rc=scalar reverse($seq);
  $rc =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
  return $rc;
 }


# up2gb($uniprot_id)
# Returns 'UP2GB_' followed by the ID, cut at the first character that is not
# a word character or a dot. The conversion is traced on STDERR when the
# global $debug is set.
sub up2gb {
 my ($uid)=@_;
 my ($first)=(('UP2GB_'.$uid)=~m/^([\w\.]+)/);
 print STDERR "    up2gb:  $uid -> '$first'\n" if $debug;
 #unless ($first) {
 #  print STDERR "WARNING: no GB accession returned for UniProt $uid!".
 #    "($sql)\n";
 #  }
 return $first; 
}