remove --no-cds flasg for all commands except create and go, no_cds f…

…lag is written to anchor files for convenience, fix combine of no_cds anchor files
rnajena · Sep 2, 2024 · 9fb1c73 · 9fb1c73
1 parent e5f258b
commit 9fb1c73
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 31 deletions.
diff --git a/anchorna/cli.py b/anchorna/cli.py
@@ -167,6 +167,8 @@ def _cmd_go(fname, fname_anchor, pbar=True, continue_with=None,
 
 def _cmd_print(fname_anchor, verbose=False, mode='aa'):
     anchors = load_selected_anchors(fname_anchor)
+    if anchors.no_cds:
+        mode = 'aa'
     try:
         print(anchors.tostr(verbose=verbose, mode=mode))
     except BrokenPipeError:
@@ -178,6 +180,8 @@ def _cmd_load(fname_anchor):
 def _cmd_export(fname_anchor, out, mode='aa', score_use_fluke=None, fmt='gff'):
     assert mode in ('nt', 'cds', 'aa')
     anchors = load_selected_anchors(fname_anchor)
+    if anchors.no_cds:
+        mode = 'aa'
     if fmt == 'jalview':
         outstr = jalview_features(anchors, mode=mode, score_use_fluke=score_use_fluke)
         if out is None:
@@ -191,27 +195,31 @@ def _cmd_export(fname_anchor, out, mode='aa', score_use_fluke=None, fmt='gff'):
         anchors.write(out, mode=mode)
 
 
-def _cmd_view(fname_anchor, fname, mode='aa', no_cds=False, align=None, score_use_fluke=None):
+def _cmd_view(fname_anchor, fname, mode='aa', align=None, score_use_fluke=None):
     assert mode in ('nt', 'cds', 'aa')
     with tempfile.TemporaryDirectory(prefix='anchorna') as tmpdir:
         fname_export = Path(tmpdir) / 'jalview_features.txt'
         fnameseq = Path(tmpdir) / 'aa_or_seq_or_cds.fasta'
         _cmd_export(fname_anchor, fname_export, mode=mode, score_use_fluke=score_use_fluke, fmt='jalview')
         seqs = read(fname)
         anchors = read_anchors(fname_anchor)
-        if mode != 'nt' and not no_cds:
+        if anchors.no_cds:
+            mode = 'aa'
+        if mode != 'nt' and not anchors.no_cds:
             offsets = {f.seqid: f.offset for anchor in anchors for f in anchor}
             if all(seq.fts.get('cds') for seq in seqs):
                 offsets_cds = {seq.id: seq.fts.get('cds').loc.start for seq in seqs}
             else:
                 offsets_cds = None
             if offsets_cds == offsets:
                 seqs = seqs[:, 'cds']
+                if mode == 'aa':
+                    seqs = seqs.translate(complete=True, final_stop=False)
             else:
                 for seq in seqs:
                     seq.data = seq.data[offsets[seq.id]:]
-            if mode == 'aa':
-                seqs = seqs.translate(check_start=False, complete=True)
+                if mode == 'aa':
+                    seqs = seqs.translate(complete=True)
         if align:
             anchor = anchors[int(align.lower().removeprefix('a'))]
             start = max(_apply_mode(fluke.start, fluke.offset, mode=mode) for fluke in anchor)
@@ -226,12 +234,12 @@ def _cmd_combine(fname_anchor, out):
     anchors = combine(lot_of_anchors)
     anchors.write(out)
 
-def _cmd_cutout(fname, fname_anchor, pos1, pos2, out, fmt, mode='nt', score_use_fluke=None, no_cds=False):
+def _cmd_cutout(fname, fname_anchor, pos1, pos2, out, fmt, mode='nt', score_use_fluke=None):
     assert mode in ('nt', 'cds', 'aa')
-    if no_cds:
-        mode = 'aa'
     seqs = read(fname)
     anchors = load_selected_anchors(fname_anchor)
+    if anchors.no_cds:
+        mode = 'aa'
     seqs2 = cutout(seqs, anchors, pos1, pos2, mode=mode, score_use_fluke=score_use_fluke)
     if out is None:
         print(seqs2.tofmtstr(fmt or 'fasta'))
@@ -256,7 +264,7 @@ def run(command, conf=None, pdb=False, **args):
         conf = {}
     if command in ('export', 'view', 'cutout'):
         # ignore all config settings except the following
-        conf = {k: conf[k] for k in ('fname', 'score_use_fluke', 'no_cds') if k in conf}
+        conf = {k: conf[k] for k in ('fname', 'score_use_fluke') if k in conf}
     # Populate args with conf, but prefer args
     conf.update(args)
     args = conf
@@ -268,11 +276,6 @@ def info(type, value, tb):
             print()
             pdb.pm()
         sys.excepthook = info
-    # for no_cds, commands create, go, view and cutout expect this argument,
-    # for other commands the default mode is already the correct one
-    if getattr(args, 'pop' if command == 'export' else 'get')('no_cds', False):
-        if 'mode' in args:
-            raise ValueError('Mode may not be specified with --no-cds option')
     try:
         globals()[f'_cmd_{command}'](**args)
     except KeyError:
@@ -357,7 +360,7 @@ def run_cmdline(cmd_args=None):
         if p != p_export:
             g.add_argument('--fname', default=argparse.SUPPRESS)
         g.add_argument('--score-use-fluke', default=argparse.SUPPRESS, type=int)
-        g.add_argument('--no-cds', action='store_true', default=argparse.SUPPRESS)
+        # g.add_argument('--no-cds', action='store_true', default=argparse.SUPPRESS)
 
     p_export.add_argument('fname_anchor', help='anchor file name')
     p_export.add_argument('-o', '--out', help='output file name (by default prints to stdout)')
@@ -384,7 +387,7 @@ def run_cmdline(cmd_args=None):
         default = 'nt' if p == p_cutout else 'aa'
         msg = ('choose mode, nt: relative to original sequence, '
                'cds: accounting for offset (usually coding sequence), aa: translated cds '
-               f'(default: {default})')
+               f'(default: {default}), for anchors calculated with no_cds option, mode is ignored')
         p.add_argument('-m', '--mode', default=argparse.SUPPRESS, choices=choices, help=msg)
 
     # Get command line arguments and start run function

diff --git a/anchorna/core.py b/anchorna/core.py
@@ -185,14 +185,13 @@ def find_my_anchors(seqs, remove=True, aggressive_remove=True,
     Find and return anchors in CDS region of nucleotide sequences
     """
     if continue_with is None:
+        all_offset = all('offset' in seq.meta for seq in seqs)
         if no_cds:
             aas = seqs
-            all_offset = all('offset' in seq.meta for seq in seqs)
             if not all_offset:
                 for aa in aas:
                     aa.meta.offset = 0
         else:
-            all_offset = all('offset' in seq.meta for seq in seqs)
             all_cds = all(seq.fts.get('cds') for seq in seqs)
             if all_offset:
                 log.info('Found offsets in sequence file, translate full sequence')
@@ -213,16 +212,14 @@ def find_my_anchors(seqs, remove=True, aggressive_remove=True,
         log.info(f'Found {len(anchors)} anchors')
         anchors = anchors.merge_neighbor_anchors()
         log.info(f'Merged into {len(anchors)} anchors')
+        anchors.no_cds = no_cds
     else:
         anchors = continue_with
     if remove:
         removed_anchors = anchors.remove_contradicting_anchors(aggressive=aggressive_remove)
         log.info(f'After removal of contradicting anchors {len(anchors)} anchors left')
     else:
         removed_anchors = None
-    if no_cds:
-        anchors.no_cds = True
-        removed_anchors.no_cds = True
     return anchors.sort(), removed_anchors
 
 
@@ -314,6 +311,11 @@ def combine(lot_of_anchors):
     """
     anchors = set()
     offsets = {f.seqid: f.offset for anchor in lot_of_anchors[0] for f in anchor}
+    no_cds_set = set(anchors.no_cds for anchors in lot_of_anchors)
+    if len(no_cds_set) > 1:
+        raise ValueError('Some anchors calculated with no_cds option, some without')
+    no_cds = no_cds_set.pop()
+    div = 1 if no_cds else 3
     for nans in lot_of_anchors:
         if len(set(nans) & anchors) > 0:
             ids = ', '.join(a.id for a in nans & anchors)
@@ -322,7 +324,7 @@ def combine(lot_of_anchors):
             for f in anchor:
                 doff = f.offset - offsets[f.seqid]
                 f.offset = offsets[f.seqid]
-                f.start += doff // 3
-                f.stop += doff // 3
+                f.start += doff // div
+                f.stop += doff // div
         anchors |= set(nans)
-    return AnchorList(anchors).sort()
+    return AnchorList(anchors, no_cds=no_cds).sort()
diff --git a/anchorna/io.py b/anchorna/io.py
@@ -27,11 +27,14 @@ def write_anchors(anchors, fname, mode=None):
     offsets = {ft.seqid: ft.meta._gff.pop('offset') for ft in fts}
     offsets_header = ''.join(f'#offset {seqid} {offset}\n' for seqid, offset in offsets.items())
     if mode is None:
+        header_cds = '#no_cds\n' if anchors.no_cds else (
+            '# Indices are given for amino acids, the offset specifies the offset of index 0 from this file\n'
+            '# to the beginning of the original sequence (in nucleotides).\n'
+            )
+
         header = (
             f'#AnchoRNA anchor file\n'
-            f'# written with AnchoRNA v{__version__}\n'
-            '# Indices are given for amino acids, the offset specifies the offset of index 0 from this file\n'
-            '# to the beginning of the original sequence (in nucleotides).\n' + offsets_header)
+            f'# written with AnchoRNA v{__version__}\n' + header_cds + offsets_header)
     else:
         header = f'# Anchors exported by AnchoRNA v{__version__} with mode {mode}\n'
     if fname is None:
@@ -52,21 +55,24 @@ def read_anchors(fname, check_header=True):
     comments = []
     fts = read_fts(fname, 'gff', comments=comments)
     offsets = {}
+    no_cds = False
     for i, line in enumerate(comments):
         if i == 0:
             if check_header and not line.startswith('##gff-version 3'):
                 raise IOError(f'{fname} not a valid GFF file')
         elif i == 1:
             if check_header and not line.startswith('#AnchoRNA'):
                 raise IOError(f'{fname} not a valid anchor file')
+        elif line.startswith('#no_cds'):
+            no_cds = True
         elif line.startswith('#offset'):
             seqid, offset = line.split()[1:]
             offsets[seqid] = int(offset)
         elif line.startswith('#'):
             continue
     for ft in fts:
         ft.meta._gff.offset = offsets[ft.seqid]
-    return fts2anchors(fts)
+    return fts2anchors(fts, no_cds=no_cds)
 
 
 def _parse_selection(anchors, selection):

diff --git a/anchorna/tests/test_anchorna.py b/anchorna/tests/test_anchorna.py
@@ -191,6 +191,18 @@ def test_anchorna_workflow_subset_no_cds():
         fname_seqs = tmpdir / 'pesti_example.gff'
         assert '' == check('anchorna create --tutorial-subset --no-cds')
         assert '' == check('anchorna go --no-pbar anchors.gff')
+        anchors1 = read_anchors('anchors.gff')
+        assert anchors1.no_cds
+
+        assert '' == check('anchorna create --tutorial-subset')
+        assert '' == check('anchorna go --no-pbar anchors_cds.gff')
+        anchors2 = read_anchors('anchors_cds.gff')
+        anchors2.no_cds = True
+        for anchor in anchors2:
+            for fluke in anchor:
+                fluke.offset = 0
+        assert anchors1 == anchors2
+
         assert 'A11' in check('anchorna print anchors.gff')
         assert 'F1' in check('anchorna print anchors.gff -v')
         out1 = check('anchorna combine anchors.gff|a5:a10|a8')

diff --git a/anchorna/util.py b/anchorna/util.py
@@ -177,8 +177,10 @@ def contradicts(self, a2, aggressive=True):
 
 
 class AnchorList(collections.UserList):
-    def __init__(self, data=None):
+    def __init__(self, data=None, no_cds=False):
         super().__init__(data)
+        if no_cds:
+            self._no_cds = True
 
     def __str__(self):
         return self.tostr()
@@ -189,6 +191,15 @@ def _repr_pretty_(self, p, cycle):
         else:
             p.text(str(self))
 
+    @property
+    def no_cds(self):
+        return getattr(self, '_no_cds', False)
+
+    @no_cds.setter
+    def no_cds(self, value):
+        if value:
+            self._no_cds = True
+
     def tostr(self, verbose=False, mode='aa'):
         return '\n'.join(a.tostr(i=i, verbose=verbose, mode=mode) for i, a in enumerate(self))
 
@@ -232,7 +243,7 @@ def remove_contradicting_anchors(self, aggressive=True):
                               f'keep anchor {a1.guide.start}+{a1.guide.len} with min score {a1.minscore}')
                     remove_anchors.add(a2)
         self.data = [anchor for anchor in self if anchor not in remove_anchors]
-        return AnchorList(remove_anchors).sort()
+        return AnchorList(remove_anchors, no_cds=self.no_cds).sort()
 
     def convert2fts(self):
         return anchors2fts(self)
@@ -272,7 +283,7 @@ def anchors2fts(anchors):
     return FeatureList(fts)
 
 
-def fts2anchors(fts):
+def fts2anchors(fts, no_cds=False):
     """
     Convert sugar.FeatureList object to anchors
 
@@ -302,4 +313,4 @@ def fts2anchors(fts):
             warn(f'Cannot convert feature of type {ft.type}')
     if len(flukes) > 0:
         anchors.append(Anchor(flukes, gseqid=gseqid))
-    return AnchorList(anchors)
+    return AnchorList(anchors, no_cds=no_cds)