Skip to content

Commit

Permalink
remove --no-cds flasg for all commands except create and go, no_cds f…
Browse files Browse the repository at this point in the history
…lag is written to anchor files for convenience, fix combine of no_cds anchor files
  • Loading branch information
trichter committed Sep 2, 2024
1 parent e5f258b commit 9fb1c73
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 31 deletions.
33 changes: 18 additions & 15 deletions anchorna/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ def _cmd_go(fname, fname_anchor, pbar=True, continue_with=None,

def _cmd_print(fname_anchor, verbose=False, mode='aa'):
anchors = load_selected_anchors(fname_anchor)
if anchors.no_cds:
mode = 'aa'
try:
print(anchors.tostr(verbose=verbose, mode=mode))
except BrokenPipeError:
Expand All @@ -178,6 +180,8 @@ def _cmd_load(fname_anchor):
def _cmd_export(fname_anchor, out, mode='aa', score_use_fluke=None, fmt='gff'):
assert mode in ('nt', 'cds', 'aa')
anchors = load_selected_anchors(fname_anchor)
if anchors.no_cds:
mode = 'aa'
if fmt == 'jalview':
outstr = jalview_features(anchors, mode=mode, score_use_fluke=score_use_fluke)
if out is None:
Expand All @@ -191,27 +195,31 @@ def _cmd_export(fname_anchor, out, mode='aa', score_use_fluke=None, fmt='gff'):
anchors.write(out, mode=mode)


def _cmd_view(fname_anchor, fname, mode='aa', no_cds=False, align=None, score_use_fluke=None):
def _cmd_view(fname_anchor, fname, mode='aa', align=None, score_use_fluke=None):
assert mode in ('nt', 'cds', 'aa')
with tempfile.TemporaryDirectory(prefix='anchorna') as tmpdir:
fname_export = Path(tmpdir) / 'jalview_features.txt'
fnameseq = Path(tmpdir) / 'aa_or_seq_or_cds.fasta'
_cmd_export(fname_anchor, fname_export, mode=mode, score_use_fluke=score_use_fluke, fmt='jalview')
seqs = read(fname)
anchors = read_anchors(fname_anchor)
if mode != 'nt' and not no_cds:
if anchors.no_cds:
mode = 'aa'
if mode != 'nt' and not anchors.no_cds:
offsets = {f.seqid: f.offset for anchor in anchors for f in anchor}
if all(seq.fts.get('cds') for seq in seqs):
offsets_cds = {seq.id: seq.fts.get('cds').loc.start for seq in seqs}
else:
offsets_cds = None
if offsets_cds == offsets:
seqs = seqs[:, 'cds']
if mode == 'aa':
seqs = seqs.translate(complete=True, final_stop=False)
else:
for seq in seqs:
seq.data = seq.data[offsets[seq.id]:]
if mode == 'aa':
seqs = seqs.translate(check_start=False, complete=True)
if mode == 'aa':
seqs = seqs.translate(complete=True)
if align:
anchor = anchors[int(align.lower().removeprefix('a'))]
start = max(_apply_mode(fluke.start, fluke.offset, mode=mode) for fluke in anchor)
Expand All @@ -226,12 +234,12 @@ def _cmd_combine(fname_anchor, out):
anchors = combine(lot_of_anchors)
anchors.write(out)

def _cmd_cutout(fname, fname_anchor, pos1, pos2, out, fmt, mode='nt', score_use_fluke=None, no_cds=False):
def _cmd_cutout(fname, fname_anchor, pos1, pos2, out, fmt, mode='nt', score_use_fluke=None):
assert mode in ('nt', 'cds', 'aa')
if no_cds:
mode = 'aa'
seqs = read(fname)
anchors = load_selected_anchors(fname_anchor)
if anchors.no_cds:
mode = 'aa'
seqs2 = cutout(seqs, anchors, pos1, pos2, mode=mode, score_use_fluke=score_use_fluke)
if out is None:
print(seqs2.tofmtstr(fmt or 'fasta'))
Expand All @@ -256,7 +264,7 @@ def run(command, conf=None, pdb=False, **args):
conf = {}
if command in ('export', 'view', 'cutout'):
# ignore all config settings except the following
conf = {k: conf[k] for k in ('fname', 'score_use_fluke', 'no_cds') if k in conf}
conf = {k: conf[k] for k in ('fname', 'score_use_fluke') if k in conf}
# Populate args with conf, but prefer args
conf.update(args)
args = conf
Expand All @@ -268,11 +276,6 @@ def info(type, value, tb):
print()
pdb.pm()
sys.excepthook = info
# for no_cds, commands create, go, view and cutout expect this argument,
# for other commands the default mode is already the correct one
if getattr(args, 'pop' if command == 'export' else 'get')('no_cds', False):
if 'mode' in args:
raise ValueError('Mode may not be specified with --no-cds option')
try:
globals()[f'_cmd_{command}'](**args)
except KeyError:
Expand Down Expand Up @@ -357,7 +360,7 @@ def run_cmdline(cmd_args=None):
if p != p_export:
g.add_argument('--fname', default=argparse.SUPPRESS)
g.add_argument('--score-use-fluke', default=argparse.SUPPRESS, type=int)
g.add_argument('--no-cds', action='store_true', default=argparse.SUPPRESS)
# g.add_argument('--no-cds', action='store_true', default=argparse.SUPPRESS)

p_export.add_argument('fname_anchor', help='anchor file name')
p_export.add_argument('-o', '--out', help='output file name (by default prints to stdout)')
Expand All @@ -384,7 +387,7 @@ def run_cmdline(cmd_args=None):
default = 'nt' if p == p_cutout else 'aa'
msg = ('choose mode, nt: relative to original sequence, '
'cds: accounting for offset (usually coding sequence), aa: translated cds '
f'(default: {default})')
f'(default: {default}), for anchors calculated with no_cds option, mode is ignored')
p.add_argument('-m', '--mode', default=argparse.SUPPRESS, choices=choices, help=msg)

# Get command line arguments and start run function
Expand Down
18 changes: 10 additions & 8 deletions anchorna/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,13 @@ def find_my_anchors(seqs, remove=True, aggressive_remove=True,
Find and return anchors in CDS region of nucleotide sequences
"""
if continue_with is None:
all_offset = all('offset' in seq.meta for seq in seqs)
if no_cds:
aas = seqs
all_offset = all('offset' in seq.meta for seq in seqs)
if not all_offset:
for aa in aas:
aa.meta.offset = 0
else:
all_offset = all('offset' in seq.meta for seq in seqs)
all_cds = all(seq.fts.get('cds') for seq in seqs)
if all_offset:
log.info('Found offsets in sequence file, translate full sequence')
Expand All @@ -213,16 +212,14 @@ def find_my_anchors(seqs, remove=True, aggressive_remove=True,
log.info(f'Found {len(anchors)} anchors')
anchors = anchors.merge_neighbor_anchors()
log.info(f'Merged into {len(anchors)} anchors')
anchors.no_cds = no_cds
else:
anchors = continue_with
if remove:
removed_anchors = anchors.remove_contradicting_anchors(aggressive=aggressive_remove)
log.info(f'After removal of contradicting anchors {len(anchors)} anchors left')
else:
removed_anchors = None
if no_cds:
anchors.no_cds = True
removed_anchors.no_cds = True
return anchors.sort(), removed_anchors


Expand Down Expand Up @@ -314,6 +311,11 @@ def combine(lot_of_anchors):
"""
anchors = set()
offsets = {f.seqid: f.offset for anchor in lot_of_anchors[0] for f in anchor}
no_cds_set = set(anchors.no_cds for anchors in lot_of_anchors)
if len(no_cds_set) > 1:
raise ValueError('Some anchors calculated with no_cds option, some without')
no_cds = no_cds_set.pop()
div = 1 if no_cds else 3
for nans in lot_of_anchors:
if len(set(nans) & anchors) > 0:
ids = ', '.join(a.id for a in nans & anchors)
Expand All @@ -322,7 +324,7 @@ def combine(lot_of_anchors):
for f in anchor:
doff = f.offset - offsets[f.seqid]
f.offset = offsets[f.seqid]
f.start += doff // 3
f.stop += doff // 3
f.start += doff // div
f.stop += doff // div
anchors |= set(nans)
return AnchorList(anchors).sort()
return AnchorList(anchors, no_cds=no_cds).sort()
14 changes: 10 additions & 4 deletions anchorna/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,14 @@ def write_anchors(anchors, fname, mode=None):
offsets = {ft.seqid: ft.meta._gff.pop('offset') for ft in fts}
offsets_header = ''.join(f'#offset {seqid} {offset}\n' for seqid, offset in offsets.items())
if mode is None:
header_cds = '#no_cds\n' if anchors.no_cds else (
'# Indices are given for amino acids, the offset specifies the offset of index 0 from this file\n'
'# to the beginning of the original sequence (in nucleotides).\n'
)

header = (
f'#AnchoRNA anchor file\n'
f'# written with AnchoRNA v{__version__}\n'
'# Indices are given for amino acids, the offset specifies the offset of index 0 from this file\n'
'# to the beginning of the original sequence (in nucleotides).\n' + offsets_header)
f'# written with AnchoRNA v{__version__}\n' + header_cds + offsets_header)
else:
header = f'# Anchors exported by AnchoRNA v{__version__} with mode {mode}\n'
if fname is None:
Expand All @@ -52,21 +55,24 @@ def read_anchors(fname, check_header=True):
comments = []
fts = read_fts(fname, 'gff', comments=comments)
offsets = {}
no_cds = False
for i, line in enumerate(comments):
if i == 0:
if check_header and not line.startswith('##gff-version 3'):
raise IOError(f'{fname} not a valid GFF file')
elif i == 1:
if check_header and not line.startswith('#AnchoRNA'):
raise IOError(f'{fname} not a valid anchor file')
elif line.startswith('#no_cds'):
no_cds = True
elif line.startswith('#offset'):
seqid, offset = line.split()[1:]
offsets[seqid] = int(offset)
elif line.startswith('#'):
continue
for ft in fts:
ft.meta._gff.offset = offsets[ft.seqid]
return fts2anchors(fts)
return fts2anchors(fts, no_cds=no_cds)


def _parse_selection(anchors, selection):
Expand Down
12 changes: 12 additions & 0 deletions anchorna/tests/test_anchorna.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,18 @@ def test_anchorna_workflow_subset_no_cds():
fname_seqs = tmpdir / 'pesti_example.gff'
assert '' == check('anchorna create --tutorial-subset --no-cds')
assert '' == check('anchorna go --no-pbar anchors.gff')
anchors1 = read_anchors('anchors.gff')
assert anchors1.no_cds

assert '' == check('anchorna create --tutorial-subset')
assert '' == check('anchorna go --no-pbar anchors_cds.gff')
anchors2 = read_anchors('anchors_cds.gff')
anchors2.no_cds = True
for anchor in anchors2:
for fluke in anchor:
fluke.offset = 0
assert anchors1 == anchors2

assert 'A11' in check('anchorna print anchors.gff')
assert 'F1' in check('anchorna print anchors.gff -v')
out1 = check('anchorna combine anchors.gff|a5:a10|a8')
Expand Down
19 changes: 15 additions & 4 deletions anchorna/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,10 @@ def contradicts(self, a2, aggressive=True):


class AnchorList(collections.UserList):
def __init__(self, data=None):
def __init__(self, data=None, no_cds=False):
super().__init__(data)
if no_cds:
self._no_cds = True

def __str__(self):
return self.tostr()
Expand All @@ -189,6 +191,15 @@ def _repr_pretty_(self, p, cycle):
else:
p.text(str(self))

@property
def no_cds(self):
return getattr(self, '_no_cds', False)

@no_cds.setter
def no_cds(self, value):
if value:
self._no_cds = True

def tostr(self, verbose=False, mode='aa'):
return '\n'.join(a.tostr(i=i, verbose=verbose, mode=mode) for i, a in enumerate(self))

Expand Down Expand Up @@ -232,7 +243,7 @@ def remove_contradicting_anchors(self, aggressive=True):
f'keep anchor {a1.guide.start}+{a1.guide.len} with min score {a1.minscore}')
remove_anchors.add(a2)
self.data = [anchor for anchor in self if anchor not in remove_anchors]
return AnchorList(remove_anchors).sort()
return AnchorList(remove_anchors, no_cds=self.no_cds).sort()

def convert2fts(self):
return anchors2fts(self)
Expand Down Expand Up @@ -272,7 +283,7 @@ def anchors2fts(anchors):
return FeatureList(fts)


def fts2anchors(fts):
def fts2anchors(fts, no_cds=False):
"""
Convert sugar.FeatureList object to anchors
Expand Down Expand Up @@ -302,4 +313,4 @@ def fts2anchors(fts):
warn(f'Cannot convert feature of type {ft.type}')
if len(flukes) > 0:
anchors.append(Anchor(flukes, gseqid=gseqid))
return AnchorList(anchors)
return AnchorList(anchors, no_cds=no_cds)

0 comments on commit 9fb1c73

Please sign in to comment.