From 91c1e09a9edceeeca9e48057c3a9bcc93e273df3 Mon Sep 17 00:00:00 2001 From: Tom Eulenfeld Date: Tue, 1 Oct 2024 17:14:30 +0200 Subject: [PATCH] add export to Dialign anchor files --- CHANGELOG | 1 + anchorna/cli.py | 32 ++++++++++++++++++++++---------- anchorna/io.py | 30 ++++++++++++++++++++++++++++-- anchorna/tests/test_anchorna.py | 4 ++++ 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 205a258..fcc4419 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,7 @@ dev: * add multiprocessing option (--njobs) * add aggressive mode when removing contradicting anchors (now the default) * add export to GFF files + * add export to Dialign anchor files * change names of some options * add no_cds configuration parameter and --no-cds command line flag, used to find anchors in non-coding and amino acid sequences diff --git a/anchorna/cli.py b/anchorna/cli.py index b19534d..7af2c58 100644 --- a/anchorna/cli.py +++ b/anchorna/cli.py @@ -19,7 +19,7 @@ from sugar import read from anchorna.core import combine, cutout, find_my_anchors -from anchorna.io import jalview_features, read_anchors, load_selected_anchors +from anchorna.io import export_dialign, export_jalview, read_anchors, load_selected_anchors from anchorna.util import _apply_mode @@ -180,13 +180,20 @@ def _cmd_print(fname_anchor, verbose=False, mode='aa'): def _cmd_load(fname_anchor): _start_ipy(read_anchors(fname_anchor)) -def _cmd_export(fname_anchor, out, mode='aa', score_use_fluke=None, fmt='gff'): +def _cmd_export(fname_anchor, out, mode='aa', score_use_fluke=None, fmt='gff', + fname=None): assert mode in ('nt', 'cds', 'aa') anchors = load_selected_anchors(fname_anchor) if anchors.no_cds: mode = 'aa' - if fmt == 'jalview': - outstr = jalview_features(anchors, mode=mode, score_use_fluke=score_use_fluke) + if fmt in ('jalview', 'dialign'): + if fmt == 'jalview': + outstr = export_jalview(anchors, mode=mode, score_use_fluke=score_use_fluke) + else: + if fname is None: + raise ValueError('--fname option missing for Dialign export') + seqs = read(fname) + outstr = export_dialign(anchors, seqs.ids, mode=mode, score_use_fluke=score_use_fluke) if out is None: print(outstr) else: @@ -318,7 +325,7 @@ def run_cmdline(cmd_args=None): p_go = sub.add_parser('go', help=(msg:='find anchors and write them into anchor file'), description=msg) p_print = sub.add_parser('print', help=(msg:='print contents of anchor file'), description=msg) p_load = sub.add_parser('load', help=(msg:='load anchors into IPython session'), description=msg) - p_export = sub.add_parser('export', help=(msg:='export anchors to gff file (possibly using different mode than aa) or a JalView feature file'), description=msg) + p_export = sub.add_parser('export', help=(msg:='export anchors to gff file (possibly using different mode than aa), JalView feature file or Dialign anchor file'), description=msg) p_view = sub.add_parser('view', help=(msg:='view anchors in JalView (i.e. call export and start JalView)'), description=msg) msg = 'combine (and select or remove) anchors from one file or different files' msg2 = ('An expression consists of a file name fname_anchor or fname_anchor|selection to select anchors or fname_anchor||expression to remove anchors, or fname|selection|expression. ' @@ -344,7 +351,7 @@ def run_cmdline(cmd_args=None): p_cutout = sub.add_parser('cutout', help=msg, description=msg, epilog=msg2) - for p in (p_create, p_go, p_view, p_cutout): + for p in (p_create, p_go, p_export, p_view, p_cutout): p.add_argument('-c', '--conf', default='anchorna.conf', help='configuration file to use (default: anchorna.conf)') p_create.add_argument('--tutorial', action='store_true', help='copy GFF sequence file for tutorial') p_create.add_argument('--no-cds', action='store_true', help='add the no_cds line to configuration, effect: directly use aa or nucleotide ' @@ -368,15 +375,20 @@ def run_cmdline(cmd_args=None): g.add_argument('--no-cds', action='store_true', default=argparse.SUPPRESS) for p in (p_export, p_view, p_cutout): g = p.add_argument_group('optional arguments', description='Use these flags to overwrite values in the config file.') - if p != p_export: - g.add_argument('--fname', default=argparse.SUPPRESS) + g.add_argument('--fname', default=argparse.SUPPRESS) g.add_argument('--score-use-fluke', default=argparse.SUPPRESS, type=int) # g.add_argument('--no-cds', action='store_true', default=argparse.SUPPRESS) p_export.add_argument('fname_anchor', help='anchor file name') p_export.add_argument('-o', '--out', help='output file name (by default prints to stdout)') - p_export.add_argument('--fmt', help='format for export, default gff', default='gff', - choices=('gff', 'jalview')) + msg = ( + 'format for export, default gff, ' + 'for Dialign export, ' + 'please specify sequence filename you want to align with Dialign with --fname option, ' + 'the sequence order is needed for the Dialign export, default fname from config file' + ) + p_export.add_argument('--fmt', help=msg, default='gff', + choices=('gff', 'jalview', 'dialign')) p_view.add_argument('fname_anchor', help='anchor file name') p_view.add_argument('--align', help='align sequences at given anchor') p_combine.add_argument('fname_anchor', nargs='+', help='anchor file name') diff --git a/anchorna/io.py b/anchorna/io.py index c86315a..564ddbc 100644 --- a/anchorna/io.py +++ b/anchorna/io.py @@ -160,14 +160,40 @@ def write_json(obj, fname): json.dump(obj, f, cls=_AnchorJSONEncoder) +def export_dialign(anchors, seqids, mode='aa', score_use_fluke=None): + """ + Export anchors to Dialign anchor file + """ + assert mode in ('nt', 'cds', 'aa') + sortkey_score = lambda f: f.score + sortkey_ids = lambda f: seqids.index(f.seqid) + content = [] + for anchor in anchors: + f0 = anchor.sort(key=sortkey_score)[-1] + start0 = _apply_mode(f0.start, f0.offset, mode) + i = sortkey_ids(f0) + anchor.sort(key=sortkey_ids) + for f in anchor: + j = sortkey_ids(f) + if (f == f0 or score_use_fluke is not None and + f.score < score_use_fluke): + continue + assert f.len == f0.len + start = _apply_mode(f.start, f.offset, mode) + content.append( + f'{i+1} {j+1} {start0+1} {start+1} {f.len} {f.score}\n' + ) + return ''.join(content) + + def _make_rgb_transparent(rgb, bg_rgb, alpha): return [alpha * c1 + (1 - alpha) * c2 for (c1, c2) in zip(to_rgb(rgb), to_rgb(bg_rgb))] -def jalview_features(anchors, mode='aa', score_use_fluke=None): +def export_jalview(anchors, mode='aa', score_use_fluke=None): """ - Write anchors to Jalview feature file + Export anchors to Jalview feature file """ assert mode in ('nt', 'cds', 'aa') cols = list(mcolors.TABLEAU_COLORS.values()) diff --git a/anchorna/tests/test_anchorna.py b/anchorna/tests/test_anchorna.py index c0b4c0c..dfd4a99 100644 --- a/anchorna/tests/test_anchorna.py +++ b/anchorna/tests/test_anchorna.py @@ -118,6 +118,10 @@ def test_anchorna_workflow_subset(): assert '' in check('anchorna export -m cds anchors.gff -o test_anchor_export.gff') with pytest.raises(IOError): read_anchors('test_anchor_export.gff') + read_anchors('test_anchor_export.gff', check_header=False) + assert '1' in check('anchorna export --fmt dialign anchors.gff') + assert '1' in check('anchorna export --fmt dialign -m nt anchors.gff') + assert '1' in check('anchorna export --fmt dialign -m cds anchors.gff') with patch('subprocess.run'): # we do not want to actually start jalview here assert '' == check('anchorna view anchors.gff') assert '' == check('anchorna view anchors.gff --align a1')