From 8e8d30baf9a2197916920d0a553ba8c885fc2328 Mon Sep 17 00:00:00 2001 From: damiano Date: Tue, 4 Aug 2020 15:35:09 +0200 Subject: [PATCH] Fix logging function Implement mobidb4 format --- mdblib/cli.py | 2 +- mdblib/logger.py | 18 ++----- mdblib/outformats.py | 120 +++++++++++++++++++++++++++++++++++++++++++ mdblib/predictor.py | 28 +++++----- mobidb_lite.py | 16 +++--- 5 files changed, 148 insertions(+), 36 deletions(-) diff --git a/mdblib/cli.py b/mdblib/cli.py index a304ed2..77a88ac 100644 --- a/mdblib/cli.py +++ b/mdblib/cli.py @@ -31,7 +31,7 @@ def arg_parser(cd): help='log level filter. All levels <= choice will be displayed') parser.add_argument('-f', '--outputFormat', default='interpro', - choices=['interpro', 'fasta', 'vertical', 'extended', 'mobidb3', 'caid'], + choices=['interpro', 'fasta', 'vertical', 'extended', 'mobidb3', 'mobidb4', 'caid'], help='output format, see README.md for further details') parser.add_argument('-c', '--conf', type=str, default=os.path.join(cd, 'config.ini'), diff --git a/mdblib/logger.py b/mdblib/logger.py index 708590f..38003d8 100644 --- a/mdblib/logger.py +++ b/mdblib/logger.py @@ -1,19 +1,9 @@ import logging +import sys def set_logger(logfile, level): - handlers = list() - log_formatter = logging.Formatter('%(asctime)s | %(module)-12s | %(levelname)-8s | %(message)s') - if logfile: - file_handler = logging.FileHandler(logfile, 'a') - file_handler.setFormatter(log_formatter) - handlers.append(file_handler) - else: - console_handler = logging.StreamHandler() - console_handler.setFormatter(log_formatter) - handlers.append(console_handler) - - logging.basicConfig(level=level, - format=log_formatter, - handlers=handlers) + logging.basicConfig(level=logging.getLevelName(level), + format='%(asctime)s | %(module)-12s | %(levelname)-8s | %(message)s', + stream=open(logfile, "w") if logfile else sys.stderr) diff --git a/mdblib/outformats.py b/mdblib/outformats.py index c1a87ee..70742ab 100644 --- a/mdblib/outformats.py +++ b/mdblib/outformats.py @@ -264,6 +264,126 @@ def __repr__(self): return "" +class Mobidb4Format(Formatter): + + feature_tag = {'PA': 'polyampholyte', # PA + 'PPE': 'positive_polyelectrolyte', # PPE + 'NPE': 'negative_polyelectrolyte', # NPE + 'CR': 'cystein_rich', # CR + 'PR': 'proline_rich', # PR + 'PO': 'polar', # PO + 'GR': 'glycine_rich', # GR + 'LC': 'low_complexity'} # LC + + def content_count(self, regions): + return reduce(lambda x, t: x + (t[1] - t[0] + 1), regions, 0) + + + def __init__(self, _acc, _seq, _mdbl_consensus, + _simple_consensus, _single_predictions, **kwargs): + self.seq = _seq + self.seqlen = len(self.seq) + self.mdbl_consensus = _mdbl_consensus + self.simple_consensus = _simple_consensus + self.single_predictions = _single_predictions + self.injecting_data = kwargs.get("injection") + super(Mobidb4Format, self).__init__(_acc, **kwargs) + + if self.multi_accessions: + self.multiply_by_accession("accession") + + def _get_output_obj(self): + out_obj = dict() + + if self.injecting_data is not None: + out_obj.update(self.injecting_data) + + out_obj.setdefault("sequence", self.seq) + + # MobiDB-lite consensus + # TODO add content_count, eliminate regions if empty? + count = self.content_count(self.mdbl_consensus.prediction.regions) + out_obj["prediction-disorder-mobidb_lite"] = { + 'regions': [(r[0], r[1]) for r in self.mdbl_consensus.prediction.regions], + 'scores': self.mdbl_consensus.prediction.scores, + 'content_count': count, + 'content_fraction': count / self.seqlen + } + + # MobiDB-lite consensus sub regions + if self.mdbl_consensus.prediction.regions: + + regions = {} + for r in self.mdbl_consensus.prediction.regions: + r_type = self.feature_tag.get(r[2][2:]) + if r_type: + regions.setdefault(r_type, []).append((r[0], r[1])) + for r_type in regions: + count = self.content_count(regions[r_type]) + out_obj["prediction-{}-mobidb_lite_sub".format(r_type)] = { + 'regions': regions[r_type], + 'content_count': count, + 'content_fraction': count / self.seqlen + } + + # Simple consensus + count = self.content_count(self.simple_consensus.prediction.regions) + out_obj["prediction-disorder-th_50"] = { + 'regions': [(r[0], r[1]) for r in self.simple_consensus.prediction.regions], + 'content_count': count, + 'content_fraction': count / self.seqlen + } + + # Single predictions + for prediction in self.single_predictions: + regions = [(r[0], r[1]) for r in prediction.to_regions(start_index=1, positivetag=1)] + count = self.content_count(regions) + + if 'disorder' in prediction.types: + out_obj["prediction-disorder-{}".format(prediction.method)] = { + 'regions': regions, + 'content_count': count, + 'content_fraction': count / self.seqlen + } + elif 'lowcomp' in prediction.types: + out_obj["prediction-low_complexity-{}".format(prediction.method)] = { + 'regions': regions, + 'content_count': count, + 'content_fraction': count / self.seqlen + } + elif 'bindsite' in prediction.types: + out_obj["prediction-lip-anchor"] = { + 'regions': regions, + 'content_count': count, + 'content_fraction': count / self.seqlen + } + elif 'sspops' in prediction.types: + method, ptype = prediction.method.split('_') + out_obj["prediction-{}-fess".format(ptype)] = { + 'scores': prediction.scores + } + # else: + # logging.debug("Type not implemented in mobidb4".format(prediction.types)) + + if out_obj: + out_obj["length"] = self.seqlen + + if re.search("^UPI[A-F0-9]{10}$", self.acc): + out_obj['uniparc'] = self.acc + else: + out_obj['acc'] = self.acc + + self.isnone = False + + return [out_obj] + + def __repr__(self): + if self.output: + return '\n'.join(json.dumps(oobj) for oobj in self.output) + else: + return "" + + class CaidFormat(Formatter): def __init__(self, _acc, seq, _mdbl_consensus, _single_predictions, **kwargs): self.seq = seq diff --git a/mdblib/predictor.py b/mdblib/predictor.py index 32f1582..5013d4a 100644 --- a/mdblib/predictor.py +++ b/mdblib/predictor.py @@ -67,7 +67,7 @@ def parse(self, *args): class IUPredL(Predictor): tag = 'iupl' types = ['disorder', 'mobidblite'] - groups = ['main', 'mobidb3', 'caid'] + groups = ['main', 'mobidb3', 'mobidb4', 'caid'] intype = 'flat' shared_name = 'iupred' @@ -103,7 +103,7 @@ def parse(self, output): class IUPredS(Predictor): tag = 'iups' types = ['disorder', 'mobidblite'] - groups = ['main', 'mobidb3', 'caid'] + groups = ['main', 'mobidb3', 'mobidb4', 'caid'] intype = 'flat' shared_name = 'iupred' @@ -139,7 +139,7 @@ def parse(self, output): class ESpritzN(Predictor): tag = 'espN' types = ['disorder', 'mobidblite'] - groups = ['main', 'mobidb3', 'caid'] + groups = ['main', 'mobidb3', 'mobidb4', 'caid'] intype = 'disbin' shared_name = 'espritz' @@ -166,7 +166,7 @@ def parse(self, output): class ESpritzD(Predictor): tag = 'espD' types = ['disorder', 'mobidblite'] - groups = ['main', 'mobidb3', 'caid'] + groups = ['main', 'mobidb3', 'mobidb4', 'caid'] intype = 'disbin' shared_name = 'espritz' @@ -193,7 +193,7 @@ def parse(self, output): class ESpritzX(Predictor): tag = 'espX' types = ['disorder', 'mobidblite'] - groups = ['main', 'mobidb3', 'caid'] + groups = ['main', 'mobidb3', 'mobidb4', 'caid'] intype = 'disbin' shared_name = 'espritz' @@ -220,7 +220,7 @@ def parse(self, output): class GlobPlot(Predictor): tag = 'glo' types = ['disorder', 'mobidblite'] - groups = ['main', 'mobidb3', 'caid'] + groups = ['main', 'mobidb3', 'mobidb4', 'caid'] intype = 'flat' shared_name = 'globplot' suppress_stderr = True @@ -243,7 +243,7 @@ def parse(self, output): class DisEMBL(Predictor): tag = ['dis465', 'disHL'] types = ['disorder', 'mobidblite'] - groups = ['main', 'mobidb3', 'caid'] + groups = ['main', 'mobidb3', 'mobidb4', 'caid'] intype = 'flat' shared_name = 'disembl' @@ -269,7 +269,7 @@ def parse(self, output): class VSL2b(Predictor): tag = 'vsl' types = ['disorder'] - groups = ['mobidb3', 'caid'] + groups = ['mobidb3', 'mobidb4', 'caid'] intype = 'flat' shared_name = 'vsl2' suppress_stderr = True @@ -309,7 +309,7 @@ def parse(self, output): class JRonn(Predictor): tag = 'jronn' types = ['disorder'] - groups = ['mobidb3', 'caid'] + groups = ['mobidb3', 'mobidb4', 'caid'] intype = 'fasta' shared_name = 'jronn' @@ -333,7 +333,7 @@ def parse(self, output): class Seg(Predictor): tag = 'seg' types = ['lowcomp'] - groups = ['mobidb3', 'main'] + groups = ['mobidb3', 'mobidb4', 'main'] intype = 'fasta' shared_name = 'seg' @@ -360,7 +360,7 @@ def parse(self, output): class Pfilt(Predictor): tag = 'pfilt' types = ['lowcomp'] - groups = ['mobidb3'] + groups = ['mobidb3', 'mobidb4'] intype = 'fasta' shared_name = 'pfilt' @@ -387,7 +387,7 @@ def parse(self, output): class FESS(Predictor): tag = ['fess_helix', 'fess_sheet', 'fess_coil'] types = ['sspops'] - groups = ['mobidb3'] + groups = ['mobidb3', 'mobidb4'] intype = 'fasta' shared_name = 'fess' @@ -422,7 +422,7 @@ def parse(self, output): class DynaMine(Predictor): tag = 'dynamine_coil' types = ['sspops'] - groups = ['mobidb3', 'caid'] + groups = ['mobidb3', 'mobidb4', 'caid'] intype = 'fasta' shared_name = 'dynamine' @@ -442,7 +442,7 @@ def parse(self, output): class Anchor(Predictor): tag = 'anchor' types = ['bindsite'] - groups = ['mobidb3', 'caid'] + groups = ['mobidb3', 'mobidb4', 'caid'] intype = 'fasta' shared_name = 'anchor' diff --git a/mobidb_lite.py b/mobidb_lite.py index c9e1035..671e6f9 100644 --- a/mobidb_lite.py +++ b/mobidb_lite.py @@ -33,7 +33,7 @@ from mdblib.setdirs import set_pred_dir from mdblib.streams import OutStream, InStream from mdblib.consensus import MobidbLiteConsensus, SimpleConsensus, feature_desc -from mdblib.outformats import InterProFormat, ExtendedFormat, Mobidb3Format, CaidFormat, FastaFormat, VerticalFormat +from mdblib.outformats import InterProFormat, ExtendedFormat, Mobidb3Format, Mobidb4Format, CaidFormat, FastaFormat, VerticalFormat # Suppress warnings warnings.filterwarnings('ignore') @@ -51,6 +51,7 @@ class MobidbLite(object): 'vertical': 'main', 'extended': 'main', 'mobidb3': 'mobidb3', + 'mobidb4': 'mobidb4', 'caid': 'caid'} def __init__(self, fasta, launchdir=None, conf=None, architecture='64', threads=0, outfile=None, @@ -172,6 +173,10 @@ def fmt_output(self, acc, uacc, seq, preds, s_cons, r_cons, m_cons): output = Mobidb3Format(acc, seq, m_cons, s_cons, preds, _multi_accs=multi_acc, injection=self.additional_data) + elif self.outfmt == 'mobidb4': + output = Mobidb4Format(acc, seq, m_cons, s_cons, preds, _multi_accs=multi_acc, + injection=self.additional_data) + elif self.outfmt == 'caid': output = CaidFormat(acc, seq, m_cons, preds, _multi_accs=multi_acc) @@ -181,23 +186,20 @@ def calc_consensus(self, predictions, sequence): simple_c = None relaxed_c = None mobidblite_c = MobidbLiteConsensus(predictions, sequence, - pappu=True if self.outfmt == 'mobidb3' else False, + pappu=True if self.outfmt in ['mobidb3', 'mobidb4'] else False, force=self.force_consensus) if self.outfmt == 'extended': relaxed_c = SimpleConsensus(predictions, sequence, force=self.force_consensus, threshold=.375) - - if self.outfmt == 'mobidb3': + if self.outfmt in ['mobidb3', 'mobidb4']: simple_c = SimpleConsensus(predictions, sequence, force=self.force_consensus) - - return simple_c, relaxed_c, mobidblite_c def run(self, fasta, architecture, threads, outfile): - logging.debug('outfmt: %i outgroup: %s', self.outfmt, self.outgroup) + logging.debug('outfmt: %s outgroup: %s', self.outfmt, self.outgroup) with InStream(fasta) as self.instream, OutStream(outfile) as self.outstream: # Parse input Fasta