diff --git a/changelog.txt b/changelog.txt index 7420b10..11cb29f 100644 --- a/changelog.txt +++ b/changelog.txt @@ -246,3 +246,6 @@ Rename -f 0 --> 'interpro' Rename -f 1 --> 'extended' Rename -f 2 --> 'mobidb3' Rename -f 3 --> 'caid' + +3.8.3 | 12/08/2020 +Replace mobidb3 with mobidb4 format diff --git a/config.ini b/config.ini index 9b0ae89..1584262 100644 --- a/config.ini +++ b/config.ini @@ -8,7 +8,7 @@ espritz= fess= globplot= iupred= -jronn= +#jronn= pfilt= seg= vsl2= @@ -18,7 +18,7 @@ mdb=0.625 anchor=0.5 dis465=0.5 disHL=0.086 -dynamine_coil=0.5 +dynamine=0.5 espD=0.5072 espN=0.3089 espX=0.1434 diff --git a/mdblib/consensus.py b/mdblib/consensus.py index 62188fe..5f64be1 100644 --- a/mdblib/consensus.py +++ b/mdblib/consensus.py @@ -81,7 +81,6 @@ def calc_agreement(self, seq, threshold, ptype=None, force_consensus=False): logging.debug('%s | agreement: excluded', prediction.method) - if included_predictors != 0: self.summed_states = agreement agreement = [summed_states / included_predictors for summed_states in agreement] @@ -104,6 +103,17 @@ def __init__(self, prediction_stack, seq, threshold=0.5, force=False): self.prediction.regions = self.prediction.to_regions(start_index=1, positivetag='D') +class MergeConsensus(Consensus): + """ + Define a consensus merging all regions (e.g. for low complexity) + """ + def __init__(self, prediction_stack, seq, threshold=0.1, ptype='disorder', force=True): + logging.debug('Generating Simple consensus') + super(MergeConsensus, self).__init__(prediction_stack) + self.calc_agreement(seq, threshold, ptype=ptype, force_consensus=force) + self.prediction.regions = self.prediction.to_regions(start_index=1, positivetag=1) + + class MobidbLiteConsensus(Consensus): """ Define consensus featured by MobiDB-Lite as its prediction. diff --git a/mdblib/outformats.py b/mdblib/outformats.py index 57ba006..62731f5 100644 --- a/mdblib/outformats.py +++ b/mdblib/outformats.py @@ -148,122 +148,6 @@ def __repr__(self): return "" -class Mobidb3Format(Formatter): - def __init__(self, _acc, _seq, _mdbl_consensus, - _simple_consensus, _single_predictions, **kwargs): - self.seq = _seq - self.seqlen = len(self.seq) - self.mdbl_consensus = _mdbl_consensus - self.simple_consensus = _simple_consensus - self.single_predictions = _single_predictions - self.injecting_data = kwargs.get("injection") - super(Mobidb3Format, self).__init__(_acc, **kwargs) - - if self.multi_accessions: - self.multiply_by_accession("accession") - - def _get_output_obj(self): - out_obj = dict() - - if self.injecting_data is not None: - out_obj.update(self.injecting_data) - - out_obj.setdefault("sequence", self.seq) - - # MobiDB-lite consensus - out_obj \ - .setdefault('mobidb_consensus', dict()) \ - .setdefault('disorder', dict()) \ - .setdefault('predictors', list()) \ - .append( - {'method': 'mobidb_lite', - 'regions': self.mdbl_consensus.prediction.regions, - 'scores': self.mdbl_consensus.prediction.scores, - 'dc': reduce( - lambda x, t: - x + (t[1] - t[0] + 1), - self.mdbl_consensus.prediction.regions, 0.0) / self.seqlen - if self.mdbl_consensus.prediction.regions else 0.0}) - - # MobiDB-lite consensus sub regions - if self.mdbl_consensus.prediction.regions: - out_obj \ - .setdefault('mobidb_consensus', dict()) \ - .setdefault('disorder', dict()) \ - .setdefault('predictors', list()) \ - .append( - {'method': 'mobidb_lite_sub', - 'regions': self.mdbl_consensus.enriched_regions_tags - }) - - # Simple consensus - out_obj.setdefault('mobidb_consensus', dict()) \ - .setdefault('disorder', dict()) \ - .setdefault('predictors', list()) \ - .append( - {'method': 'simple', - 'regions': self.simple_consensus.prediction.regions, - 'dc': reduce( - lambda x, t: - x + (t[1] - t[0] + 1), - self.simple_consensus.prediction.regions, 0.0) / self.seqlen - if self.simple_consensus.prediction.regions else 0.0}) - - # Single predictions - for prediction in self.single_predictions: - if any(t in prediction.types for t in ['disorder', 'lowcomp']): - prediction.translate_states({1: 'D', 0: 'S'}) - out_obj \ - .setdefault('mobidb_data', dict()) \ - .setdefault('disorder', dict()) \ - .setdefault('predictors', list()) \ - .append( - {'method': prediction.method, - 'regions': prediction.to_regions(start_index=1, positivetag='D')}) - - if 'sspops' in prediction.types: - method, ptype = prediction.method.split('_') - - out_obj \ - .setdefault('mobidb_data', dict()) \ - .setdefault('ss_populations', dict()) \ - .setdefault('predictors', list()) \ - .append( - {'method': method, - 'type': ptype, - 'scores': prediction.scores}) - - if 'bindsite' in prediction.types: - prediction.translate_states({1: 'D', 0: 'S'}) - - out_obj \ - .setdefault('mobidb_data', dict()) \ - .setdefault('lips', dict()) \ - .setdefault('predictors', list()) \ - .append( - {'method': prediction.method, - 'regions': prediction.to_regions(start_index=1, positivetag='D')}) - - if out_obj: - out_obj["length"] = self.seqlen - - if re.search("^UPI[A-F0-9]{10}$", self.acc): - out_obj['uniparc'] = self.acc - - else: - out_obj['accession'] = self.acc - - self.isnone = False - - return [out_obj] - - def __repr__(self): - if self.output: - return '\n'.join(json.dumps(oobj) for oobj in self.output) - else: - return "" - - class Mobidb4Format(Formatter): feature_tag = {'PA': 'polyampholyte', # PA @@ -280,12 +164,13 @@ def content_count(self, regions): def __init__(self, _acc, _seq, _mdbl_consensus, - _simple_consensus, _single_predictions, **kwargs): + _simple_consensus, _lowcomp_consensus, _single_predictions, **kwargs): self.seq = _seq self.seqlen = len(self.seq) self.mdbl_consensus = _mdbl_consensus self.simple_consensus = _simple_consensus self.single_predictions = _single_predictions + self.lowcomplexity_consensus = _lowcomp_consensus self.injecting_data = kwargs.get("injection") super(Mobidb4Format, self).__init__(_acc, **kwargs) @@ -301,16 +186,24 @@ def _get_output_obj(self): out_obj.setdefault("sequence", self.seq) # MobiDB-lite consensus - # TODO eliminate regions if empty? count = self.content_count(self.mdbl_consensus.prediction.regions) - out_obj["prediction-disorder-mobidb_lite"] = { - 'regions': [(r[0], r[1]) for r in self.mdbl_consensus.prediction.regions], - 'scores': self.mdbl_consensus.prediction.scores, - 'content_count': count, - 'content_fraction': count / self.seqlen - } + if count: + out_obj["prediction-disorder-mobidb_lite"] = { + 'regions': [(r[0], r[1]) for r in self.mdbl_consensus.prediction.regions], + 'scores': self.mdbl_consensus.prediction.scores, + 'content_count': count, + 'content_fraction': round(count / self.seqlen, 3) + } + else: + out_obj["prediction-disorder-mobidb_lite"] = { + 'scores': self.mdbl_consensus.prediction.scores + } # MobiDB-lite consensus sub regions + # TODO check: + # proline_rich + # polar + # cystein_rich ??? if self.mdbl_consensus.prediction.regions: regions = {} @@ -323,47 +216,61 @@ def _get_output_obj(self): out_obj["prediction-{}-mobidb_lite_sub".format(r_type)] = { 'regions': regions[r_type], 'content_count': count, - 'content_fraction': count / self.seqlen + 'content_fraction': round(count / self.seqlen, 3) } # Simple consensus - count = self.content_count(self.simple_consensus.prediction.regions) - out_obj["prediction-disorder-th_50"] = { - 'regions': [(r[0], r[1]) for r in self.simple_consensus.prediction.regions], - 'content_count': count, - 'content_fraction': count / self.seqlen - } + if self.simple_consensus.prediction.regions: + count = self.content_count(self.simple_consensus.prediction.regions) + out_obj["prediction-disorder-th_50"] = { + 'regions': [(r[0], r[1]) for r in self.simple_consensus.prediction.regions], + 'content_count': count, + 'content_fraction': round(count / self.seqlen, 3) + } + + if self.lowcomplexity_consensus.prediction.regions: + count = self.content_count(self.lowcomplexity_consensus.prediction.regions) + out_obj["prediction-low_complexity-merge"] = { + 'regions': [(r[0], r[1]) for r in self.lowcomplexity_consensus.prediction.regions], + 'content_count': count, + 'content_fraction': round(count / self.seqlen, 3) + } # Single predictions for prediction in self.single_predictions: regions = [(r[0], r[1]) for r in prediction.to_regions(start_index=1, positivetag=1)] count = self.content_count(regions) - if 'disorder' in prediction.types: - out_obj["prediction-disorder-{}".format(prediction.method)] = { - 'regions': regions, - 'content_count': count, - 'content_fraction': count / self.seqlen - } - elif 'lowcomp' in prediction.types: - out_obj["prediction-low_complexity-{}".format(prediction.method)] = { - 'regions': regions, - 'content_count': count, - 'content_fraction': count / self.seqlen - } - elif 'bindsite' in prediction.types: - out_obj["prediction-lip-anchor"] = { - 'regions': regions, - 'content_count': count, - 'content_fraction': count / self.seqlen + if regions: + if 'disorder' in prediction.types: + out_obj["prediction-disorder-{}".format(prediction.method)] = { + 'regions': regions, + 'content_count': count, + 'content_fraction': round(count / self.seqlen, 3) + } + elif 'lowcomp' in prediction.types: + out_obj["prediction-low_complexity-{}".format(prediction.method)] = { + 'regions': regions, + 'content_count': count, + 'content_fraction': round(count / self.seqlen, 3) + } + elif 'bindsite' in prediction.types: + out_obj["prediction-lip-{}".format(prediction.method)] = { + 'regions': regions, + 'content_count': count, + 'content_fraction': round(count / self.seqlen, 3) + } + + if 'rigidity' in prediction.types: + out_obj["prediction-rigidity-{}".format(prediction.method)] = { + 'scores': prediction.scores } - elif 'sspops' in prediction.types: + + if 'sspops' in prediction.types: method, ptype = prediction.method.split('_') - out_obj["prediction-{}-fess".format(ptype)] = { + out_obj["prediction-{}-{}".format(ptype, method)] = { 'scores': prediction.scores } - # else: - # logging.debug("Type not implemented in mobidb4".format(prediction.types)) if out_obj: out_obj["length"] = self.seqlen diff --git a/mdblib/prediction.py b/mdblib/prediction.py index 39194e7..d217c97 100644 --- a/mdblib/prediction.py +++ b/mdblib/prediction.py @@ -38,36 +38,3 @@ def scores_to_states(self, tags=(1, 0)): """ return [tags[0] if score >= self.threshold else tags[1] for score in self.scores] - # def regions_to_set(self): - # """Get list of ID amino-acids positions from region list - # - # :return: Unique amino-acid index of ID regions - # :rtype: set - # """ - # positions = set() - # for start, end, _ in self.to_regions(): - # positions.update(range(start, end + 1)) - # return positions - # - # def regions_to_states(self, length, tags=('D', 'S'), reg_startindex=1): - # """Represent ID states as a string from a list ID regions - # - # :param length: Length of the protein sequence (num of amino-acids) - # :type length: int - # :param tags: couple of values: positive match tag, negative match tag. Order counts - # :type tags: tuple - # :param reg_startindex: start index of the input regions (default: 1) - # :type reg_startindex: int - # :return: Order/Disorder states of the amino-acids of a protein - # :rtype: str - # """ - # states = "" - # positions = [pos - reg_startindex for pos in self.regions_to_set()] - # - # for i in range(0, length): - # if i in positions: - # states += tags[0] - # else: - # states += tags[1] - # - # return states diff --git a/mdblib/predictor.py b/mdblib/predictor.py index 5013d4a..48acf96 100644 --- a/mdblib/predictor.py +++ b/mdblib/predictor.py @@ -420,8 +420,8 @@ def parse(self, output): class DynaMine(Predictor): - tag = 'dynamine_coil' - types = ['sspops'] + tag = 'dynamine' + types = ['rigidity'] groups = ['mobidb3', 'mobidb4', 'caid'] intype = 'fasta' shared_name = 'dynamine' diff --git a/mobidb_lite.py b/mobidb_lite.py index 671e6f9..30dcd86 100644 --- a/mobidb_lite.py +++ b/mobidb_lite.py @@ -32,8 +32,8 @@ from mdblib.protein import Protein from mdblib.setdirs import set_pred_dir from mdblib.streams import OutStream, InStream -from mdblib.consensus import MobidbLiteConsensus, SimpleConsensus, feature_desc -from mdblib.outformats import InterProFormat, ExtendedFormat, Mobidb3Format, Mobidb4Format, CaidFormat, FastaFormat, VerticalFormat +from mdblib.consensus import MobidbLiteConsensus, MergeConsensus, SimpleConsensus, feature_desc +from mdblib.outformats import InterProFormat, ExtendedFormat, Mobidb4Format, CaidFormat, FastaFormat, VerticalFormat # Suppress warnings warnings.filterwarnings('ignore') @@ -50,7 +50,6 @@ class MobidbLite(object): 'fasta': 'main', 'vertical': 'main', 'extended': 'main', - 'mobidb3': 'mobidb3', 'mobidb4': 'mobidb4', 'caid': 'caid'} @@ -88,9 +87,9 @@ def stream(self): Consume generator produced by MobidbLite.run() and stream output to selected output-stream. Output-stream is selected by user at MobidbLite instantiation time (stdout by default). """ - for prot, s_cons, r_cons, m_cons in self.preds: + for prot, s_cons, r_cons, m_cons, l_cons in self.preds: outobj = self.fmt_output(prot.acc, prot.uniprot_acc, prot.seq, prot.preds, - s_cons=s_cons, r_cons=r_cons, m_cons=m_cons) + s_cons=s_cons, r_cons=r_cons, m_cons=m_cons, l_cons=l_cons) if outobj.isnone is False: self.outstream.write('{}\n'.format(outobj)) @@ -106,10 +105,13 @@ def read_config(config): def parse_json(self): for line in self.instream: doc = json.loads(line) - acc, seq = doc["accession"], doc["sequence"] + acc = doc.get("acc") + seq = doc["sequence"] + # remove field accession to prevent overwriting of existing accession field when injecting in output - del doc["accession"] + # del doc["accession"] # del doc["sequence"] + self.additional_data = doc yield acc, seq @@ -144,7 +146,7 @@ def parse_input(self): "Unrecognized input-filename extension: '{}'. Expected one of {}".format(self.infname.split(".")[-1], fasta_extensions | json_extensions)) - def fmt_output(self, acc, uacc, seq, preds, s_cons, r_cons, m_cons): + def fmt_output(self, acc, uacc, seq, preds, s_cons, r_cons, m_cons, l_cons): output = None multi_acc = None @@ -169,12 +171,8 @@ def fmt_output(self, acc, uacc, seq, preds, s_cons, r_cons, m_cons): elif self.outfmt == 'extended': output = ExtendedFormat(acc, m_cons, r_cons, _multi_accs=multi_acc) - elif self.outfmt == 'mobidb3': - output = Mobidb3Format(acc, seq, m_cons, s_cons, preds, _multi_accs=multi_acc, - injection=self.additional_data) - elif self.outfmt == 'mobidb4': - output = Mobidb4Format(acc, seq, m_cons, s_cons, preds, _multi_accs=multi_acc, + output = Mobidb4Format(acc, seq, m_cons, s_cons, l_cons, preds, _multi_accs=multi_acc, injection=self.additional_data) elif self.outfmt == 'caid': @@ -185,17 +183,19 @@ def fmt_output(self, acc, uacc, seq, preds, s_cons, r_cons, m_cons): def calc_consensus(self, predictions, sequence): simple_c = None relaxed_c = None + lowcomp_merge_c = None mobidblite_c = MobidbLiteConsensus(predictions, sequence, - pappu=True if self.outfmt in ['mobidb3', 'mobidb4'] else False, + pappu=True if self.outfmt in ['mobidb4'] else False, force=self.force_consensus) if self.outfmt == 'extended': relaxed_c = SimpleConsensus(predictions, sequence, force=self.force_consensus, threshold=.375) - if self.outfmt in ['mobidb3', 'mobidb4']: + if self.outfmt in ['mobidb4']: simple_c = SimpleConsensus(predictions, sequence, force=self.force_consensus) + lowcomp_merge_c = MergeConsensus(predictions, sequence, threshold=0.1, ptype='lowcomp', force=True) - return simple_c, relaxed_c, mobidblite_c + return simple_c, relaxed_c, mobidblite_c, lowcomp_merge_c def run(self, fasta, architecture, threads, outfile): @@ -218,8 +218,8 @@ def run(self, fasta, architecture, threads, outfile): processes=threads) if predictions: - smp_consensus, rlx_consensus, mdbl_consensus = self.calc_consensus(predictions, sequence) - yield protein, smp_consensus, rlx_consensus, mdbl_consensus + smp_consensus, rlx_consensus, mdbl_consensus, lowcomp_consensus = self.calc_consensus(predictions, sequence) + yield protein, smp_consensus, rlx_consensus, mdbl_consensus, lowcomp_consensus if __name__ == "__main__": diff --git a/readme.md b/readme.md index 69564da..8c8c5a0 100644 --- a/readme.md +++ b/readme.md @@ -2,7 +2,7 @@ MobiDB-lite, long disorder consensus predictor ============================================== Marco Necci, Damiano Piovesan and Silvio C.E. Tosatto -Version 3.8.2 +Version 3.8.3 Introduction ------------ @@ -77,7 +77,7 @@ option has to be used:: The output is provided in 3 different formats. The output format can be chosen with the ``-f`` option, which accepts one of the following values: `interpro`. -`fasta`, `vertical`, `extended`, `mobidb3`, `caid`. The default short version +`fasta`, `vertical`, `extended`, `mobidb4`, `caid`. The default short version (option `interpro`) includes the protein name and start/end position of the disorder regions, one region per line. All elements are separated by a TAB:: @@ -171,224 +171,40 @@ at a threshold of .375 (New in version 3.8.1) :: } -Option `mobidb3` provides an extended output and structured output that is used -to create annotations for MobiDB3 entries:: - +Option `mobidb4` provides an extended output and structured output that is used +to create annotations for MobiDB4 entries:: { - "sequence": "MKA...IGN", - "mobidb_consensus": { - "disorder": { - "predictors": [ - { - "method": "mobidb_lite", - "regions": [ - [609, 706, "D_WC"], - [792, 820, "D_WC"], - [874, 893, "D_WC"], - [947, 1566, "D_WC"] - ], - "scores": [ - 0.75, - 0.75, - 0.75, - ... - 0.625, - 0.75, - 0.75 - ], - "dc": 0.48513598987982287 - }, - { - "method": "mobidb_lite_sub", - "regions": [ - [652, 681, "D_PO"], - [806, 820, "D_PO"], - ... - ] - }, - { - "method": "simple", - "regions": [ - [1, 16, "D"], - [89, 89, "D"], - ... - ], - "dc": 0.6299810246679317 - } - ] - } - }, - "mobidb_data": { - "disorder": { - "predictors": [ - { - "method": "iupl", - "regions": [ - [1, 15, "D"], - [82, 82, "D"], - ... - ] - }, - { - "method": "iups", - "regions": [ - [1, 13, "D"], - [538, 572, "D"], - ... - ] - }, - { - "method": "espN", - "regions": [ - [1, 33, "D"], - [49, 57, "D"], - ... - ] - }, - { - "method": "espD", - "regions": [ - [990, 1581, "D"] - ] - }, - { - "method": "espX", - "regions": [ - [1, 17, "D"], - [591, 592, "D"], - ... - ] - }, - { - "method": "glo", - "regions": [ - [4, 4, "D"], - [6, 6, "D"], - ... - ] - }, - { - "method": "dis465", - "regions": [ - [1, 16, "D"], - [51, 55, "D"], - ... - ] - }, - { - "method": "disHL", - "regions": [ - [1, 13, "D"], - [25, 36, "D"], - ... - ] - }, - { - "method": "vsl", - "regions": [ - [1, 34, "D"], - [48, 54, "D"], - ... - ] - }, - { - "method": "jronn", - "regions": [ - [1, 21, "D"], - [82, 98, "D"], - ... - ] - }, - { - "method": "seg", - "regions": [ - [8, 23, "D"], - [551, 574, "D"], - ... - ] - }, - { - "method": "pfilt", - "regions": [ - [1094, 1109, "D"], - [1245, 1256, "D"], - ... - ] - } - ] - }, - "ss_populations": { - "predictors": [ - { - "method": "fess", - "type": "helix", - "scores": [ - 0.004, - 0.115, - 0.18, - ... - 0.233, - 0.083, - 0.003 - ] - }, - { - "method": "fess", - "type": "sheet", - "scores": [ - 0.002, - 0.065, - ... - 0.418, - 0.206, - 0.002 - ] - }, - { - "method": "fess", - "type": "coil", - "scores": [ - 0.994, - 0.82, - 0.758, - ... - 0.348, - 0.71, - 0.994 - ] - }, - { - "method": "dynamine", - "type": "coil", - "scores": [ - 0.304, - 0.315, - 0.334, - ... - 0.216, - 0.25, - 0.257 - ] - } - ] + + "acc": "sp|Q15648|MED1_HUMAN", + "sequence": "MKAQASQAL...", + "length": 1581, + "prediction-disorder-mobidb_lite": { + "regions": [[609, 706], [792, 820], [874, 893], [947, 1566]], + "scores": [0.75, 0.75, 0.75, 0.875, 0.75, 0.875, ... ], + "content_count": 767, + "content_fraction": 0.485 }, - "lips": { - "predictors": [ - { - "method": "anchor", - "regions": [ - [17, 24, "D"], - [61, 62, "D"], - ... - ] - } - ] - } - }, - "length": 1581, - "accession": "sp|Q15648|MED1_HUMAN" + "prediction-disorder-th_50": {...}, + "prediction-low_complexity-merge": {...}, + "prediction-disorder-iupl": {...}, + "prediction-disorder-iups": {...}, + "prediction-disorder-espN": {...}, + "prediction-disorder-espD": {...}, + "prediction-disorder-espX": {...}, + "prediction-disorder-glo": {...}, + "prediction-disorder-dis465": {...}, + "prediction-disorder-disHL": {...}, + "prediction-disorder-vsl": {...}, + "prediction-low_complexity-seg": {...}, + "prediction-low_complexity-pfilt": {...}, + "prediction-helix-fess": {...}, + "prediction-sheet-fess": {...}, + "prediction-coil-fess": {...}, + "prediction-rigidity-dynamine": {...}, + "prediction-lip-anchor": {...} } + Option `caid` provides a very extended output, containing both scores and regions for each predictor, but it's restricted to disorder predictors plus DynaMine::