import math

import pandas as pd


# --- ontobio/ontol.py --------------------------------------------------------
# The accessors below are methods of the ontology class patched in
# ontobio/ontol.py. That class's header lies outside the visible hunks, so they
# are laid out at top level here; ``self`` is the ontology instance.

def definition(self, nid):
    """
    Text definition object for a node

    Arguments
    ---------
    nid : str
        Node identifier for entity to be queried

    Return
    ------
    dict
        definition object, dict(val=TEXT, xrefs=LIST), exactly as returned
        by ``self._get_meta_prop(nid, 'definition')``
    """
    return self._get_meta_prop(nid, 'definition')


def definition_val(self, nid):
    """
    Text definition string value for a node

    Arguments
    ---------
    nid : str
        Node identifier for entity to be queried

    Return
    ------
    str
        text definition, or None when ``self.definition(nid)`` is None
    """
    defn = self.definition(nid)
    return None if defn is None else defn['val']


def get_node_type(self, nid):
    """
    Value stored under the 'type' key of a node

    Arguments
    ---------
    nid : str
        Node identifier for entity to be queried

    Return
    ------
    object
        ``self.node(nid)['type']``, or None when the node has no 'type' key
    """
    n = self.node(nid)
    return n['type'] if 'type' in n else None


# --- ontobio/sim.py ----------------------------------------------------------

class SimEngine():
    """
    Similarity calculations over an association set.

    Arguments
    ---------
    association_set : object
        Must provide ``subjects``, ``ontology`` (with ``nodes()`` and
        ``ancestors(c, reflexive=True)``), ``query([node])`` and
        ``inferred_types(subject)``; these are the only members used here.
    icmap : dict
        Optional precomputed map of node id to information content. When
        omitted it is computed from ``association_set`` on first use.
    """

    def __init__(self,
                 association_set=None,
                 icmap=None):
        self.association_set = association_set
        self.icmap = icmap

    def _get_icmap(self):
        """
        Return the node -> information content map, building it on first use.

        IC(n) = -log2(freq(n)), where freq(n) is the number of results of
        ``query([n])`` divided by the number of subjects. Nodes with no
        query results (or an association set with no subjects) map to None.
        """
        if self.icmap is None:
            aset = self.association_set
            num_subjs = len(aset.subjects)
            icmap = {}
            for n in aset.ontology.nodes():
                num_anns = len(aset.query([n]))
                if num_anns > 0 and num_subjs > 0:
                    # log2(num_subjs / num_anns) == -log2(num_anns / num_subjs);
                    # written this way so a frequency of 1 yields 0.0, not -0.0
                    icmap[n] = math.log2(num_subjs / num_anns)
                else:
                    # frequency is zero (or undefined): IC is not defined
                    icmap[n] = None
            self.icmap = icmap
        return self.icmap

    def information_content(self, nid):
        """
        Returns information content for a node

        Arguments
        ---------
        nid : str
            Node identifier

        Return
        ------
        float
            information content in bits, or None if the node has no
            annotations. Raises KeyError if nid is not in the IC map.
        """
        icmap = self._get_icmap()
        return icmap[nid]

    def entity_jaccard_similarity(self, s1, s2):
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        Returns 0.0 when neither subject has any inferred types.
        """
        a1 = self.association_set.inferred_types(s1)
        a2 = self.association_set.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0
        return len(a1.intersection(a2)) / num_union

    def class_jaccard_similarity(self, c1, c2):
        r"""
        Calculate jaccard index of two classes

        |ancs(c1) /\ ancs(c2)|
        ---
        |ancs(c1) \/ ancs(c2)|

        Ancestors are reflexive (each class counts as its own ancestor).
        """
        ont = self.association_set.ontology
        a1 = ont.ancestors(c1, reflexive=True)
        a2 = ont.ancestors(c2, reflexive=True)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0
        return len(a1.intersection(a2)) / num_union

    def common_ancestors(self, c1, c2):
        """
        Reflexive ancestors shared by two classes

        Return
        ------
        set
            intersection of ancestors(c1) and ancestors(c2), both reflexive
        """
        ont = self.association_set.ontology
        a1 = set(ont.ancestors(c1, reflexive=True))
        return a1.intersection(ont.ancestors(c2, reflexive=True))

    def class_resnik_similarity(self, c1, c2):
        """
        Calculate resnik similarity of two classes

        Return
        ------
        (number,list)
            tuple of max_ic and list of MRCAs (the common ancestors whose
            information content equals max_ic)
        """
        max_ic = 0
        mrcas = []
        for a in self.common_ancestors(c1, c2):
            ic = self.information_content(a)
            if ic is None:
                # no annotations for this ancestor: IC undefined, cannot rank it
                continue
            if ic > max_ic:
                max_ic = ic
                mrcas = [a]
            elif ic == max_ic:
                mrcas.append(a)
        return max_ic, mrcas

    def used_classes(self):
        """
        Set of all classes in the inferred types of any subject
        """
        aset = self.association_set
        cset = set()
        for s in aset.subjects:
            cset.update(aset.inferred_types(s))
        return cset

    def dataframe(self):
        """
        Subject-by-class indicator table

        Return
        ------
        pandas.DataFrame
            one row per subject (indexed by subject), one column per class;
            a cell is 1 where the class is in the subject's inferred types
            and NaN otherwise
        """
        aset = self.association_set
        # materialise once so row order and index order are guaranteed to agree
        subjs = list(aset.subjects)
        entries = [{c: 1 for c in aset.inferred_types(s)} for s in subjs]
        return pd.DataFrame(entries, index=subjs)
from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory
from ontobio.assocmodel import AssociationSet
from ontobio.io.gafparser import GafParser
from ontobio.sim import SimEngine
import logging
import random


POMBASE = "tests/resources/truncated-pombase.gaf"
INTRACELLULAR = 'GO:0005622'
G1 = 'PomBase:SPBC902.04'


def test_sim():
    """
    Compute entity jaccard similarity for every pair of subjects in an
    association set loaded from a GAF file, and check each score is a
    valid jaccard index.
    """
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont = ofactory.create('tests/resources/go-truncated-pombase.json')
    # Keep the handle open for the whole test: whether create_from_gaf reads
    # the file eagerly is not visible here, and the context manager guarantees
    # the file is closed afterwards either way.
    with open(POMBASE, "r") as gaf:
        aset = afactory.create_from_gaf(gaf,
                                        ontology=ont)

        sim = SimEngine(aset)
        for g1 in aset.subjects:
            print("G1={} '{}'".format(g1, aset.label(g1)))
            for g2 in aset.subjects:
                print("  G2={} '{}'".format(g2, aset.label(g2)))
                jsim = sim.entity_jaccard_similarity(g1, g2)
                print("    SIM={}".format(jsim))
                # a jaccard index is a ratio |A & B| / |A | B|
                assert 0.0 <= jsim <= 1.0