From e00a5446699f96ce7f01c3d0000d6356c4b39195 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Tue, 27 Jul 2021 11:04:18 +0100 Subject: [PATCH 01/10] declare recontree analyser class --- covasim/analysis.py | 32 ++++++++++++++++++++++++++++++++ covasim/sim.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/covasim/analysis.py b/covasim/analysis.py index 37ca44bfe..1aeac3836 100644 --- a/covasim/analysis.py +++ b/covasim/analysis.py @@ -2012,3 +2012,35 @@ def plot_histograms(self, start_day=None, end_day=None, bins=None, width=0.8, fi return fig + +class ReconTree(Analyzer): + ''' + A class for holding a reconstructed tree. + + Args: + tt (TransTree): the transmission tree object + test_label (str): the label of the testing intervention to sample diagnoses from. + prop_seq (float): the probability of including a diagnosed individual. + + **Example**:: + + test_label = "my_testing" + testing = cv.test_prob(label=test_label, symp_prob=0.5) + sim = cv.Sim(interventions = testing) + sim.run() + rt = sim.make_recontree(test_label, 0.5) + ''' + + def __init__(self, tt, test_label, prop_seq, **kwargs): + super().__init__(**kwargs) # Initialize the Analyser object + + self.test_label = test_label + self.prop_seq = prop_seq + + return + + def to_newick(self): + ''' + Return a Newick string representation of the reconstructed tree. + ''' + raise NotImplementedError diff --git a/covasim/sim.py b/covasim/sim.py index aaa32a695..5803cb574 100644 --- a/covasim/sim.py +++ b/covasim/sim.py @@ -1227,6 +1227,42 @@ def make_transtree(self, *args, output=True, **kwargs): return + def make_recontree(self, test_label, prop_seq, *args, output=True, **kwargs): + ''' + Create a ReconTree (reconstructed tree) object, for use in phylodynamic + analysis. See cv.ReconTree() for more information. + + Args: + test_label (str): the label of the testing intervention to sample diagnoses from. + prop_seq (float): the probability of including a diagnosed individual. + args (list): passed to cv.ReconTree() and cv.TransTree() + output (bool): whether or not to return the ReconTree; if not, store in sim.results + kwargs (dict): passed to cv.ReconTree() and cv.TransTree() + + **Example**:: + test_label = "my_testing" + testing = cv.test_prob(label=test_label, symp_prob=0.5) + sim = cv.Sim(interventions = testing) + sim.run() + rt = sim.make_recontree(test_label, 0.5) + ''' + assert 0 < prop_seq and prop_seq < 1 + # it only makes sense to ask for a reconstructed tree if there were + # positive tests so better to fail early if there was not a + maybe_test = self.get_intervention(label=test_label) + assert isinstance(maybe_test, cvi.test_prob) or isinstance(maybe_test, cvi.test_num) + del maybe_test + + tt = self.make_transtree(*args, output=True, to_networkx=True, **kwargs) + rt = cva.ReconTree(self, tt, test_label, prop_seq, *args, **kwargs) + if output: + return rt + else: # pragma: no cover + self.results.transtree = tt + self.results.recontree = rt + return + + def plot(self, *args, **kwargs): ''' Plot the results of a single simulation. From bb5f9793e53c92e699dd67a5db759087ee7688d9 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Tue, 27 Jul 2021 11:44:46 +0100 Subject: [PATCH 02/10] include failing test to demonstrate expected usage --- tests/test_analysis.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 2559b76de..2bc123005 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -183,6 +183,22 @@ def test_transtree(): return transtree +def test_recontree(): + sc.heading('Testing reconstructed tree') + + test_label = 'my_test' + tp = cv.test_prob(label=test_label, symp_prob=0.9) + sim = cv.Sim(interventions = tp) + sim.run() + + try: + recontree = sim.make_recontree(test_label, prop_seq=0.5) + recontree.to_newick() + except ImportError as E: + print(f'Could not test conversion to networkx ({str(E)})') + + return recontree + #%% Run as a script if __name__ == '__main__': @@ -197,6 +213,7 @@ def test_transtree(): fit = test_fit() calib = test_calibration() transtree = test_transtree() + recontree = test_recontree() print('\n'*2) sc.toc(T) From 785a5f5a176a19e382b47aa99e3f8f1e0bd80db4 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 28 Jul 2021 10:07:16 +0100 Subject: [PATCH 03/10] add fix from https://github.com/InstituteforDiseaseModeling/covasim/issues/348#issuecomment-887805294 --- covasim/analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/covasim/analysis.py b/covasim/analysis.py index 1aeac3836..54fed83c6 100644 --- a/covasim/analysis.py +++ b/covasim/analysis.py @@ -1526,8 +1526,8 @@ def __init__(self, sim, to_networkx=False, **kwargs): # Next, add edges from linelist for edge in people.infection_log: - self.graph.add_edge(edge['source'],edge['target'],date=edge['date'],layer=edge['layer']) - + if edge['source'] is not None: # Skip seed infections + self.graph.add_edge(edge['source'],edge['target'],date=edge['date'],layer=edge['layer']) return From 84a60cf0440513e29a4c5e2b4f1c057868238a88 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 28 Jul 2021 12:51:02 +0100 Subject: [PATCH 04/10] fix args in constructor call --- covasim/sim.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/covasim/sim.py b/covasim/sim.py index 5803cb574..06f39f38d 100644 --- a/covasim/sim.py +++ b/covasim/sim.py @@ -1254,7 +1254,9 @@ def make_recontree(self, test_label, prop_seq, *args, output=True, **kwargs): del maybe_test tt = self.make_transtree(*args, output=True, to_networkx=True, **kwargs) - rt = cva.ReconTree(self, tt, test_label, prop_seq, *args, **kwargs) + + all_people = self.people + rt = cva.ReconTree(all_people, tt, test_label, prop_seq, **kwargs) if output: return rt else: # pragma: no cover From 6315bd871cbc64bb1cfdaa9903288ae0a7af9a95 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 28 Jul 2021 16:14:47 +0100 Subject: [PATCH 05/10] fix dodgy call to make_transtree --- covasim/sim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/covasim/sim.py b/covasim/sim.py index 06f39f38d..864f3c862 100644 --- a/covasim/sim.py +++ b/covasim/sim.py @@ -1227,7 +1227,7 @@ def make_transtree(self, *args, output=True, **kwargs): return - def make_recontree(self, test_label, prop_seq, *args, output=True, **kwargs): + def make_recontree(self, test_label, prop_seq, output=True, **kwargs): ''' Create a ReconTree (reconstructed tree) object, for use in phylodynamic analysis. See cv.ReconTree() for more information. @@ -1253,7 +1253,7 @@ def make_recontree(self, test_label, prop_seq, *args, output=True, **kwargs): assert isinstance(maybe_test, cvi.test_prob) or isinstance(maybe_test, cvi.test_num) del maybe_test - tt = self.make_transtree(*args, output=True, to_networkx=True, **kwargs) + tt = self.make_transtree(output=True, to_networkx=True) all_people = self.people rt = cva.ReconTree(all_people, tt, test_label, prop_seq, **kwargs) From 6d8995781de7f98414f6027a4d36e0078a07bf6e Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 28 Jul 2021 16:15:47 +0100 Subject: [PATCH 06/10] first pass at reconstruction --- covasim/analysis.py | 386 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 379 insertions(+), 7 deletions(-) diff --git a/covasim/analysis.py b/covasim/analysis.py index 54fed83c6..1888deced 100644 --- a/covasim/analysis.py +++ b/covasim/analysis.py @@ -8,10 +8,12 @@ import pylab as pl import pandas as pd import sciris as sc +import re as re from . import utils as cvu from . import misc as cvm from . import interventions as cvi from . import settings as cvset +from . import people as cvppl from . import plotting as cvpl from . import run as cvr try: @@ -19,7 +21,12 @@ except ImportError as E: # pragma: no cover errormsg = f'Optuna import failed ({str(E)}), please install first (pip install optuna)' op = ImportError(errormsg) - +try: + import networkx as nx +except ImportError as E: # pragma: no cover + errormsg = f'NetworkX import failed ({str(E)}), please install first (pip install networkx)' + op = ImportError(errormsg) +import warnings __all__ = ['Analyzer', 'snapshot', 'age_histogram', 'daily_age_stats', 'daily_stats', 'Fit', 'Calibration', 'TransTree'] @@ -2014,13 +2021,18 @@ def plot_histograms(self, start_day=None, end_day=None, bins=None, width=0.8, fi class ReconTree(Analyzer): - ''' - A class for holding a reconstructed tree. + '''A class for holding a reconstructed tree. + + This works by taking a subsample of the diagnosed infections from a + particular intervention and then computes the reconstructed tree from this + sample (without error). Args: + people (People): the people in the simulation tt (TransTree): the transmission tree object test_label (str): the label of the testing intervention to sample diagnoses from. - prop_seq (float): the probability of including a diagnosed individual. + prob_seq (float): the probability of including a diagnosed individual. + verbose (int): the level of verbosity while computing from 0 to 2 **Example**:: @@ -2029,18 +2041,378 @@ class ReconTree(Analyzer): sim = cv.Sim(interventions = testing) sim.run() rt = sim.make_recontree(test_label, 0.5) + ''' - def __init__(self, tt, test_label, prop_seq, **kwargs): + def __init__(self, people, tt, test_label, prob_seq, verbose=0, **kwargs): + super().__init__(**kwargs) # Initialize the Analyser object + assert isinstance(people, cvppl.People) + assert isinstance(tt, TransTree) + assert isinstance(test_label, str) + assert 0 < prob_seq and prob_seq < 1 + + self.people = people + self.tt = tt self.test_label = test_label - self.prop_seq = prop_seq + self.prob_seq = prob_seq + self.verbose = verbose + + if self.verbose == 2: + print("verbosity set to level 2 in ReconTree.") + + self._inf_log = tt.infection_log + if self.verbose == 2: + print("finding the UIDs of the seed nodes from the infection log...") + self._seed_uids = set(e["target"] for e in self._inf_log + if e["layer"] == "seed_infection") + if self.verbose >= 1: + print("The UIDs of the seed infections are") + print(self._seed_uids) + + self._diagnosed = set(p for p in self.people if p.diagnosed) + self._is_diagnosed = {p.uid: p.diagnosed for p in self.people} + self._diagnosis_date = {dp.uid: dp.date_diagnosed + for dp in self._diagnosed} + self._infection_date = {p.uid: p.date_exposed for p in self.people + if not np.isnan(p.date_exposed)} + + self._sequenced = None + self._subset_diagnosed_for_sequencing() + + self._leaf_nodes = None + self._compute_leaf_nodes_dict() + + self.reconstructed_tree = {s: None for s in self._seed_uids} + self.root_label = {s: None for s in self._seed_uids} + self._reconstruct_trees() + + self._parse_root_id = self.__parse_factory(r'^root ([0-9]+) infected on [\.0-9]+$', lambda x: x) + self._parse_diag = self.__parse_factory(r'^diagnosis of ([0-9]+) on [\.0-9]+$', lambda x: x) + self._parse_node_time = self.__parse_factory(r'on ([\.0-9]+)$', float) + self.newick = {s: None for s in self._seed_uids} return + + def __parse_factory(self, pattern, finalise): + ''' + Closure for a parser which can be used to extract values from the node + labels. + ''' + + def parser(string): + if self.verbose == 2: + print("parser with pattern {p} called on string {s}".format(p=pattern, s=string)) + + maybe_match = re.search(pattern, string) + if maybe_match is None: + raise Exception('could not parse the string: ' + + string + + '\ngiven pattern: ' + + pattern) + else: + return finalise(maybe_match.group(1)) + return parser + + def to_newick(self): ''' Return a Newick string representation of the reconstructed tree. + + **Example**:: + + test_label = "my_testing" + testing = cv.test_prob(label=test_label, symp_prob=0.5) + sim = cv.Sim(interventions = testing) + sim.run() + rt = sim.make_recontree(test_label, 0.5) + rt.to_newick() + ''' - raise NotImplementedError + for s in self.reconstructed_tree: + if len(list(self.reconstructed_tree[s].nodes)) > 2: + self._make_newick(s) + + return self.newick + + + def _make_newick(self, seed_uid): + ''' + tree ==> descendant_list [ root_label ] [ : branch_length ] ; + ''' + if self.verbose == 2: + print("making newick for node {n}".format(n=seed_uid)) + + r_label = self._parse_root_id(self.root_label[seed_uid]) + r_time = self._parse_node_time(self.root_label[seed_uid]) + ss = list(self.reconstructed_tree[seed_uid].successors(self.root_label[seed_uid])) + s_time = self._parse_node_time(ss[0]) + b_length = str(s_time - r_time) + d_string = self._descendent_list( + self.reconstructed_tree[seed_uid], + self.root_label[seed_uid], r_time + ) + self.newick[seed_uid] = d_string + r_label + ':' + b_length + ';' + + return + + + def _descendent_list(self, rt, n_label, pred_time): + ''' + descendant_list ==> ( subtree { , subtree } ) + ''' + succs = list(rt.successors(n_label)) + return '(' + ','.join([self._subtree(rt, succ, pred_time) for succ in succs])+ ')' + + + def _subtree(self, rt, n_label, pred_time): + ''' + subtree ==> descendant_list [internal_node_label] [: branch_length] + ==> leaf_label [: branch_length] + ''' + ss = list(rt.successors(n_label)) + curr_time = self._parse_node_time(n_label) + if ss: + succ_time = self._parse_node_time(ss[0]) + branch_length = str(succ_time - curr_time) + if len(ss) > 1: + assert succ_time > curr_time, 'current time is {c} but successor time is {s}'.format(c=curr_time, s=succ_time) + return self._descendent_list(rt, n_label, curr_time) + ':' + branch_length + else: + is_inf = re.match(r'^infection by ([0-9]+) on [\.0-9]+$', n_label) + if is_inf: + assert succ_time > curr_time, 'current time is {c} but successor time is {s}'.format(c=curr_time, s=succ_time) + return self._descendent_list(rt, n_label, curr_time) + ':' + branch_length + else: + return self._descendent_list(rt, n_label, curr_time) + self._parse_diag(n_label) + ':' + branch_length + else: + _diag_time = self._parse_node_time(n_label) + branch_length = str(_diag_time - pred_time) + return self._parse_diag(n_label) + ':' + branch_length + + + def _subset_diagnosed_for_sequencing(self): + '''randomly subsample the diagnosed individuals for inclusion in the + reconstructed tree.''' + if self.verbose > 0: + print("subsampling the diagnosed individuals for sequencing.") + if self._sequenced is None: + num_seq = np.random.binomial(len(self._diagnosed), self.prob_seq) + if self.verbose > 1: + print("{num_seq} diagnosed individuals were selected for sequencing.".format(num_seq=num_seq)) + self._sequenced = set( + np.random.choice( + np.array(list(self._diagnosed)), + size = num_seq, + replace = False + ) + ) + + return + + def seeding_ancestor(self, n): + '''the seed that has the given node as a descendent.''' + if self.verbose == 2: + print("looking for the ancestor of {n}".format(n=n)) + assert self.tt.graph.has_node(n) + ancs = list(self.tt.graph.predecessors(n)) + if len(ancs) == 0: + return n + else: + return self.seeding_ancestor(ancs[0]) + + def _compute_leaf_nodes_dict(self): + '''construct the dictionary of sequenced samples descending from each seed to + act as the leaves of each tree.''' + self._leaf_nodes = {s: set() for s in self._seed_uids} + for seq_node in self._sequenced: + self._leaf_nodes[self.seeding_ancestor(seq_node.uid)].add(seq_node.uid) + + return + + + def _reconstruct_trees(self): + for seed_uid in self._seed_uids: + l2r_uids = self._leaf_to_root(seed_uid) + self._root_to_leaf(seed_uid, l2r_uids) + + return + + + def _leaf_to_root(self, seed_uid): + '''return the UIDs of the people that are relevant to the reconstruction of the + tree with the seed and with the given leaves.''' + leaf_people = self._leaf_nodes[seed_uid] + curr_people = set(leaf_people) + next_people = set() + result = set(p for p in curr_people) + + loop_counter = 0 + max_loops = 1000 # TODO this should be equal to the number of + # generations of transmission. + while loop_counter < max_loops: + for cp in curr_people: + for np in self.tt.graph.predecessors(cp): + if np is not None: + next_people.add(np) + + if len(next_people) > 0: + for p in next_people: + result.add(p) + curr_people, next_people = next_people, set() + loop_counter += 1 + else: + break + + return result + + def _root_to_leaf(self, seed_uid, l2r_uids): + rt = self.tt.graph.subgraph(l2r_uids).copy() + curr_nodes = [seed_uid] + loop_count = 0 + + # TODO this should be equal to the depth of the current value of the + # reconstructed tree. + max_loops = 1000 + + while len(curr_nodes) > 0 and loop_count < max_loops: + loop_count += 1 + cn = curr_nodes.pop() + + if not rt.has_node(cn) and cn in self._seed_uids: + warnings.warn("node {cn} does not appear where expected...".format(cn=cn), RuntimeWarning) + continue + + succs = list(rt.successors(cn)) + num_succs = len(succs) + curr_nodes = succs + curr_nodes + + preds = list(rt.predecessors(cn)) + num_preds = len(preds) + if num_preds == 1: + if num_succs == 1: + if self._is_diagnosed[cn]: + self._resolve_diagnosed(rt, cn) + else: + self._remove_undiagnosed(rt, cn) + elif num_succs > 1: + self._split_node(rt, cn, preds[0], succs) + else: + # because in this branch we know the number of successors + # is zero so this must be a diagnosis node because we are + # working with the reduced tree.. + leaf_name = "diagnosis of {cn} on {d}".format(cn=cn, d=self._diagnosis_date[cn]) + + if self.verbose == 2: + print("relabelling node {cn} to \"{nn}\"".format(cn=cn, nn=leaf_name)) + + nx.relabel.relabel_nodes(rt, {cn: leaf_name}, copy=False) + elif num_preds == 0: + root_label = "root {cn} infected on {inf_d}".format(cn=cn, inf_d=self._infection_date[cn]) + self.root_label[seed_uid] = root_label + nx.relabel.relabel_nodes(rt, {cn: root_label}, copy=False) + else: + raise Exception("violation of tree structure.") + + self.reconstructed_tree[seed_uid] = rt + + return + + + def _resolve_diagnosed(self, rt, curr_node): + + new_id = "diagnosis of {curr_node} on {diag_date}".format( + curr_node=curr_node, + diag_date=self._diagnosis_date[curr_node] + ) + + if self.verbose == 2: + print("relabelling node {cn} to \"{nn}\"".format(cn=curr_node, nn=new_id)) + + nx.relabel.relabel_nodes(rt, {curr_node: new_id}, copy=False) + return + + def _split_node(self, rt, curr_node, pred, succs): + if self._is_diagnosed[curr_node]: + self._split_diagnosed(rt, curr_node, pred, succs) + else: + self._split_undiagnosed(rt, curr_node) + return + + def _split_diagnosed(self, rt, curr_node, pred, succs): + + if self.verbose == 2: + print("splitting the node of {curr_node}".format(curr_node=curr_node)) + + diag_date = self._diagnosis_date[curr_node] + inf_dates = list(set(self._infection_date[s] for s in succs)) + inf_dates.sort() + + if diag_date in inf_dates: + raise NotImplemented("case of diagnosis occurring on the same day as infection.") + else: + pre_diag_inf_dates = filter(lambda d: d < diag_date, inf_dates) + post_diag_inf_dates = filter(lambda d: d > diag_date, inf_dates) + + tmp = pred + for inf_d in pre_diag_inf_dates: + ss = filter(lambda s: self._infection_date[s] == inf_d, succs) + inf_node_id = "infection by {n} on {inf_d}".format(n=curr_node, inf_d=inf_d) + rt.add_node(inf_node_id) + rt.add_edge(tmp, inf_node_id) + for s in ss: + rt.add_edge(inf_node_id, s) + tmp = inf_node_id + + nid = "diagnosis of {n} on {d}".format(n=curr_node, d=diag_date) + rt.add_node(nid) + rt.add_edge(tmp, nid) + tmp = nid + + for inf_d in post_diag_inf_dates: + ss = filter(lambda s: self._infection_date[s] == inf_d, succs) + inf_node_id = "infection by {n} on {inf_d}".format(n=curr_node, inf_d=inf_d) + rt.add_node(inf_node_id) + rt.add_edge(tmp, inf_node_id) + for s in ss: + rt.add_edge(inf_node_id, s) + tmp = inf_node_id + + rt.remove_node(curr_node) + + return + + + def _split_undiagnosed(self, rt, curr_node): + preds = list(rt.predecessors(curr_node)) + succs = list(rt.successors(curr_node)) + + inf_dates = list(set(self._infection_date[s] for s in succs)) + inf_dates.sort() + + tmp = preds[0] + for inf_d in inf_dates: + ss = [s for s in succs if self._infection_date[s] == inf_d] + inf_node_id = "infection by {n} on {inf_d}".format(n=curr_node, inf_d=inf_d) + rt.add_node(inf_node_id) + rt.add_edge(tmp, inf_node_id) + for s in ss: + rt.add_edge(inf_node_id, s) + tmp = inf_node_id + rt.remove_node(curr_node) + return + + + def _remove_undiagnosed(self, rt, curr_node): + + preds = list(rt.predecessors(curr_node)) + succs = list(rt.successors(curr_node)) + + assert len(preds) == 1 and len(succs) == 1 + + rt.add_edge(preds[0], succs[0]) + rt.remove_node(curr_node) + + return From b9805674201034e2c2a1cd67d435f18b3726bf34 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 28 Jul 2021 16:23:19 +0100 Subject: [PATCH 07/10] make size of test so edge case sometimes hit --- tests/test_analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 2bc123005..dafb69c3f 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -188,11 +188,11 @@ def test_recontree(): test_label = 'my_test' tp = cv.test_prob(label=test_label, symp_prob=0.9) - sim = cv.Sim(interventions = tp) + sim = cv.Sim(pop_size = 200, interventions = tp) sim.run() try: - recontree = sim.make_recontree(test_label, prop_seq=0.5) + recontree = sim.make_recontree(test_label, prop_seq=0.9) recontree.to_newick() except ImportError as E: print(f'Could not test conversion to networkx ({str(E)})') From c5bca747d2ad37e28060e95eb666f314f7bfa046 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 28 Jul 2021 16:49:49 +0100 Subject: [PATCH 08/10] fix edge case when dates collide --- covasim/analysis.py | 71 +++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/covasim/analysis.py b/covasim/analysis.py index 1888deced..b1d08393d 100644 --- a/covasim/analysis.py +++ b/covasim/analysis.py @@ -2342,7 +2342,13 @@ def _split_node(self, rt, curr_node, pred, succs): return def _split_diagnosed(self, rt, curr_node, pred, succs): - + ''' + Args: + rt: current tree object. + curr_node: the node to be split. + pred: the predecessor of the current node. + succs: a list of the successor nodes of the current node. + ''' if self.verbose == 2: print("splitting the node of {curr_node}".format(curr_node=curr_node)) @@ -2350,37 +2356,38 @@ def _split_diagnosed(self, rt, curr_node, pred, succs): inf_dates = list(set(self._infection_date[s] for s in succs)) inf_dates.sort() - if diag_date in inf_dates: - raise NotImplemented("case of diagnosis occurring on the same day as infection.") - else: - pre_diag_inf_dates = filter(lambda d: d < diag_date, inf_dates) - post_diag_inf_dates = filter(lambda d: d > diag_date, inf_dates) - - tmp = pred - for inf_d in pre_diag_inf_dates: - ss = filter(lambda s: self._infection_date[s] == inf_d, succs) - inf_node_id = "infection by {n} on {inf_d}".format(n=curr_node, inf_d=inf_d) - rt.add_node(inf_node_id) - rt.add_edge(tmp, inf_node_id) - for s in ss: - rt.add_edge(inf_node_id, s) - tmp = inf_node_id - - nid = "diagnosis of {n} on {d}".format(n=curr_node, d=diag_date) - rt.add_node(nid) - rt.add_edge(tmp, nid) - tmp = nid - - for inf_d in post_diag_inf_dates: - ss = filter(lambda s: self._infection_date[s] == inf_d, succs) - inf_node_id = "infection by {n} on {inf_d}".format(n=curr_node, inf_d=inf_d) - rt.add_node(inf_node_id) - rt.add_edge(tmp, inf_node_id) - for s in ss: - rt.add_edge(inf_node_id, s) - tmp = inf_node_id - - rt.remove_node(curr_node) + # Section 2.1 of the covasim paper says that in interventions are + # applied before transmission events on each day. Hence when a + # diagnosis and an infection occur on the same day, we can assume that + # the diagnosis happened first. + pre_diag_inf_dates = filter(lambda d: d < diag_date, inf_dates) + post_diag_inf_dates = filter(lambda d: d >= diag_date, inf_dates) + + tmp = pred + for inf_d in pre_diag_inf_dates: + ss = filter(lambda s: self._infection_date[s] == inf_d, succs) + inf_node_id = "infection by {n} on {inf_d}".format(n=curr_node, inf_d=inf_d) + rt.add_node(inf_node_id) + rt.add_edge(tmp, inf_node_id) + for s in ss: + rt.add_edge(inf_node_id, s) + tmp = inf_node_id + + nid = "diagnosis of {n} on {d}".format(n=curr_node, d=diag_date) + rt.add_node(nid) + rt.add_edge(tmp, nid) + tmp = nid + + for inf_d in post_diag_inf_dates: + ss = filter(lambda s: self._infection_date[s] == inf_d, succs) + inf_node_id = "infection by {n} on {inf_d}".format(n=curr_node, inf_d=inf_d) + rt.add_node(inf_node_id) + rt.add_edge(tmp, inf_node_id) + for s in ss: + rt.add_edge(inf_node_id, s) + tmp = inf_node_id + + rt.remove_node(curr_node) return From 5f22c6ffb84444c598756f836cd9a88f5d726c95 Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 28 Jul 2021 16:52:28 +0100 Subject: [PATCH 09/10] update example in comment so it runs quickly --- covasim/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covasim/analysis.py b/covasim/analysis.py index b1d08393d..ec81646a3 100644 --- a/covasim/analysis.py +++ b/covasim/analysis.py @@ -2125,7 +2125,7 @@ def to_newick(self): test_label = "my_testing" testing = cv.test_prob(label=test_label, symp_prob=0.5) - sim = cv.Sim(interventions = testing) + sim = cv.Sim(pop_size=500, interventions = testing) sim.run() rt = sim.make_recontree(test_label, 0.5) rt.to_newick() From 052aeca5f085e2826cf95302f65aaf7e7a7f9eea Mon Sep 17 00:00:00 2001 From: Alex Zarebski Date: Wed, 11 Aug 2021 17:59:39 +0100 Subject: [PATCH 10/10] documentation --- covasim/analysis.py | 17 +++++++++++++++++ docs/genetics.org | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 docs/genetics.org diff --git a/covasim/analysis.py b/covasim/analysis.py index ec81646a3..e1224a88b 100644 --- a/covasim/analysis.py +++ b/covasim/analysis.py @@ -2034,6 +2034,13 @@ class ReconTree(Analyzer): prob_seq (float): the probability of including a diagnosed individual. verbose (int): the level of verbosity while computing from 0 to 2 + Attributes: + reconstructed_tree (dict): a dictionary containing the reconstructed + trees index by the seed infection. + newick (dict): a dictionary containing the Newick string representations + indexed by the seed infection. Available after a call to the + `to_newick` method. + **Example**:: test_label = "my_testing" @@ -2042,6 +2049,12 @@ class ReconTree(Analyzer): sim.run() rt = sim.make_recontree(test_label, 0.5) + Todo: + * Provide an option to contruct an ete3 tree. + * Use node and edge attributes rather than the intermediate string + encoding for performance. + * Avoid the hard-coded safety limit of 1000 iterations in the traversal + depth. ''' def __init__(self, people, tt, test_label, prob_seq, verbose=0, **kwargs): @@ -2259,11 +2272,15 @@ def _leaf_to_root(self, seed_uid): next_people.add(np) if len(next_people) > 0: + # there are still people that need to be processed so we add + # them to the result and loop back. for p in next_people: result.add(p) curr_people, next_people = next_people, set() loop_counter += 1 else: + # there is no-one left to process so we can break out of the + # loop and return the result. break return result diff --git a/docs/genetics.org b/docs/genetics.org new file mode 100644 index 000000000..77d04bd66 --- /dev/null +++ b/docs/genetics.org @@ -0,0 +1,40 @@ +#+title: Phylogenetics + +To compile this document to RST use the following command. + +#+begin_src sh +pandoc --from=org --to=rst --output=genetics.rst genetics.org +#+end_src + +* Epidemic trees + +A simulation can be represented by a tree in several ways. The /transmission +tree/ is more popular in epidemiology while the /reconstructed tree/ is the main +object of study in phylodynamics ([[https://doi.org/10.1534/genetics.113.154856][Ypma /et al/ (2013)]]). + +** =TransTree= + +The =TransTree= class corresponds to a transmission tree, a representation of a +simulation where each node is an infected individual and edges correspond to +infection. This tree represents who infected whom in the whole simulation, with +attributes of the nodes encoding when these events occurred. Since there can be +multiple seed infections this is actually a forest rather than a single tree. + +** =ReconTree= + +The =ReconTree= class corresponds to a reconstructed tree, a representation of +the reconstructed tree generated from a random sample of the diagnosed +individuals (assuming there is sufficient genetic signal to correctly resolve +the genealogy). In this tree, internal nodes correspond to infection events and +leaf nodes correspond to sequenced viral samples. Due to the assumptions of +discrete time (days) and the possibility of post-diagnosis infections in covasim +the reconstructed tree can be [[https://en.wikipedia.org/wiki/Phylogenetic_tree#Bifurcating_versus_multifurcating][multifurcating]] and have [[https://doi.org/10.1371/journal.pcbi.1003919][sampled ancestors]]. Since +there can be multiple seed infections the reconstructed tree is actually a +forest. + +The computation of the reconstructed tree requires two traversals of the +transmission tree: the first goes from the leaves to root to determine which +nodes are relevant to the reconstructed tree, ie which have sequenced +descendents, the second traversal (on a copy of the transmission tree containing +just those relevant nodes) goes from root to leaf splitting each node into +multiple nodes representing infection and testing events.