-
Notifications
You must be signed in to change notification settings - Fork 1
/
Simulate_pan_genome.py
131 lines (110 loc) · 3.59 KB
/
Simulate_pan_genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Script for simulating pan-genomes. Outputs a Roary-like gene_presence_absence.csv and a Traits file
# Use: Simulate_pan_genome.py causal_gene_penetrance desired_number_of_samples
# Author: Ola Brynildsrud
import random
import copy
import sys
class SimStrain:
def __init__(self, genome=None, core_genes=3000, pan_genes=6000, trait=0):
if genome is None:
self.genome = self.create_genome(core_genes, pan_genes)
else:
self.genome = genome
self.trait = trait
def mutate(self):
try:
for gene in self.genome:
if random.random() <= self.genome[gene].mutchance:
self.genome[gene].switchstate()
if self.genome[gene].name == "causal":
if self.genome["causal"].present:
if random.random() <= penetrance:
self.trait = 1
else:
self.trait = 0
else:
if random.random() <= penetrance:
self.trait = 0
else:
self.trait = 1
except TypeError:
print(self.genome)
def create_genome(self,core_genes=3000, pan_genes=6000):
genome = {}
for x in xrange(core_genes):
genome["gene_" + str(x)] = Gene("gene_" + str(x), mutchance=0, present=1, causal=False)
for y in xrange(core_genes, (core_genes + pan_genes)):
genome["gene_" + str(y)] = Gene("gene_" + str(y), mutchance=(random.random()/100), present=0, causal=False)
genome["causal"] = Gene("causal", mutchance=0.01,present=0,causal=True)
return genome
def switchTrait(self):
if self.trait == 1:
self.trait = 0
else:
self.trait = 1
class Gene:
def __init__(self, name, mutchance,present=0,causal=False):
self.name = name
if mutchance is None:
self.mutchance = random.random() / 100
else:
self.mutchance = mutchance
self.causal = causal
self.present=present
def switchstate(self):
if self.present == 1:
self.present = 0
else:
self.present = 1
def main():
root = SimStrain()
Genome_collection = {}
Genome_collection["root"] = root
Number_of_completed_genomes = 1
add_these_genomes = []
while Number_of_completed_genomes < number_of_tips:
for genome in Genome_collection:
Genome_collection[genome].mutate()
if random.random() <= 0.01:
# MUTATE AND BRANCH. Check success to determine which genome will mutate next
new_genome = copy.deepcopy(Genome_collection[genome])
new_genome.mutate()
add_these_genomes.append(new_genome)
# Add the additional genomes
for newgenome in add_these_genomes:
try:
Genome_collection[strain_names.pop()] = newgenome
except IndexError:
break
Number_of_completed_genomes += 1
add_these_genomes = []
print Number_of_completed_genomes
# Write genome presence/absence matrix and phenotype
with open("Gene_presence_absence.csv", "w") as gpa:
with open("Trait.csv","w") as t:
# Header line:
header = ","
for Strain in sorted(Genome_collection.keys()):
header = header + Strain + ","
header = header.rstrip(",") + "\n"
gpa.write(header)
# Gene presence absence matrix
for gene in sorted(root.genome.keys()):
line = gene + ","
for Strain in sorted(Genome_collection.keys()):
line = line + str(Genome_collection[Strain].genome[gene].present) + ","
line = line.rstrip(",") + "\n"
gpa.write(line)
# Traits file
t.write(",Trait\n")
for Strain in sorted(Genome_collection.keys()):
line = Strain + "," + str(Genome_collection[Strain].trait) + "\n"
t.write(line)
if __name__ == '__main__':
# Set parameters
penetrance = float(sys.argv[1])
number_of_tips = int(sys.argv[2])
strain_names = [str(x) for x in xrange(number_of_tips)]
main()