-
Notifications
You must be signed in to change notification settings - Fork 2
/
Get_homoplasy_stats.py
executable file
·75 lines (52 loc) · 2.41 KB
/
Get_homoplasy_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
import os, sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from modules.Si_nexus import *
from modules.Si_general import *
from modules.Si_SeqIO import *
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pylab
tabfile=open(sys.argv[1], "rU")
SNPs=[]
for line in tabfile:
words= line.strip().split()
if len(words)==3 and words[1]=="SNP":
SNPs.append({})
SNPs[-1]["location"]=words[2]
elif len(words)>1 and words[1][0]=="/":
qualifier=words[1][1:].split("=")[0].strip()
value=' '.join(words[1:]).replace("/"+qualifier+"=","")
SNPs[-1][qualifier]=value.replace('"',"")
locations=set([])
homoplasylocations=set([])
nonhomlocations=set([])
SNPcount=0
homoplasycount=0
print '\t'.join(['Number of SNP bases', 'Number of non-homoplasy bases', 'number of homoplasy bases', '% of bases that are homoplasies', 'Total number of SNPs', 'Total number of non-homoplasic SNPs', 'Total number of homoplasic SNPs', '% of SNPs that are homoplasic'])
for SNP in SNPs:
homoplasy=False
if "homoplasy" in SNP:
homoplasy=True
if not int(SNP["location"]) in locations:
locations.add(int(SNP["location"]))
SNPcount+=1
if homoplasy and not int(SNP["location"]) in homoplasylocations:
homoplasylocations.add(int(SNP["location"]))
elif homoplasy:
homoplasycount+=1
elif not int(SNP["location"]) in nonhomlocations:
nonhomlocations.add(int(SNP["location"]))
print '\t'.join(map(str,[len(locations), len(locations)-len(homoplasylocations), len(homoplasylocations), (float(len(homoplasylocations))/len(locations))*100, SNPcount, SNPcount-homoplasycount, homoplasycount, (float(homoplasycount)/SNPcount)*100]))
print 'Note: For total non-homoplasies, total homoplasies and % of SNPs that are homoplasic, for each site where homoplasy occurs, one SNP is considered non-homoplasic and the remainder are counted as homoplasic'
legend_colours=[(pylab.Rectangle((0, 0), 1, 1, fc="b")), (pylab.Rectangle((0, 0), 1, 1, fc="r"))]
n, bins, patches = plt.hist(map(list,[nonhomlocations, homoplasylocations]), bins=50, color=["b", "r"], histtype="barstacked")
plt.legend(legend_colours, ["Non homoplasy sites", "Homoplasy sites"])
plt.xlabel("Genome position")
plt.ylabel("SNP Frequency")
plt.xlim(0,1100000)
plt.show()
#print missinglist