-
Notifications
You must be signed in to change notification settings - Fork 0
/
SCBD.py
136 lines (102 loc) · 3.74 KB
/
SCBD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
'''
Romain Derelle (Imperial College London)
contact: [email protected]
'''
import sys
import os
import csv
import collections
import math
import statistics
import kneed
import argparse
def parse_args():
# define and parse command-line arguments
parser = argparse.ArgumentParser(description=' SCBD-LCBD analysis', add_help=False, formatter_class=argparse.RawTextHelpFormatter, epilog=' \n')
common = parser.add_argument_group()
common.add_argument('-m', help='path to the normalised matrix file [required]', metavar='')
common.add_argument('-h','-help', action="help", help="show this help message and exit")
args = parser.parse_args()
return args.m
file_matrix = parse_args()
## sanity check
if not file_matrix:
sys.exit('\n ERROR: please specify a matrix file (-m option)\n\n')
elif not os.path.isfile(file_matrix):
sys.exit('\n ERROR: the path to the matrix file is incorrect\n\n')
## get all data from CSV file
print('read input matrix')
nb_line = 0
counter_feature = 0
feature_names = dict()
feature_values = dict()
raw_data = dict()
file_content = csv.reader(open(file_matrix), delimiter=',')
for line in file_content:
nb_line += 1
# extract column names
if nb_line == 1:
sample_names = dict()
total = dict()
for i,k in enumerate(line):
sample_names[i] = k
total[sample_names[i]] = 0
# extract feature values
elif nb_line > 1:
counter_feature += 1
name = line[0]
feature_names[counter_feature] = name
# get data
raw_data[counter_feature] = line[1:]
feature_values[counter_feature] = dict()
for n in range(1, len(line)):
nb_reads = line[n]
sample = sample_names[n]
feature_values[counter_feature][sample] = float(nb_reads)
total[sample] += float(nb_reads)
## perform Hellinger transformation (square root of the fraction)
print('perform Hellinger transformation')
hellinger = collections.defaultdict(dict)
for feature in feature_values:
for sample in feature_values[feature]:
nn = math.sqrt(feature_values[feature][sample] / total[sample])
hellinger[feature][sample] = nn
## get mean value for each feature
print('calculate means and SS value')
feature_means = dict()
for feature in hellinger:
feature_means[feature] = statistics.mean([v for v in hellinger[feature].values()])
## build S matrix
S_matrix = collections.defaultdict(dict)
for feature in hellinger:
for sample in hellinger[feature]:
S_matrix[sample][feature] = math.pow(hellinger[feature][sample] - feature_means[feature], 2) ## invert rows and columns
## calculate SS value
SS_total = 0
for sample in S_matrix:
for feature in S_matrix[sample]:
SS_total += S_matrix[sample][feature]
## calculate SCBD value for each feature
print('calculate SCBD values')
SCBD = dict()
for feature in feature_names:
tmp_sum = 0
for sample in S_matrix:
tmp_sum += S_matrix[sample][feature]
SCBD[feature] = tmp_sum / SS_total
## sort the SCBD by values (decreasing order)
l_sorted_SCBD = list()
for key, value in sorted(SCBD.items(), key=lambda x: x[1], reverse=True):
l_sorted_SCBD.append(key)
## save SCBD values
print('-> save SCBD values to file')
out_SCBD = open('out_SCBD_values.tsv', 'w+')
for feature in l_sorted_SCBD:
out_SCBD.write(feature_names[feature] + ' ' + str(SCBD[feature]) + '\n')
out_SCBD.close()
## save sorted matrix
print('-> save sorted matrix to file')
out_matrix = open('out_SCBD_matrix.csv', 'w+')
for feature in l_sorted_SCBD:
out_matrix.write(feature_names[feature] + ',' + ','.join(raw_data[feature]) + '\n')
out_matrix.close()