forked from GFA-RDIP/Morgana
-
Notifications
You must be signed in to change notification settings - Fork 0
/
nodes.py
173 lines (134 loc) · 4.27 KB
/
nodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import pandas as pd
import numpy as np
from math import sqrt
from scipy.stats import chi2_contingency
from functools import reduce
def is_binary(col):
return is_categorical(col, 2)
def is_categorical(col, cats=5):
try:
valid = col[~np.isnan(col)]
values = set(valid)
return len(values) <= cats
except TypeError:
return False
def filter_binary_cols(df):
return filter_categorical_cols(df, 2)
def filter_categorical_cols(df, cats=5):
cols = [col for col in df.columns if is_categorical(df[col], cats)]
return cols
def sort_cols(df, cols):
# Sort columns by number of non-NA entries
return sorted(cols, key=lambda x: length(df[x]), reverse=True)
def read_data(path):
return pd.read_csv(path)
def column(m, col):
return m[col]
def norm(v):
return sqrt(np.dot(v, v))
def contingency_table(x, y):
xvals = list(set(x))
yvals = list(set(y))
contingency = np.array([np.zeros(len(xvals)) for _ in range(len(yvals))])
for xv, yv in zip(x, y):
xi = xvals.index(xv)
yi = yvals.index(yv)
contingency[yi][xi] += 1
return contingency
def corr_str(x, y):
contingency = contingency_table(x, y)
chi2, p, dof, expected = chi2_contingency(contingency)
return 1 - p
def correlation(v1, v2, f=corr_str):
"""v1 v2 are numpy vectors"""
# Extract indices where they are defined
validIndex = np.logical_and(~np.isnan(v1), ~np.isnan(v2))
v1 = v1[validIndex]
v2 = v2[validIndex]
# What to do if one is zero?
return f(v1, v2)
def correlation_matrix(df, cols=None, binop=correlation):
"""df is a dataframe"""
if not cols:
cols = df.columns
answer = []
for col1 in cols:
row = []
for col2 in cols:
if col1 == col2:
row.append(np.double(0))
else:
row.append(binop(df[col1], df[col2]))
answer.append(row)
return answer
def size(v):
v = v[~np.isnan(v)]
values = set(v)
if 0.0 in values:
return sum(v[v != 0.0]) / len(v)
else:
# Not numeric; just ignore the first thing seen for
# demo purposes
return sum(v[v != v[0]]) / len(v)
def length(v, w=None):
if w is None:
w = v
vt = ~np.isnan(v)
wt = ~np.isnan(w)
valid = np.logical_and(vt, wt)
return sum(valid)
def network_layout(corrs, sizes, labels, l):
answer = {'nodes': [], 'edges': []}
for id, size in enumerate(sizes):
answer['nodes'].append({'id': 'n%s' % id,
'size': size,
'label': labels[id]})
counter = 0
for i, row in enumerate(corrs):
for j, corr in enumerate(row):
# Symmetric: only do one direction
if i < j:
answer['edges'].append(
{'id': 'e%s' % counter,
'source': 'n%s' % i,
'target': 'n%s' % j,
'num': float(l[i][j]),
'weight': corr})
counter += 1
return answer
def data_network(df, cols, labels=None):
labels = labels or cols
df = df[cols]
corr = correlation_matrix(df, cols)
l = correlation_matrix(df, cols, length)
sizes = [size(df[col]) for col in cols]
network = network_layout(corr, sizes, labels, l)
return network
def data_tree(df, cols, labels, name="root", variable=None, value=None):
"Only use on categorical data"
if cols:
col = cols[0]
rest = cols[1:]
vals = set(v for v in df[col] if not np.isnan(v))
children = []
for val in vals:
children.append(
data_tree(df[df[col] == val], rest, labels, labels[col][val], col, val))
ans = {'name': name, 'children': children}
if variable:
ans['_variable'] = variable
if value:
ans['_value'] = value
return ans
else:
return {'name': name, 'size': len(df)}
def decorate_id_tree(tree, id=0):
tree['id'] = id
id += 1
if tree.get('children'):
for child in tree['children']:
id = decorate_id_tree(child, id)
return id
def _assign_attr_default(col, attr, vals):
# Warning: Type hack
return vals.get(int(float(attr)), "%s: %s" % (col, attr))