-
Notifications
You must be signed in to change notification settings - Fork 0
/
genie3.py
136 lines (96 loc) · 4.89 KB
/
genie3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#In[0]
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import numpy as np
import time
from operator import itemgetter
from multiprocessing import Pool
from scipy.spatial.distance import pdist, squareform
#In[1]
def GENIE3(expr_data,gene_names=None,regulators='all',tree_method='RF',K='sqrt',ntrees=1000,nthreads=1):
'''Computation of tree-based scores for all putative regulatory links.
Parameters
----------
expr_data: numpy array
Array containing gene expression values. Each row corresponds to a condition and each column corresponds to a gene.
gene_names: list of strings, optional
List of length p, where p is the number of columns in expr_data, containing the names of the genes. The i-th item of gene_names must correspond to the i-th column of expr_data.
default: None
regulators: list of strings, optional
List containing the names of the candidate regulators. When a list of regulators is provided, the names of all the genes must be provided (in gene_names). When regulators is set to 'all', any gene can be a candidate regulator.
default: 'all'
tree-method: 'RF' or 'ET', optional
Specifies which tree-based procedure is used: either Random Forest ('RF') or Extra-Trees ('ET')
default: 'RF'
K: 'sqrt', 'all' or a positive integer, optional
Specifies the number of selected attributes at each node of one tree: either the square root of the number of candidate regulators ('sqrt'), the total number of candidate regulators ('all'), or any positive integer.
default: 'sqrt'
ntrees: positive integer, optional
Specifies the number of trees grown in an ensemble.
default: 1000
nthreads: positive integer, optional
Number of threads used for parallel computing
default: 1
Returns
-------
An array in which the element (i,j) is the score of the edge directed from the i-th gene to the j-th gene. All diagonal elements are set to zero (auto-regulations are not considered). When a list of candidate regulators is provided, the scores of all the edges directed from a gene that is not a candidate regulator are set to zero.
'''
print('Tree method: {}, K: {}, Number of trees: {} \n'.format(tree_method,K,ntrees))
ngenes = expr_data.shape[1]
# Get the indices of the candidate regulators
if regulators == 'all':
input_idx = list(np.arange(ngenes))
else:
input_idx = [i for i, gene in enumerate(gene_names) if gene in regulators]
# Learn an ensemble of trees for each target gene, and compute scores for candidate regulators
VIM = np.zeros((ngenes,ngenes))
if nthreads > 1:
#print('running jobs on %d threads' % nthreads)
input_data = list()
for i in range(ngenes):
input_data.append( [expr_data,i,input_idx,tree_method,K,ntrees] )
pool = Pool(nthreads)
alloutput = pool.map(wr_GENIE3_single, input_data)
for (i,vi) in alloutput:
VIM[i,:] = vi
else:
#print('running single threaded jobs')
for i in range(ngenes):
if (i+1)%20==0:
print('Gene %d/%d...' % (i+1,ngenes))
vi = GENIE3_single(expr_data,i,input_idx,tree_method,K,ntrees)
VIM[i,:] = vi
return np.transpose(VIM)
def wr_GENIE3_single(args):
return([args[1], GENIE3_single(args[0], args[1], args[2], args[3], args[4], args[5])])
def GENIE3_single(expr_data,output_idx,input_idx,tree_method,K,ntrees):
ngenes = expr_data.shape[1]
# Expression of target gene
output = expr_data[:,output_idx]
# Normalize output data
if np.std(output) == 0:
output = np.zeros(len(output))
else:
output = output / np.std(output)
# Remove target gene from candidate regulators
input_idx = input_idx[:]
if output_idx in input_idx:
input_idx.remove(output_idx)
expr_data_input = expr_data[:,input_idx]
# Parameter K of the tree-based method
if (K == 'all') or (isinstance(K,int) and K >= len(input_idx)):
max_features = "auto"
else:
max_features = K
if tree_method == 'RF':
treeEstimator = RandomForestRegressor(n_estimators=ntrees,max_features=max_features)
elif tree_method == 'ET':
treeEstimator = ExtraTreesRegressor(n_estimators=ntrees,max_features=max_features)
# Learn ensemble of trees
treeEstimator.fit(expr_data_input,output)
# Compute importance scores
importances = [e.tree_.compute_feature_importances(normalize=False) for e in treeEstimator.estimators_]
importances = np.asarray(importances)
feature_importances = np.sum(importances,axis=0) / len(treeEstimator)
vi = np.zeros(ngenes)
vi[input_idx] = feature_importances
return vi