forked from gussow/acr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
acr.py
72 lines (58 loc) · 1.89 KB
/
acr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
###############################################################################
# Imports ---------------------------------------------------------------------
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
# Constants --------------------------------------------------------------------
PRED_DATA = "sample_data_predict.txt"
TRAINING_DATA = "sample_data_training.txt"
# Functions ------------------------------------------------------------------
def read_data(path):
    """
    Load a tab-separated table and return it with alphabetically ordered columns.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument
            (file path or file-like object).

    Returns:
        A DataFrame holding the parsed rows, with its columns rearranged
        into sorted order.
    """
    table = pd.read_csv(path, sep="\t")
    # NOTE(review): the fixed column order presumably keeps the training and
    # prediction tables aligned feature-by-feature -- confirm with the callers.
    ordered_columns = sorted(table.columns)
    return table[ordered_columns]
# Classes ---------------------------------------------------------------------
class AcrModel:
    """
    Model for predicting Acrs.

    Wraps an ``ExtraTreesClassifier``: ``fit`` trains it on a labelled table
    and ``score`` returns one probability per named candidate.
    """

    def __init__(self):
        # Underlying classifier; stays None until fit() has been called.
        self.__model = None

    def fit(self, X):
        """
        Fit an extremely randomized trees (``ExtraTreesClassifier``) model.

        Args:
            X: DataFrame containing the columns ``"y"`` (labels),
                ``"weight"`` (per-sample weights) and ``"name"`` (dropped,
                not used for training). Every remaining column is passed
                to the classifier as a feature.

        Raises:
            KeyError: If any of ``"y"``, ``"weight"`` or ``"name"`` is missing.
        """
        weights = X["weight"].tolist()
        y = X["y"].tolist()
        features = X.drop(columns=["y", "weight", "name"])
        # Fixed random_state keeps the fitted forest reproducible across runs.
        self.__model = ExtraTreesClassifier(
            n_estimators=1000,
            random_state=123890,
        )
        self.__model.fit(
            features, y,
            sample_weight=weights,
        )

    def score(self, X):
        """
        Returns scores for input Acr candidates.

        Args:
            X: DataFrame with a ``"name"`` column plus the feature columns.
                Each name must end in ``_<integer>``; that integer is the
                sort key.

        Returns:
            A one-shot iterator of ``(name, score)`` pairs ordered by the
            numeric suffix of the name, largest first. Rows with equal
            suffixes keep their input order, and every row keeps its own
            score even when names repeat.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
                It subclasses ``AttributeError`` and ``ValueError``.
            ValueError: If a name does not end in ``_<integer>``.
        """
        if self.__model is None:
            # Imported lazily so the error type is only needed on this path.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "AcrModel.score() was called before AcrModel.fit()."
            )
        names = X["name"].tolist()
        features = X.drop(columns="name")
        # Column 1 of predict_proba is the probability of the classifier's
        # second class -- presumably the positive (Acr) label; confirm
        # against the labels used in the training data.
        probabilities = self.__model.predict_proba(features)[:, 1]
        # Sort the (name, score) pairs themselves rather than looking scores
        # up by name: a name-keyed lookup would give duplicated names the
        # score of whichever duplicate came last.
        ranked = sorted(
            zip(names, probabilities),
            key=lambda pair: int(pair[0].split("_")[-1]),
            reverse=True,
        )
        return iter(ranked)
# Main ------------------------------------------------------------------------
# Fit the classifier on the labelled training table.
X = read_data(TRAINING_DATA)
model = AcrModel()
model.fit(X)
# Score the prediction table and emit one "name<TAB>score" line per candidate.
X = read_data(PRED_DATA)
print(*("{}\t{}".format(*pair) for pair in model.score(X)), sep="\n")