# aml_random_forest.py - from a repository archived by its owner on Sep 24, 2024 (read-only).
# -*- coding: utf-8 -*-
"""AML Random Forest
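Optimizes the hyperparameters of an XGBoost random forest regressor with Bayesian
optimization and, for comparison, with random search. To adapt it for personal use,
change the save_path variable and load your own data set (the last column is the variable to predict).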
"""
!pip install scikit-optimize
#Basic Bayesian optimizer init
#https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html
import sys
import numpy as np
import matplotlib.pyplot as plt
from skopt.plots import plot_gaussian_process
from skopt import gp_minimize
from sklearn.ensemble import RandomForestRegressor
from skopt.space import Real, Integer, Categorical
import xgboost as xgb
# Run configuration shared by the Bayesian-optimization and random-search sections.
# save_path is prepended verbatim to every result file name (plain string
# concatenation, no path separator is inserted), so include a trailing
# separator when it should point at a directory.
save_path = "PATH"
numberOfCalls = 50 #Calls to objective function
numberOfRuns = 10 #Number of total runs to do (Only >1 for statistics)
def generateTrainTestSets(dataset):
    """Shuffle the rows of ``dataset`` and split them 80/20 into train and test sets.

    The last column is treated as the target to predict; all preceding columns
    are the features. Rows are shuffled with NumPy's global random state, so
    call ``np.random.seed`` beforehand for a reproducible split. The shuffle is
    performed on a copy: the caller's array is left untouched.

    Args:
        dataset: 2-D array-like of shape (num_rows, num_cols).

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)`` where the train parts hold
        the first ``floor(0.8 * num_rows)`` shuffled rows and the test parts
        hold the remainder.

    Raises:
        ValueError: If ``dataset`` is not two-dimensional.
    """
    data = np.asarray(dataset)
    if data.ndim != 2:
        raise ValueError("dataset must be a 2-D array of shape (rows, columns)")

    # permutation() returns a row-shuffled copy instead of shuffling in place.
    shuffled = np.random.permutation(data)
    num_rows, num_cols = shuffled.shape

    # Integer arithmetic keeps the 80% boundary exact (no float rounding).
    split = (num_rows * 80) // 100

    features = shuffled[:, 0:num_cols - 1]
    target = shuffled[:, num_cols - 1]
    X_train, X_test = features[:split], features[split:]
    Y_train, Y_test = target[:split], target[split:]
    return X_train, X_test, Y_train, Y_test
#Select which data set is to be used (1 = KIN8NM, 2 = ASP-POTASSCO)
dataset = 2
if dataset == 1:
    #Load the KIN8NM dataset https://www.openml.org/d/189
    KIN8NM_dataset = np.load('KIN8NM.npy')
    xtrain, xtest, ytrain, ytest = generateTrainTestSets(KIN8NM_dataset)
elif dataset == 2:
    #Load ASP-POTASSCO dataset https://www.openml.org/d/41704
    CCF_dataset = np.load('ASP_POTASSCO.npy')
    xtrain, xtest, ytrain, ytest = generateTrainTestSets(CCF_dataset)
else:
    #No other data set is wired up. Fail here rather than later with a
    #NameError on xtrain/ytrain when the optimization starts.
    raise ValueError("Invalid Selection")
#n_estimators specifies the size of the forest to be trained; it is converted to num_parallel_tree, instead of the number of boosting rounds
#learning_rate is set to 1 by default
#colsample_bynode and subsample are set to 0.8 by default
def xgb_rf_error(params):
    """Objective function: total absolute test error of an XGBoost random forest.

    Fits an ``xgb.XGBRFRegressor`` on the module-level ``xtrain``/``ytrain``
    and scores it on the module-level ``xtest``/``ytest``.

    Args:
        params: Sequence whose first four entries are, in order,
            ``learning_rate``, ``n_estimators``, ``colsample_bynode`` and
            ``subsample``.

    Returns:
        The sum of absolute differences between the predictions for ``xtest``
        and ``ytest`` (lower is better).
    """
    learning_rate, n_estimators, colsample_bynode, subsample = params[:4]
    model = xgb.XGBRFRegressor(
        max_depth=1000000,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        colsample_bynode=colsample_bynode,
        subsample=subsample,
        random_state=42,
    )
    model.fit(xtrain, ytrain)

    if len(xtest) == 0:
        # Nothing to score; matches the empty-loop result of 0.0.
        return 0.0

    # One batched predict call instead of one call per test row.
    predictions = model.predict(xtest)
    return float(np.sum(np.abs(ytest - predictions)))
#Search space for the XGB Random Forest Bayesian Optimization.
#The dimension order matches the params indices read by xgb_rf_error:
#learning_rate, n_estimators, colsample_bynode, subsample.
xgb_rf_space = [
    Real(0, 1, name="learning_rate"),
    Integer(1, 1024, name="est"),
    Real(0, 1, name="csbn"),
    Real(0, 1, name="ss"),
]

#Run the optimizer numberOfRuns times; every run stores the objective value
#of each of its evaluations in its own result file.
for run in range(numberOfRuns):
    xgb_res = gp_minimize(
        xgb_rf_error,               # the function to minimize
        xgb_rf_space,               # the bounds on each dimension of x
        acq_func="EI",              # the acquisition function
        n_calls=numberOfCalls,      # the number of evaluations of f
        n_initial_points=1,         # the number of random initialization points
        verbose=True,               # report progress while optimizing
    )
    fn = "XGB_RF-Result-RUN-{}".format(run + 1)
    np.save(save_path + fn, np.array(xgb_res.func_vals))
#------------------------------------------------------------------------------
# RANDOM SEARCH BELOW
# Only use for statistics
# -----------------------------------------------------------------------------
#Do Random search
from sklearn.model_selection import RandomizedSearchCV
import time
import random
from random import randrange
#Random-search baseline: per run, draw numberOfCalls random configurations from
#the same ranges as the Bayesian search space and score each one with
#xgb_rf_error, i.e. with exactly the objective the Bayesian optimizer
#minimizes, so that the stored results of both methods are comparable.
prev = time.time()
for run in range(numberOfRuns):
    random_res = []
    for iterations in range(numberOfCalls):
        lr = random.uniform(0, 1)
        trees = random.randint(1, 1024)
        cs = random.uniform(0, 1)
        ss = random.uniform(0, 1)
        error = xgb_rf_error([lr, trees, cs, ss])
        random_res.append(error)

        #Report the wall-clock time spent on this configuration
        now = time.time()
        print("Done with iteration: " + str(iterations) + " in " + str(now - prev))
        print("Error: " + str(error))
        prev = now
    fn = "Random_RF-Result-RUN-" + str(run+1)
    np.save(save_path + fn, np.array(random_res))