# -*- coding: utf-8 -*-
"""
Created on Wed May 11 14:55:26 2016
@author: JosephNelson
"""
import pandas as pd
NHL = pd.read_csv('https://raw.githubusercontent.com/josephnelson93/GA-DSI/master/NHL_Data_GA.csv')
# check it out
NHL.head()
NHL.describe()
NHL.shape
# what is rank?
NHL.Rank # ok...
NHL.Rank.nunique() # how many diff values?
NHL.Rank.unique() # and what are they, anyway?
NHL.isnull().sum()   # count missing values in each column
'''
K-Nearest Neighbors Classification
'''
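# For intuition, here is a minimal sketch (not part of the original lesson;
# the function and argument names are illustrative) of what KNN does for a
# single query point, assuming numeric features and Euclidean distance:
import numpy as np
from collections import Counter

def knn_predict_one(X_train_arr, y_train_arr, query, k=1):
    # Euclidean distance from the query point to every training row
    dists = np.sqrt(((X_train_arr - query) ** 2).sum(axis=1))
    # take the labels of the k closest rows; the most common label wins
    nearest_labels = y_train_arr[np.argsort(dists)[:k]]
    return Counter(nearest_labels).most_common(1)[0][0]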
# store feature matrix in "X"
feature_cols = ['CF%', 'GF', 'Sh%', 'PDO']
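# (hockey analytics shorthand: CF% is Corsi-for percentage, GF is goals for,
# Sh% is shooting percentage, and PDO is shooting % plus save %)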
X = NHL[feature_cols]
# store response vector in "y"
y = NHL.Rank
# check X's type
print(type(X))
print(type(X.values))
# check y's type
print(type(y))
print(type(y.values))
# check X's shape (n = number of observations, p = number of features)
print(X.shape)
# check y's shape (single dimension with length n)
print(y.shape)
from sklearn.neighbors import KNeighborsClassifier
# make an instance of a KNeighborsClassifier object
knn = KNeighborsClassifier(n_neighbors=1)
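# with n_neighbors=1, each prediction is just the label of the single closest training point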
type(knn)
print(knn)
knn.fit(X, y)
# predict the response values for the observations in X ("test the model")
knn.predict(X)
# store the predicted response values
y_pred_class = knn.predict(X)
# compute classification accuracy
from sklearn import metrics
print(metrics.accuracy_score(y, y_pred_class))
# we observe (near-)perfect accuracy: with K=1, each point's nearest neighbor
# is itself, so evaluating on the data we trained on is misleadingly optimistic
'''
Train/test split
'''
from sklearn.model_selection import train_test_split   # sklearn.cross_validation was removed in scikit-learn 0.20
# STEP 1: split X and y into training and testing sets (using random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)
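# note: train_test_split holds out 25% of the rows for testing by default (test_size=0.25)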
# STEP 2: train the model on the training set (using K=1)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# STEP 3: test the model on the testing set, and check the accuracy
y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))
# test with 50 neighbors
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))
# test with 64 neighbors
knn = KNeighborsClassifier(n_neighbors=64)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))
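# as K grows toward the size of the training set, KNN drifts toward always
# predicting the majority class, so accuracy sinks toward the "null accuracy"
# baseline computed below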
# examine the class distribution
y_test.value_counts()
# compute null accuracy
y_test.value_counts().head(1) / len(y_test)
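# null accuracy is the score you get by always predicting the most frequent
# class; a model has to beat this baseline to be adding any value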
# calculate TRAINING ERROR and TESTING ERROR for K=1 through 64
k_range = range(1, 65)   # range(1, 64) would stop at K=63
training_error = []
testing_error = []
for k in k_range:
    # instantiate the model with the current K value
    knn = KNeighborsClassifier(n_neighbors=k)
    # calculate training error (fit and predict on the full dataset)
    knn.fit(X, y)
    y_pred_class = knn.predict(X)
    training_accuracy = metrics.accuracy_score(y, y_pred_class)
    training_error.append(1 - training_accuracy)
    # calculate testing error (fit on the training set, predict on the test set)
    knn.fit(X_train, y_train)
    y_pred_class = knn.predict(X_test)
    testing_accuracy = metrics.accuracy_score(y_test, y_pred_class)
    testing_error.append(1 - testing_accuracy)
# allow plots to appear inline when running in a Jupyter notebook
# (the %matplotlib inline magic is a SyntaxError in a plain .py script,
# so it is left commented out here)
# %matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# create a DataFrame of K, training error, and testing error
column_dict = {'K': k_range, 'training error': training_error, 'testing error': testing_error}
df = pd.DataFrame(column_dict).set_index('K').sort_index(ascending=False)
df.head()
# plot the relationship between K (HIGH TO LOW) and TESTING ERROR
df.plot(y='testing error')
plt.xlabel('Value of K for KNN')
plt.ylabel('Error (lower is better)')
plt.savefig('KNN.png')
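# the training error computed above was never plotted; a quick sketch to
# compare both curves on one axis (the output filename is arbitrary):
ax = df.plot(y=['training error', 'testing error'])
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Error (lower is better)')
plt.savefig('KNN_train_vs_test.png')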
# display the K values with the lowest testing error
# (DataFrame.sort was removed from pandas; sort_values is the replacement)
df.sort_values('testing error').head()
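# a single train/test split can be noisy; a more robust way to choose K is
# k-fold cross-validation. A minimal sketch with cross_val_score (cv=5 is an
# arbitrary choice and assumes every Rank class has at least 5 rows):
from sklearn.model_selection import cross_val_score
cv_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    cv_scores.append(cross_val_score(knn, X, y, cv=5).mean())
best_k = max(zip(cv_scores, k_range))[1]
print('best K by 5-fold cross-validation:', best_k)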