Boston housing prices prediction
# Importing a few necessary libraries
import numpy as np
import matplotlib.pyplot as pl
from sklearn import datasets
from sklearn import cross_validation
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
# Make matplotlib show our plots inline (nicely formatted in the notebook)
%matplotlib inline
# Create our client's feature set for which we will be predicting a selling price
CLIENT_FEATURES = [[11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]]
# Load the Boston Housing dataset into the city_data variable
city_data = datasets.load_boston()
# Initialize the housing prices and housing features
housing_prices = city_data.target
housing_features = city_data.data
#print housing_prices
print "Boston Housing dataset loaded successfully!"
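# Not part of the original notebook: a quick look at the column order, since
# CLIENT_FEATURES above must follow it exactly. load_boston() returns a Bunch
# whose feature_names attribute lists the 13 columns (CRIM through LSTAT).
print city_data.feature_names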
# Number of houses in the dataset
print housing_features.shape
total_houses = housing_features.shape[0]
# Number of features in the dataset
total_features = housing_features.shape[1]
# Minimum housing value in the dataset
minimum_price = np.min(housing_prices)
# Maximum housing value in the dataset
maximum_price = np.max(housing_prices)
# Mean house value of the dataset
mean_price = np.mean(housing_prices)
# Median house value of the dataset
median_price = np.median(housing_prices)
# Standard deviation of housing values of the dataset
std_dev = np.std(housing_prices)
# Show the calculated statistics
print "Boston Housing dataset statistics (in $1000's):\n"
print "Total number of houses:", total_houses
print "Total number of features:", total_features
print "Minimum house price:", minimum_price
print "Maximum house price:", maximum_price
print "Mean house price: {0:.3f}".format(mean_price)
print "Median house price:", median_price
print "Standard deviation of house price: {0:.3f}".format(std_dev)
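# An optional extra (not in the original template): quartiles give a fuller
# picture of the spread than the min/max alone; np.percentile is standard NumPy.
print "House price quartiles (25th, 50th, 75th):", np.percentile(housing_prices, [25, 50, 75])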
# Put any import statements you need for this code block here
from sklearn.cross_validation import train_test_split
def shuffle_split_data(X, y):
    """ Shuffles and splits the data into 70% training and 30% testing subsets,
        then returns the training and testing subsets. """
    # Shuffle and split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)
    # Return the training and testing data subsets
    return X_train, y_train, X_test, y_test
# Test shuffle_split_data
try:
    X_train, y_train, X_test, y_test = shuffle_split_data(housing_features, housing_prices)
    print "Successfully shuffled and split the data!"
except:
    print "Something went wrong with shuffling and splitting the data."
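# A quick sanity check (not in the original): with test_size=0.3 and 506 houses,
# the split should come out to roughly 354 training rows and 152 testing rows.
print X_train.shape, X_test.shape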
# Put any import statements you need for this code block here
def performance_metric(y_true, y_predict):
    """ Calculates and returns the mean squared error between
        true and predicted values. """
    error = metrics.mean_squared_error(y_true, y_predict)
    return error
# Test performance_metric
try:
    total_error = performance_metric(y_train, y_train)
    print "Successfully performed a metric calculation!"
except:
    print "Something went wrong with performing a metric calculation."
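# A minimal illustration (not in the original) of how the chosen metric behaves:
# identical arrays give zero error, and deviations are squared before averaging,
# so larger mistakes are penalized more heavily.
print performance_metric([3.0, 5.0], [3.0, 5.0])  # 0.0
print performance_metric([3.0, 5.0], [4.0, 7.0])  # (1**2 + 2**2) / 2 = 2.5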
# Put any import statements you need for this code block
from sklearn import metrics
from sklearn import grid_search
def fit_model(X, y):
    """ Tunes a decision tree regressor model using GridSearchCV on the input data X
        and target labels y and returns this optimal model. """
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # Set up the parameters we wish to tune
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    # Make an appropriate scoring function; greater_is_better=False because
    # performance_metric returns an error, so lower values are better
    scoring_function = metrics.make_scorer(performance_metric, greater_is_better=False)
    # Make the GridSearchCV object
    reg = grid_search.GridSearchCV(regressor, parameters, scoring=scoring_function)
    # Fit the learner to the data to obtain the optimal model with tuned parameters
    reg.fit(X, y)
    # Return the optimal model
    return reg.best_estimator_
# Test fit_model on entire dataset
try:
    reg = fit_model(housing_features, housing_prices)
    print "Successfully fit a model!"
except:
    print "Something went wrong with fitting a model."
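# A hedged variant (not in the original): fit_model returns only the best
# estimator, so the per-depth cross-validation scores are discarded. Refitting
# the search here just to inspect them; grid_scores_ is the attribute name in
# the pre-0.18 sklearn.grid_search API imported above.
reg_cv = grid_search.GridSearchCV(
    DecisionTreeRegressor(),
    {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)},
    scoring=metrics.make_scorer(performance_metric, greater_is_better=False))
reg_cv.fit(housing_features, housing_prices)
for score in reg_cv.grid_scores_:
    print score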
def learning_curves(X_train, y_train, X_test, y_test):
    """ Calculates the performance of several models with varying sizes of training data.
        The learning and testing error rates for each model are then plotted. """
    print "Creating learning curve graphs for max_depths of 1, 3, 6, and 10. . ."
    # Create the figure window
    fig = pl.figure(figsize=(10, 8))
    # We will vary the training set size so that we have 50 different sizes
    sizes = np.rint(np.linspace(1, len(X_train), 50)).astype(int)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))
    # Create four different models based on max_depth
    for k, depth in enumerate([1, 3, 6, 10]):
        for i, s in enumerate(sizes):
            # Setup a decision tree regressor so that it learns a tree with max_depth = depth
            regressor = DecisionTreeRegressor(max_depth=depth)
            # Fit the learner to the training data
            regressor.fit(X_train[:s], y_train[:s])
            # Find the performance on the training set
            train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
            # Find the performance on the testing set
            test_err[i] = performance_metric(y_test, regressor.predict(X_test))
        # Subplot the learning curve graph
        ax = fig.add_subplot(2, 2, k+1)
        ax.plot(sizes, test_err, lw=2, label='Testing Error')
        ax.plot(sizes, train_err, lw=2, label='Training Error')
        ax.legend()
        ax.set_title('max_depth = %s' % depth)
        ax.set_xlabel('Number of Data Points in Training Set')
        ax.set_ylabel('Total Error')
        ax.set_xlim([0, len(X_train)])
    # Visual aesthetics
    fig.suptitle('Decision Tree Regressor Learning Performances', fontsize=18, y=1.03)
    fig.tight_layout()
    fig.show()
def model_complexity(X_train, y_train, X_test, y_test):
    """ Calculates the performance of the model as model complexity increases.
        The learning and testing error rates are then plotted. """
    print "Creating a model complexity graph. . . "
    # We will vary the max_depth of a decision tree model from 1 to 13
    max_depth = np.arange(1, 14)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))
    for i, d in enumerate(max_depth):
        # Setup a Decision Tree Regressor so that it learns a tree with depth d
        regressor = DecisionTreeRegressor(max_depth=d)
        # Fit the learner to the training data
        regressor.fit(X_train, y_train)
        # Find the performance on the training set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))
        # Find the performance on the testing set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))
    # Plot the model complexity graph
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, test_err, lw=2, label='Testing Error')
    pl.plot(max_depth, train_err, lw=2, label='Training Error')
    pl.legend()
    pl.xlabel('Maximum Depth')
    pl.ylabel('Total Error')
    pl.show()
learning_curves(X_train, y_train, X_test, y_test)
model_complexity(X_train, y_train, X_test, y_test)
print "Final model has an optimal max_depth parameter of", reg.get_params()['max_depth']
sale_price = reg.predict(CLIENT_FEATURES)
print "Predicted value of client's home: {0:.3f}".format(sale_price[0])
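# A hedged sanity check (not part of the notebook above): comparing the model's
# prediction with the prices of the most similar houses in the dataset, using
# sklearn's standard unsupervised NearestNeighbors class.
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=10).fit(housing_features)
distances, indices = neighbors.kneighbors(CLIENT_FEATURES)
print "Mean price of the 10 nearest neighbors: {0:.3f}".format(
    np.mean(housing_prices[indices[0]]))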