Commit

added xgboost evaluation on dataset, using kfold. updated reqs
mfrost433 committed May 16, 2019
1 parent d0412bc commit f403113
Showing 2 changed files with 62 additions and 0 deletions.
3 changes: 3 additions & 0 deletions requirements.txt
@@ -1,7 +1,10 @@
joblib==0.13.2
numpy==1.16.3
pandas==0.24.2
python-dateutil==2.8.0
pytz==2019.1
scikit-learn==0.21.1
scipy==1.2.1
six==1.12.0
sklearn==0.0
xgboost==0.82
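
Since requirements.txt pins exact library versions, a quick sanity check that the installed packages match the pins might look like the following (a minimal sketch, not part of the commit):

import sklearn
import xgboost

# confirm the pinned versions from requirements.txt are the ones actually imported
print("xgboost", xgboost.__version__)        # expected: 0.82
print("scikit-learn", sklearn.__version__)   # expected: 0.21.1
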
59 changes: 59 additions & 0 deletions xgboost_test.py
@@ -0,0 +1,59 @@
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from scipy import stats

from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# read csv into dataframe
df = pd.read_csv("./weatherAUS.csv")

print('Size of weather data frame is:', df.shape)

# drop columns with useless data (too many null values)
df = df.drop(columns=['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am', 'Location', 'RISK_MM', 'Date'])

# get rid of nulls
df = df.dropna(how='any')

# use "z-score" to remove outliers
z = np.abs(stats.zscore(df._get_numeric_data()))
df = df[(z < 3).all(axis=1)]

# replace binary columns with 0 and 1
df['RainToday'].replace({'No': 0, 'Yes': 1}, inplace=True)
df['RainTomorrow'].replace({'No': 0, 'Yes': 1}, inplace=True)

# change categorical columns into a one-hot encoded format
categorical_columns = ['WindGustDir', 'WindDir3pm', 'WindDir9am']
df = pd.get_dummies(df, columns=categorical_columns)

# standardize data
scaler = preprocessing.MinMaxScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

# separate data from target
X = df.loc[:,df.columns!='RainTomorrow']
y = df['RainTomorrow']  # 1-D target avoids column-vector warnings in fit/cross_val_score

# select the k most useful columns
selector = SelectKBest(chi2, k=3)
selector.fit(X, y)
X_new = selector.transform(X)

# fit and evaluate with k-fold cross-validation

model = XGBClassifier()
kfold = KFold(n_splits=5, shuffle=True, random_state=7)  # shuffle so random_state takes effect
print("Cross eval starting")

results = cross_val_score(model, X_new, y, cv=kfold, verbose=3)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


