Commit

added xgboost evaluation on dataset, using kfold. updated reqs
mfrost433 committed May 16, 2019
1 parent d0412bc commit f403113
Showing 2 changed files with 62 additions and 0 deletions.
3 changes: 3 additions & 0 deletions requirements.txt
@@ -1,7 +1,10 @@
joblib==0.13.2
numpy==1.16.3
pandas==0.24.2
python-dateutil==2.8.0
pytz==2019.1
scikit-learn==0.21.1
scipy==1.2.1
six==1.12.0
sklearn==0.0
xgboost==0.82
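
Since requirements.txt pins exact library versions, a quick sanity check that the installed packages match the pins might look like the following (a minimal sketch, not part of the commit):

import sklearn
import xgboost

# confirm the pinned versions from requirements.txt are the ones actually imported
print("xgboost", xgboost.__version__)        # expected: 0.82
print("scikit-learn", sklearn.__version__)   # expected: 0.21.1
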
59 changes: 59 additions & 0 deletions xgboost_test.py
@@ -0,0 +1,59 @@
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from scipy import stats

from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# read csv into dataframe
df = pd.read_csv("./weatherAUS.csv")

print('Size of weather data frame is:', df.shape)

# drop columns with useless data (too many null values)
df = df.drop(columns=['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am', 'Location', 'RISK_MM', 'Date'])

# get rid of nulls
df = df.dropna(how='any')

# use "z-score" to remove outliers
z = np.abs(stats.zscore(df._get_numeric_data()))
df = df[(z < 3).all(axis=1)]

# replace binary columns with 0 and 1
df['RainToday'].replace({'No': 0, 'Yes': 1}, inplace=True)
df['RainTomorrow'].replace({'No': 0, 'Yes': 1}, inplace=True)

# change categorical columns into a one-hot encoded format
categorical_columns = ['WindGustDir', 'WindDir3pm', 'WindDir9am']
df = pd.get_dummies(df, columns=categorical_columns)

# standardize data
scaler = preprocessing.MinMaxScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

# separate data from target
X = df.loc[:,df.columns!='RainTomorrow']
y = df['RainTomorrow']  # 1-D target avoids column-vector warnings in fit/cross_val_score

# select the k most useful columns
selector = SelectKBest(chi2, k=3)
selector.fit(X, y)
X_new = selector.transform(X)

# fit and evaluate with k-fold cross-validation

model = XGBClassifier()
kfold = KFold(n_splits=5, shuffle=True, random_state=7)  # shuffle so random_state takes effect
print("Cross eval starting")

results = cross_val_score(model, X_new, y, cv=kfold, verbose=3)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


