diff --git a/lambda/package.json b/lambda/package.json
new file mode 100644
index 0000000..debbb44
--- /dev/null
+++ b/lambda/package.json
@@ -0,0 +1,7 @@
+{
+  "name": "ASHA-xgboost-evaluation",
+  "version": "1.0.0",
+  "description": "Trains and evaluates an XGBoost model for a parallel ASHA implementation",
+  "author": "",
+  "license": "MIT"
+}
\ No newline at end of file
diff --git a/lambda/serverless.yml b/lambda/serverless.yml
new file mode 100644
index 0000000..609eca7
--- /dev/null
+++ b/lambda/serverless.yml
@@ -0,0 +1,15 @@
+service: ASHA-xgboost-evaluation
+
+frameworkVersion: ">=1.2.0 <2.0.0"
+
+provider:
+  name: aws
+  runtime: python3.7 # supported as of November 2018
+
+functions:
+  run:
+    handler: run_xgboost.run
+    events:
+      - http:
+          path: run
+          method: post
\ No newline at end of file
diff --git a/xgboost_test.py b/xgboost_test.py
index a54ead7..fb49bcb 100644
--- a/xgboost_test.py
+++ b/xgboost_test.py
@@ -10,12 +10,12 @@ from sklearn.model_selection import train_test_split
 
 # read csv into dataframe
-df = pd.read_csv("./weatherAUS.csv")
+df = pd.read_csv("./weatherAUS.csv", parse_dates=['Date'])
 print('Size of weather data frame is :', df.shape)
 
 
 # drop columns with useless data (too many null values)
-df = df.drop(columns=['Sunshine','Evaporation','Cloud3pm','Cloud9am','Location','RISK_MM','Date'],axis=1)
+df = df.drop(columns=['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am', 'Location', 'RISK_MM', 'Date'], axis=1)
 
 
 # get rid of nulls
 df = df.dropna(how='any')
@@ -37,14 +37,17 @@ scaler.fit(df)
 df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
+
+df.to_csv("preprocessed.csv")
 
 # separate data from target
 X = df.loc[:,df.columns!='RainTomorrow']
 y = df[['RainTomorrow']]
 
 
+# k = 5 has 84.25%, k=5 has 84.01%, 85.19% on all data ( bad cols removed), with date: 85.17
 # select the k most useful columns
-selector = SelectKBest(chi2, k=3)
-selector.fit(X, y)
-X_new = selector.transform(X)
+#selector = SelectKBest(chi2, k=10)
+#selector.fit(X, y)
+#X_new = selector.transform(X)
 
 
 # fit and evaluate using k_fold
@@ -52,7 +55,7 @@ kfold = KFold(n_splits=5, random_state=7)
 
 
 print("Cross eval starting")
-results = cross_val_score(model, X_new, y, cv=kfold, verbose=3)
 
 
 print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
+results = cross_val_score(model, X, y, cv=kfold, verbose=3)