diff --git a/README.md b/README.md
index f3bf81d..51ba881 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,23 @@
-# 751 Project - Parallel Machine Learning
+# SOFTENG 751 - Parallel Machine Learning (Group 5)
+
+## Description
+
+This project assesses the performance of four hyperparameter tuning algorithms:
+- Grid Search
+- Random Search
+- Successive Halving Algorithm (SHA)
+- Asynchronous Successive Halving Algorithm (ASHA)
+
+Grid and random search are implemented with the [scikit-learn](https://scikit-learn.org/stable/) machine learning
+package, while SHA and ASHA are implemented from scratch, following [this](https://arxiv.org/abs/1810.05934) research paper.
+All implementations run in parallel by default: grid search, random search and SHA use process-based parallelism,
+and ASHA uses asynchronous function calls to an AWS Lambda.
+
+The dataset used can be found [here](https://www.kaggle.com/jsphyg/weather-dataset-rattle-package).
+
+Results obtained from the scripts in the `benchmarking/` folder can be found in `results.xlsx`.
+This spreadsheet compares the performance of all four tuning algorithms and also contains
+data specific to ASHA and the effects of its various input parameters.

 ## Setup
 - Install Python 3
@@ -9,6 +28,8 @@
 your own AWS Lambda. The code for this lambda function can be found in
 `lambda/run_xgboost.py`. You will also need to place your AWS credentials in
 `~/.aws/credentials`, and set a default region in `~/.aws/config`.
+ - Additionally, you will need to include a copy of your training data in the `lambda/` folder
+ when uploading to AWS.
 - Refer to [here](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html)
 for more details about the credential and configuration files.

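The README describes SHA only briefly, so a point of reference may help: below is a minimal sketch of a process-parallel successive-halving loop in the spirit of the paper linked above. Everything in it (the function names, the placeholder scoring function, the sampled parameter grid) is an illustrative assumption, not code from this repository.

```python
# A minimal sketch of successive halving (SHA) with process-based parallelism.
# All names here are illustrative assumptions, not the repo's implementation.
import multiprocessing as mp
import random


def train_and_score(args):
    """Stand-in objective: train with `budget` resource, return a validation score.

    In the real project this would fit an XGBoost model on the weather data.
    """
    params, budget = args
    random.seed()  # reseed per task: forked workers inherit the same PRNG state
    return random.random()  # placeholder score


def successive_halving(configs, min_budget=1, eta=2):
    """Score all surviving configs in parallel, keep the top 1/eta each rung."""
    budget = min_budget
    while len(configs) > 1:
        with mp.Pool() as pool:  # one process per worker, as the README describes
            scores = pool.map(train_and_score, [(c, budget) for c in configs])
        ranked = sorted(zip(scores, configs), key=lambda pair: pair[0], reverse=True)
        configs = [c for _, c in ranked[: max(1, len(configs) // eta)]]
        budget *= eta  # survivors get eta times the training budget next rung
    return configs[0]


if __name__ == "__main__":
    grid = [{"max_depth": d, "gamma": g} for d in (2, 4, 6) for g in (0, 1, 2)]
    print(successive_halving(grid))
```

ASHA differs in that it promotes a configuration to the next rung as soon as enough results are available, rather than waiting for every configuration at the current rung to finish, which is what makes fanning the work out over asynchronous Lambda calls effective.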
diff --git a/preprocessed.csv b/data/preprocessed.csv
similarity index 100%
rename from preprocessed.csv
rename to data/preprocessed.csv
diff --git a/weatherAUS.csv b/data/weatherAUS.csv
similarity index 100%
rename from weatherAUS.csv
rename to data/weatherAUS.csv
diff --git a/lambda/example_input.json b/lambda/example_input.json
deleted file mode 100644
index edb6961..0000000
--- a/lambda/example_input.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "body": {
-        "params": {
-            "min_child_weight": 10,
-            "gamma": 2,
-            "subsample": 1.0,
-            "colsample_bytree": 0.8,
-            "max_depth": 4
-        }
-    }
-}
\ No newline at end of file
diff --git a/main.py b/main.py
index cfd5392..270e224 100644
--- a/main.py
+++ b/main.py
@@ -29,7 +29,7 @@
 target_column = 'RainTomorrow'

-df = pd.read_csv('./preprocessed.csv')
+df = pd.read_csv('data/preprocessed.csv')
 data = {
     'X': df.loc[:, df.columns != target_column],
     'y': df[[target_column]].values.ravel()
 }
diff --git a/xgboost_test.py b/preprocess_data.py
similarity index 51%
rename from xgboost_test.py
rename to preprocess_data.py
index 64db224..b932d91 100644
--- a/xgboost_test.py
+++ b/preprocess_data.py
@@ -2,14 +2,9 @@
 import pandas as pd
 from scipy import stats
 from sklearn import preprocessing
-from sklearn.model_selection import KFold
-from sklearn.model_selection import cross_val_score
-from xgboost import XGBClassifier

 # read csv into dataframe
-df = pd.read_csv("./weatherAUS.csv", parse_dates=['Date'])
-
-print('Size of weather data frame is :', df.shape)
+df = pd.read_csv("data/weatherAUS.csv", parse_dates=['Date'])

 # drop columns with useless data (too many null values)
 df = df.drop(columns=['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am', 'Location', 'RISK_MM', 'Date'], axis=1)
@@ -34,22 +29,4 @@
 scaler.fit(df)
 df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

-df.to_csv("preprocessed.csv")
-# separate data from target
-X = df.loc[:, df.columns != 'RainTomorrow']
-y = df[['RainTomorrow']].values.ravel()
-
-# k = 5 has 84.25%, k=5 has 84.01%, 85.19% on all data ( bad cols removed), with date: 85.17
-# select the k most useful columns
-# selector = SelectKBest(chi2, k=10)
-# selector.fit(X, y)
-# X_new = selector.transform(X)
-
-# fit and evaluate using k_fold
-
-model = XGBClassifier(n_jobs=-1)
-kfold = KFold(n_splits=3, random_state=33)
-print("Cross eval starting")
-
-results = cross_val_score(model, X, y, cv=kfold, verbose=2, n_jobs=-1)
-print("Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
+df.to_csv("data/preprocessed.csv")
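Since this change deletes `lambda/example_input.json`, the event shape the Lambda expects would otherwise go undocumented. The sketch below shows how a client might invoke the function with that payload via boto3; the function name `run_xgboost`, the synchronous `RequestResponse` invocation type, and the response format are assumptions, not verified against `lambda/run_xgboost.py`.

```python
# Sketch of calling the tuning Lambda with the payload from the deleted
# lambda/example_input.json. The FunctionName "run_xgboost" is an assumption.
import json

import boto3

client = boto3.client("lambda")

payload = {
    "body": {
        "params": {
            "min_child_weight": 10,
            "gamma": 2,
            "subsample": 1.0,
            "colsample_bytree": 0.8,
            "max_depth": 4,
        }
    }
}

# RequestResponse blocks until the result returns; ASHA would issue many such
# calls concurrently (e.g. from a thread pool) rather than one at a time.
response = client.invoke(
    FunctionName="run_xgboost",
    InvocationType="RequestResponse",
    Payload=json.dumps(payload),
)
print(response["Payload"].read().decode())
```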