diff --git a/lambda/run_xgboost.py b/lambda/run_xgboost.py
index 13abb14..dc52a9d 100644
--- a/lambda/run_xgboost.py
+++ b/lambda/run_xgboost.py
@@ -15,23 +15,37 @@ def run(event, context):
-    logger.info("Called run function")
+    logger.info('Called run function')
     input_body = json.loads(event['body'])
-    if 'params' not in input_body:
-        logger.error("Validation Failed")
-        raise Exception("Couldn't create the todo item.")
+    # Validate every key the handler reads, not just 'params' -- a missing
+    # 'cv'/'seed'/'num_boost_round' would otherwise surface as a bare KeyError.
+    missing = [k for k in ('params', 'cv', 'seed', 'num_boost_round')
+               if k not in input_body]
+    if missing:
+        logger.error('Validation failed, missing keys: %s', missing)
+        raise Exception('Missing required input keys: {}'.format(missing))
     xgb_params = input_body['params']
-    logger.info("Reading CSV")
-    df = pd.read_csv("./preprocessed.csv")
+    nfold = input_body['cv']
+    seed = input_body['seed']
+    num_boost_round = input_body['num_boost_round']
+
+    logger.info('Reading CSV')
+    df = pd.read_csv('./preprocessed.csv')
     X = df.loc[:, df.columns != 'RainTomorrow']
     y = df[['RainTomorrow']]
     data_dmatrix = xgb.DMatrix(data=X, label=y)
-    logger.info("Performing Cross Validation")
+    logger.info('Performing Cross Validation')
     # this returns a dataframe of mean error values, each row adds another boosting tree.
-    results = xgb.cv(dtrain=data_dmatrix, params=xgb_params, nfold=3, seed=123, metrics="error")
+    results = xgb.cv(
+        dtrain=data_dmatrix,
+        params=xgb_params,
+        nfold=nfold,
+        seed=seed,
+        num_boost_round=num_boost_round,
+        metrics='error')
     logger.info("Done")
     # select the iteration with the lowest error (with the optimal number of trees ensembled)
     best_mean_score = results['test-error-mean'].min()
     response = {
-        "statusCode": 200,
-        "body": json.dumps({"status": "OK", "evaluation_score": best_mean_score}),
+        'statusCode': 200,
+        'body': json.dumps({'status': 'OK', 'error': best_mean_score}),
     }
-    logger.info("Sending response", response)
+    logger.info('Sending response: %s', response)
     return response