From e12d95d6457ef8eccf9918d6021493f8227a76fd Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Mon, 3 Aug 2020 15:27:49 +0300 Subject: [PATCH 01/64] add sklearn ml tool --- models/sklearn/REQUIREMENTS.txt | 13 + models/sklearn/__init__.py | 1 + models/sklearn/classification/__init__.py | 1 + .../classification/classification_task.py | 76 +++++ .../classification_task_manager.py | 136 +++++++++ .../sklearn/classification/classifierBASIC.py | 26 ++ .../sklearn/classification/classifierGRID.py | 153 ++++++++++ .../classification/confusion_matrix.py | 20 ++ models/sklearn/classification/evaluation.py | 279 ++++++++++++++++++ .../classification/report_files_export.py | 26 ++ models/sklearn/classification/train_class.py | 68 +++++ models/sklearn/configuration_template.yaml | 111 +++++++ .../sklearn/create_classification_project.py | 129 ++++++++ .../jmp_results_danceability.param | 11 + .../jmp_results_danceability.results.html | 7 + .../jmp_results_tonal_atonal.param | 11 + .../jmp_results_tonal_atonal.results.html | 7 + .../jmp_results_voice_instrumental.param | 11 + ...mp_results_voice_instrumental.results.html | 7 + models/sklearn/gaia_imitation_best_model.py | 103 +++++++ models/sklearn/logging_tool.py | 107 +++++++ models/sklearn/predict.py | 166 +++++++++++ models/sklearn/transformation/__init__.py | 1 + .../transformation/load_groung_truth.py | 257 ++++++++++++++++ .../sklearn/transformation/load_low_level.py | 106 +++++++ models/sklearn/transformation/transform.py | 265 +++++++++++++++++ .../transformation/transform_predictions.py | 181 ++++++++++++ .../transformation/utils_preprocessing.py | 69 +++++ models/sklearn/utils.py | 166 +++++++++++ 29 files changed, 2514 insertions(+) create mode 100644 models/sklearn/REQUIREMENTS.txt create mode 100644 models/sklearn/__init__.py create mode 100644 models/sklearn/classification/__init__.py create mode 100644 models/sklearn/classification/classification_task.py create mode 100644 models/sklearn/classification/classification_task_manager.py create mode 100644 models/sklearn/classification/classifierBASIC.py create mode 100644 models/sklearn/classification/classifierGRID.py create mode 100644 models/sklearn/classification/confusion_matrix.py create mode 100644 models/sklearn/classification/evaluation.py create mode 100644 models/sklearn/classification/report_files_export.py create mode 100644 models/sklearn/classification/train_class.py create mode 100644 models/sklearn/configuration_template.yaml create mode 100644 models/sklearn/create_classification_project.py create mode 100644 models/sklearn/gaia_best_models/jmp_results_danceability.param create mode 100644 models/sklearn/gaia_best_models/jmp_results_danceability.results.html create mode 100644 models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param create mode 100644 models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html create mode 100644 models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param create mode 100644 models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html create mode 100644 models/sklearn/gaia_imitation_best_model.py create mode 100644 models/sklearn/logging_tool.py create mode 100644 models/sklearn/predict.py create mode 100644 models/sklearn/transformation/__init__.py create mode 100644 models/sklearn/transformation/load_groung_truth.py create mode 100644 models/sklearn/transformation/load_low_level.py create mode 100644 models/sklearn/transformation/transform.py create mode 100644 
models/sklearn/transformation/transform_predictions.py create mode 100644 models/sklearn/transformation/utils_preprocessing.py create mode 100644 models/sklearn/utils.py diff --git a/models/sklearn/REQUIREMENTS.txt b/models/sklearn/REQUIREMENTS.txt new file mode 100644 index 000000000..be7c6de28 --- /dev/null +++ b/models/sklearn/REQUIREMENTS.txt @@ -0,0 +1,13 @@ +jupyter==1.0.0 +matplotlib==3.1.3 +numpy==1.18.1 +pandas==1.0.3 +PyYAML==5.3 +scikit-learn==0.23.1 +scipy==1.4.1 +seaborn==0.10.0 +tensorflow==2.1.0 +dask==2.11.0 +dotty-dict==1.2.1 +termcolor==1.1.0 +joblib==0.15.1 \ No newline at end of file diff --git a/models/sklearn/__init__.py b/models/sklearn/__init__.py new file mode 100644 index 000000000..7c68785e9 --- /dev/null +++ b/models/sklearn/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/models/sklearn/classification/__init__.py b/models/sklearn/classification/__init__.py new file mode 100644 index 000000000..7c68785e9 --- /dev/null +++ b/models/sklearn/classification/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/models/sklearn/classification/classification_task.py b/models/sklearn/classification/classification_task.py new file mode 100644 index 000000000..229860878 --- /dev/null +++ b/models/sklearn/classification/classification_task.py @@ -0,0 +1,76 @@ +import os +from classification.classifierGRID import TrainGridClassifier +import json +from termcolor import colored +from classification.classifierBASIC import TrainClassifier +from classification.evaluation import fold_evaluation +from logging_tool import LoggerSetup + + +class ClassificationTask: + def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks, log_level): + self.config = config + self.classifier = classifier + self.train_class = train_class + self.log_level = log_level + + self.X = X + self.y = y + self.training_processes = training_processes + self.exports_path = exports_path + self.tracks = tracks + self.logger = "" + + self.setting_logger() + + def setting_logger(self): + # set up logger + self.logger = LoggerSetup(config=self.config, + exports_path=self.exports_path, + name="train_class_{}".format(self.train_class), + train_class=self.train_class, + mode="a", + level=self.log_level).setup_logger() + + def run(self): + # grid search train + if self.config["train_kind"] == "grid": + self.logger.info("Train Classifier: Classifier with GridSearchCV") + grid_svm_train = TrainGridClassifier(config=self.config, + classifier=self.classifier, + class_name=self.train_class, + X=self.X, + y=self.y, + tr_processes=self.training_processes, + exports_path=self.exports_path, + log_level=self.log_level + ) + grid_svm_train.train_grid_search_clf() + grid_svm_train.export_best_classifier() + elif self.classifier == "NN": + self.logger.info("Train Classifier: Neural Networks") + pass + + self.logger.info("Training is completed successfully..") + + # load best model + self.logger.info("Loading Best Model..") + exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + best_model_name = "best_model_{}.json".format(self.train_class) + with open(os.path.join(self.exports_path, exports_dir, best_model_name)) as best_model_file: + best_model = json.load(best_model_file) + print(colored("BEST MODEL:", "cyan")) + print(best_model) + self.logger.info("Best Model loaded successfully.") + + # clf_model = TrainClassifier(classifier=self.classifier, 
params=best_model["params"]).model() + print("Best model loaded..") + fold_evaluation(config=self.config, + n_fold=best_model["n_fold"], + X=self.X, y=self.y, + class_name=self.train_class, + tracks=self.tracks, + process=best_model["preprocessing"], + exports_path=self.exports_path, + log_level=self.log_level + ) diff --git a/models/sklearn/classification/classification_task_manager.py b/models/sklearn/classification/classification_task_manager.py new file mode 100644 index 000000000..b64a84025 --- /dev/null +++ b/models/sklearn/classification/classification_task_manager.py @@ -0,0 +1,136 @@ +import os +from time import time +from termcolor import colored +from utils import load_yaml, FindCreateDirectory, TrainingProcesses +from classification.classification_task import ClassificationTask +from datetime import datetime +from logging_tool import LoggerSetup + + +validClassifiers = ['NN', 'svm'] +validEvaluations = ['nfoldcrossvalidation'] + + +class ClassificationTaskManager: + """ + + """ + def __init__(self, config, train_class, X, y, tracks, exports_path, log_level): + """ + + :param yaml_file: The configuration file name + :param train_class: The class that will be trained + :param X: The already shuffled data that contain the features + :param y: The already shuffled data that contain the labels + """ + self.config = config + self.train_class = train_class + self.X = X + self.y = y + self.tracks = tracks + self.exports_path = exports_path + self.log_level = log_level + + self.exports_dir = "" + self.results_path = "" + self.logs_path = "" + self.tracks_path = "" + self.dataset_path = "" + self.models_path = "" + self.images_path = "" + self.reports_path = "" + + self.logger = "" + self.setting_logger() + self.files_existence() + self.config_file_analysis() + + def setting_logger(self): + self.logger = LoggerSetup(config=self.config, + exports_path=self.exports_path, + name="train_class_{}".format(self.train_class), + train_class=self.train_class, + mode="a", + level=self.log_level).setup_logger() + + def files_existence(self): + """ + Ensure that all the folders will exist before the training process starts + :return: + """ + # main exports + self.exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + # train results exports + self.results_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "results")).inspect_directory() + # logs + self.logs_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "logs")).inspect_directory() + # tracks + self.tracks_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "tracks_csv_format")).inspect_directory() + # datasets + self.dataset_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "dataset")).inspect_directory() + # models + self.models_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "models")).inspect_directory() + # images + self.images_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "images")).inspect_directory() + # reports + self.reports_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "reports")).inspect_directory() + + def config_file_analysis(self): + self.logger.info("---- CHECK FOR INAPPROPRIATE CONFIG FILE FORMAT ----") + if 'processing' not in self.config: + self.logger.error('No preprocessing defined in config.') + + if 'evaluations' not in self.config: + self.logger.error('No evaluations defined in 
config.') + self.logger.error('Setting default evaluation to 10-fold cross-validation') + self.config['evaluations'] = {'nfoldcrossvalidation': [{'nfold': [10]}]} + + for classifier in self.config['classifiers'].keys(): + if classifier not in validClassifiers: + self.logger.error('Not a valid classifier: {}'.format(classifier)) + raise ValueError('The classifier name must be valid.') + + for evaluation in self.config['evaluations'].keys(): + if evaluation not in validEvaluations: + self.logger.error('Not a valid evaluation: {}'.format(evaluation)) + raise ValueError("The evaluation must be valid.") + self.logger.info("No errors in config file format found.") + + def apply_processing(self): + start_time = time() + training_processes = TrainingProcesses(self.config).training_processes() + self.logger.info("Classifiers detected: {}".format(self.config["classifiers"].keys())) + for classifier in self.config["classifiers"].keys(): + print("Before Classification task: ", classifier) + task = ClassificationTask(config=self.config, + classifier=classifier, + train_class=self.train_class, + training_processes=training_processes, + X=self.X, + y=self.y, + exports_path=self.exports_path, + tracks=self.tracks, + log_level=self.log_level + ) + try: + task.run() + except Exception as e: + self.logger.error('Running task failed: {}'.format(e)) + print(colored('Running task failed: {}'.format(e), "red")) + end_time = time() + + print() + print(colored("Last evaluation took place at: {}".format(datetime.now()), "magenta")) + self.logger.info("Last evaluation took place at: {}".format(datetime.now())) + + # test duration + time_duration = end_time - start_time + classification_time = round(time_duration / 60, 2) + return classification_time diff --git a/models/sklearn/classification/classifierBASIC.py b/models/sklearn/classification/classifierBASIC.py new file mode 100644 index 000000000..1edeee76f --- /dev/null +++ b/models/sklearn/classification/classifierBASIC.py @@ -0,0 +1,26 @@ +from sklearn.svm import SVC + + +class TrainClassifier: + def __init__(self, classifier, params): + self.classifier = classifier + self.params = params + + def model(self): + validClassifiers = ['NN', 'svm'] + if self.classifier not in validClassifiers: + raise ValueError('The classifier name must be valid.') + + if self.classifier == "svm": + param_C = self.params["C"] + param_gamma = self.params["gamma"] + param_class_weight = self.params["class_weight"] + param_kernel = self.params["kernel"] + model = SVC(C=param_C, # 2 ** param_C + gamma=param_gamma, # 2 ** param_gamma + kernel=param_kernel, + class_weight=param_class_weight, + probability=True) + return model + else: + return None diff --git a/models/sklearn/classification/classifierGRID.py b/models/sklearn/classification/classifierGRID.py new file mode 100644 index 000000000..b8556a75f --- /dev/null +++ b/models/sklearn/classification/classifierGRID.py @@ -0,0 +1,153 @@ +import os +import json +import math +from pprint import pprint +from termcolor import colored +import joblib +from sklearn.model_selection import GridSearchCV +from sklearn.svm import SVC +from sklearn.model_selection import KFold + +from transformation.transform import Transform +from utils import load_yaml, FindCreateDirectory, TrainingProcesses +from logging_tool import LoggerSetup + + +class TrainGridClassifier: + def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, log_level): + self.config = config + self.classifier = classifier + self.class_name = class_name + self.X 
= X + self.y = y + self.tr_processes = tr_processes + self.exports_path = exports_path + self.log_level = log_level + + self.logger = "" + self.best_models_list = [] + # self.train_grid_search_clf() + + self.setting_logger() + + def setting_logger(self): + # set up logger + self.logger = LoggerSetup(config=self.config, + exports_path=self.exports_path, + name="train_class_{}".format(self.class_name), + train_class=self.class_name, + mode="a", + level=self.log_level).setup_logger() + + def train_grid_search_clf(self): + process_counter = 1 + for tr_process in self.tr_processes: + print(colored("Train process {} - {}".format(process_counter, tr_process), "green")) + self.logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process)) + # initiate SVM classifier object + if self.classifier == "svm": + grid_clf = SVC(gamma="auto", probability=True) + # TODO: different classifier object (e.g. random forests, knn, etc) can be initiated here + else: + raise ValueError('The classifier name must be valid.') + + print("CLASSIFIER", tr_process["classifier"]) + # transformation of the data + features_prepared = Transform(config=self.config, + df_feats=self.X, + process=tr_process["preprocess"], + train_class=self.class_name, + exports_path=self.exports_path, + log_level=self.log_level).post_processing() + + # define the length of parameters + parameters_grid = {'kernel': tr_process["kernel"], + 'C': tr_process["C"], + 'gamma': tr_process["gamma"], + 'class_weight': tr_process["balanceClasses"] + } + + # inner with K-Fold cross-validation declaration + random_seed = None + shuffle = self.config["k_fold_shuffle"] + if shuffle is True: + random_seed = self.config["seed"] + elif shuffle is False: + random_seed = None + self.logger.info("Fitting the data to the classifier with K-Fold cross-validation..") + inner_cv = KFold(n_splits=tr_process["n_fold"], + shuffle=shuffle, + random_state=random_seed + ) + # initiate GridSearch Object + gsvc = GridSearchCV(estimator=grid_clf, + param_grid=parameters_grid, + cv=inner_cv, + n_jobs=self.config["parallel_jobs"], + verbose=self.config["verbose"] + ) + + self.logger.debug("Shape of X before train: {}".format(features_prepared.shape)) + self.logger.info("Fitting the data to the model..") + gsvc.fit(features_prepared, self.y) + + # print(gsvc.cv_results_["params"]) + self.logger.info("Results from each best preprocess training:") + self.logger.info("a) Best score: {}".format(gsvc.best_score_)) + self.logger.info("b) Best estimator: {}".format(gsvc.best_estimator_)) + self.logger.info("c) Best parameters: {}".format(gsvc.best_params_)) + self.logger.info("Counted evaluations in this GridSearch process: {}".format(len(gsvc.cv_results_["params"]))) + + # save best results for each train process + exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.class_name) + results_path = FindCreateDirectory(self.exports_path, + os.path.join(exports_dir, "results")).inspect_directory() + results_best_dict_name = "result_{}_{}_best_{}.json"\ + .format(self.class_name, tr_process["preprocess"], gsvc.best_score_) + + results_dict = dict() + results_dict["score"] = gsvc.best_score_ + results_dict["params"] = gsvc.best_params_ + results_dict["n_fold"] = tr_process['n_fold'] + results_dict["preprocessing"] = tr_process["preprocess"] + with open(os.path.join(results_path, results_best_dict_name), 'w') as grid_best_json: + json.dump(results_dict, grid_best_json, indent=4) + + # export parameters that the + results_params_dict_name = 
"result_{}_{}_params_{}.json"\ + .format(self.class_name, tr_process["preprocess"], gsvc.best_score_) + with open(os.path.join(results_path, results_params_dict_name), 'w') as grid_params_json: + json.dump(gsvc.cv_results_["params"], grid_params_json, indent=0) + + models_path = FindCreateDirectory(self.exports_path, + os.path.join(exports_dir, "models")).inspect_directory() + best_process_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(tr_process["preprocess"])) + joblib.dump(gsvc.best_estimator_, best_process_model_path) + self.logger.info("Grid Best model for the {} process saved.".format(tr_process["preprocess"])) + + # return a list that includes the best models exported from each processing + self.best_models_list.append(results_dict) + + print(colored("Next train process..", "yellow")) + process_counter += 1 + print() + print() + print(colored("Finishing training processes..", "blue")) + print() + + def export_best_classifier(self): + # gather best scores from the exported grid clf models + scores = [x["score"] for x in self.best_models_list] + self.logger.info("This is the max score of all the training processes: {}".format(max(scores))) + for model in self.best_models_list: + if model["score"] == max(scores): + self.logger.info("Best {} model parameters:".format(self.class_name)) + # log2 --> convert values to initial parameters' values + # model["params"]["C"] = math.log2(model["params"]["C"]) + # model["params"]["gamma"] = math.log2(model["params"]["gamma"]) + self.logger.info("{}".format(model)) + best_model_name = "best_model_{}.json".format(self.class_name) + exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.class_name) + with open(os.path.join(self.exports_path, exports_dir, best_model_name), "w") as best_model: + json.dump(model, best_model, indent=4) + self.logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) diff --git a/models/sklearn/classification/confusion_matrix.py b/models/sklearn/classification/confusion_matrix.py new file mode 100644 index 000000000..a7af4f4ae --- /dev/null +++ b/models/sklearn/classification/confusion_matrix.py @@ -0,0 +1,20 @@ +class ConfusionMatrix: + + def __init__(self, matrix, classes): + self.matrix = matrix + self.classes = classes + + def toHtml(self): + html = '' + html += '' + html += '' + html += '' + html += '' + html += '' + html += '

<tr><th></th><th>Predicted (%)</th></tr>
' + html += '' + + html += '' + + labels = self.classes() + diff --git a/models/sklearn/classification/evaluation.py b/models/sklearn/classification/evaluation.py new file mode 100644 index 000000000..5cf5c0d6a --- /dev/null +++ b/models/sklearn/classification/evaluation.py @@ -0,0 +1,279 @@ +import os +import json +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +from termcolor import colored +from pprint import pprint +import yaml +from sklearn.model_selection import KFold +from sklearn.metrics import accuracy_score +from sklearn.metrics import confusion_matrix, classification_report +import joblib +import requests +from utils import load_yaml, FindCreateDirectory, TrainingProcesses +from transformation.transform import Transform +from transformation.utils_preprocessing import flatten_dict_full +from classification.report_files_export import export_report +from logging_tool import LoggerSetup + + +def fold_evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, log_level): + # logger setup + logger = LoggerSetup(config=config, + exports_path=exports_path, + name="evaluation_{}".format(class_name), + train_class=class_name, + mode="w", + level=log_level).setup_logger() + + logger.info("---- EVALUATION of the model in the Folded dataset as well as in the whole dataset ----") + print("---- EVALUATION of the model in the Folded dataset as well as in the whole dataset ----") + print(colored("Evaluation and Folding..", "yellow")) + logger.info("number of folds set to config: {}".format(n_fold)) + logger.debug("Sample of shuffled tracks tracks:") + logger.debug("{}".format(tracks[:5])) + logger.debug("Tracks list length: {}".format(len(tracks))) + + exports_dir = "{}_{}".format(config.get("exports_directory"), class_name) + + # load best model + load_model_params_path = os.path.join(exports_path, exports_dir, "best_model_{}.json".format(class_name)) + with open(load_model_params_path) as model_params_file: + model_params_data = json.load(model_params_file) + + logger.info("Best model preprocessing step: {}".format(process)) + models_path = FindCreateDirectory(exports_path, + os.path.join(exports_dir, "models")).inspect_directory() + clf = joblib.load(os.path.join(models_path, "model_grid_{}.pkl".format(process))) + logger.info("Best model loaded.") + + # inner with K-Fold cross-validation declaration + random_seed = None + shuffle = config["k_fold_shuffle"] + if shuffle is True: + random_seed = config["seed"] + elif shuffle is False: + random_seed = None + print("Fitting the data to the classifier with K-Fold cross-validation..") + logger.info("Fitting the data to the classifier with K-Fold cross-validation..") + inner_cv = KFold(n_splits=n_fold, + shuffle=shuffle, + random_state=random_seed + ) + logger.debug("Type of X: {}".format(type(X))) + logger.debug("Type of y: {}".format(type(y))) + # tracks_fold_indexing = [] + tracks_fold_indexing_dict = {} + print(tracks[0]) + print(tracks[4]) + + # transformation of the data + features_prepared = Transform(config=config, + df_feats=X, + process=process, + train_class=class_name, + exports_path=exports_path, + log_level=log_level).post_processing() + logger.debug("features prepared shape: {}".format(features_prepared.shape)) + + accuracy_model = [] + predictions_df_list = [] + fold_number = 0 + for train_index, test_index in inner_cv.split(features_prepared): + print("Fold: {}".format(fold_number)) + logger.info("FOLD: {}".format(fold_number)) + # print("TRAIN INDEX: ", train_index) 
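+        # inner_cv.split() yields positional (train_index, test_index) arrays over
+        # the rows of features_prepared; the test indices are reused below both to
+        # slice this fold's data and to record which track landed in which fold.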
+ print("first test index element: {} - last test index element: {}".format(test_index[0], test_index[-1])) + logger.debug("first test index element: {} - last test index element: {}".format(test_index[0], test_index[-1])) + logger.debug("TEST INDEX: {}".format(test_index)) + logger.debug("Length of the test index array: {}".format(len(test_index))) + + tracks_count = 0 + tracks_list = [] + for index in test_index: + # print(tracks[index]) + tracks_fold_indexing_dict[tracks[index]] = fold_number + tracks_list.append(tracks[index]) + tracks_count += 1 + print(colored("Tracks indexed to the specific fold: {}".format(tracks_count), "cyan")) + X_train, X_test = features_prepared[train_index], features_prepared[test_index] + y_train, y_test = y[train_index], y[test_index] + # Train the model + print("Fitting for fold {}".format(fold_number)) + clf.fit(X_train, y_train) + logger.info("Classifier classes: {}".format(clf.classes_)) + # predictions + print("Predicting for the specific fold..") + logger.info("Predicting for the specific fold..") + logger.info("Predictions outputs") + pred = clf.predict(X_test) + logger.debug("predictions type after applying classifier's predict {}".format(type(pred))) + logger.debug("predictions shape: {}".format(pred.shape)) + df_pred = pd.DataFrame(data=pred, index=test_index, columns=["predictions"]) + logger.debug("Transforming to dataframe") + logger.debug("\n{}".format(df_pred.head())) + # predictions probabilities + logger.info("Predictions Probabilities outputs") + pred_prob = clf.predict_proba(X_test) + df_pred_prob = pd.DataFrame(data=pred_prob, index=test_index, columns=clf.classes_) + logger.debug("Transforming to dataframe") + logger.debug("\n{}".format(df_pred_prob.head())) + print("Tracks dataframe set..") + logger.info("Tracks dataframe set..") + # tracks df + df_tracks = pd.DataFrame(data=tracks_list, index=test_index, columns=["track"]) + logger.debug("\n{}".format(df_tracks.head())) + # y_test series + print("True values set..") + logger.info("True values set..") + logger.debug("Transforming to dataframe") + y_test_series = pd.DataFrame(data=y_test, index=test_index, columns=[class_name]) + logger.debug("\n{}".format(y_test_series.head())) + # concatenate dfs + logger.info("Concatenating DF..") + df_pred_general = pd.concat([df_tracks, df_pred_prob, df_pred, y_test_series], axis=1, ignore_index=False) + logger.debug("\n{}".format(df_pred_general.head())) + # predictions_all_df.append(df_pred_general, ignore_index=True) + predictions_df_list.append(df_pred_general) + # Append to accuracy_model the accuracy of the model + accuracy_model.append(accuracy_score(y_test, clf.predict(X_test), normalize=True) * 100) + fold_number += 1 + + print() + print() + # concatenate predictions dfs + print(colored("Make Predictions DataFrame for all the folded instances together..", "cyan")) + logger.info("Make Predictions DataFrame for all the folded instances together..") + df_predictions = pd.concat(predictions_df_list) + logger.debug("\n{}".format(df_predictions.head())) + logger.debug("Info:") + logger.debug("\n{}".format(df_predictions.info())) + # save predictions df + logger.info("Saving the unified predictions DataFrame locally.") + dataset_path = FindCreateDirectory(exports_path, + os.path.join(exports_dir, "dataset")).inspect_directory() + df_predictions.to_csv(os.path.join(dataset_path, "predictions_{}.csv".format(class_name))) + + # ACCURACIES + print(colored("Accuracies in each fold: {}".format(accuracy_model), "cyan")) + print(colored("Mean of 
accuracies: {}".format(np.mean(accuracy_model)), "cyan")) + print(colored("Standard Deviation of accuracies: {}".format(np.std(accuracy_model)), "cyan")) + logger.info("Accuracies in each fold: {}".format(accuracy_model)) + logger.info("Mean of accuracies: {}".format(np.mean(accuracy_model))) + logger.info("Standard Deviation of accuracies: {}".format(np.std(accuracy_model))) + accuracies_export = "Accuracies in each fold: {} \nMean of accuracies: {} \nStandard Deviation of accuracies: {}"\ + .format(accuracy_model, np.mean(accuracy_model), np.std(accuracy_model)) + export_report(config=config, + name="Accuracies results", + report=accuracies_export, + filename="accuracies_results_fold", + train_class=class_name, + exports_path=exports_path) + + # Visualize accuracy for each iteration + logger.info("Visualize accuracy for each iteration..") + list_folds = [] + counter_folds = 0 + for accuracy in accuracy_model: + list_folds.append("Fold{}".format(counter_folds)) + counter_folds += 1 + print("Exporting accuracies distribution to plot file..") + logger.info("Exporting accuracies distribution to plot file..") + scores = pd.DataFrame(accuracy_model, columns=['Scores']) + sns.set(style="white", rc={"lines.linewidth": 3}) + sns.barplot(x=list_folds, y="Scores", data=scores) + images_path = FindCreateDirectory(exports_path, + os.path.join(exports_dir, "images")).inspect_directory() + plt.savefig(os.path.join(images_path, "accuracies_distribution.png")) + sns.set() + plt.close() + logger.info("Plot saved successfully.") + + # Folded Tracks Dictionary + print("Writing Folded Tracks Dictionary locally to check where each track is folded..") + logger.info("Writing Folded Tracks Dictionary locally to check where each track is folded..") + logger.debug("length of keys: {}".format(len(tracks_fold_indexing_dict.keys()))) + folded_dataset_path = os.path.join(dataset_path, "{}.yaml".format(class_name)) + with open(folded_dataset_path, 'w') as file: + folded_dataset = yaml.dump(tracks_fold_indexing_dict, file) + logger.info("Folded dataset written successfully to disk.") + + # EVALUATION REPORTS + print(colored("Evaluation Reports", "cyan")) + logger.info("---- EVALUATION REPORTS ----") + + # Folded Dataset + print(colored("Evaluation to the folded dataset..", "cyan")) + logger.info("Evaluation to the folded dataset..") + + # Confusion Matrix + print("Exporting Confusion Matrix applied to the folded dataset..") + logger.info("Confusion Matrix applied to the folded dataset..") + cm = confusion_matrix(y_true=df_predictions[class_name], y_pred=df_predictions["predictions"]) + logger.info("\n{}".format(cm)) + + # Confusion Matrix Normalized + print("Exporting Normalized Confusion Matrix applied to the folded dataset..") + logger.info("Normalized Confusion Matrix applied to the folded dataset..") + cm_normalized = (cm / cm.astype(np.float).sum(axis=1) * 100) + logger.info("\n{}".format(cm_normalized)) + cm_all = "Actual instances\n{}\n\nNormalized\n{}".format(cm, cm_normalized) + export_report(config=config, + name="Folded Data Confusion Matrix", + report=cm_all, + filename="confusion_matrix_fold", + train_class=class_name, + exports_path=exports_path) + + # Classification Report + print("Exporting Classification Report applied to the folded dataset..") + logger.info("Classification Report applied to the folded dataset..") + cr = classification_report(y_true=df_predictions[class_name], y_pred=df_predictions["predictions"]) + export_report(config=config, + name="Folded Data Classification Report", + report=cr, + 
filename="classification_report_fold", + train_class=class_name, + exports_path=exports_path) + + logger.info("The folded dataset has been evaluated successfully..") + print(colored("The folded dataset has been evaluated successfully..", "green")) + + # # save the model + # models_path = FindCreateDirectory(os.path.join(exports_path, "models")).inspect_directory() + # model_save_path = os.path.join(models_path, "model.pkl") + # joblib.dump(clf, model_save_path) + # + # train with all the data of the dataset + print(colored("Evaluation to the whole dataset..", "cyan")) + logger.info("Evaluation to the whole dataset..") + clf.fit(features_prepared, y) + predictions_proba_all = clf.predict_proba(features_prepared) + predictions_all = clf.predict(features_prepared) + logger.info("Confusion Matrix applied to the whole dataset..") + cm_full = confusion_matrix(y_true=y, y_pred=predictions_all) + logger.info("\n{}".format(cm_full)) + logger.info("Normalized Confusion Matrix applied to the whole dataset..") + cm_full_normalized = (cm_full / cm_full.astype(np.float).sum(axis=1) * 100) + logger.info("\n{}".format(cm_full_normalized)) + cm_full_all = "Actual instances\n{}\n\nNormalized\n{}".format(cm_full, cm_full_normalized) + export_report(config=config, + name="All Data Confusion Matrix", + report=cm_full_all, + filename="confusion_matrix_all_dataset", + train_class=class_name, + exports_path=exports_path) + logger.info("Classification Report applied to the whole dataset..") + cr_full = classification_report(y_true=y, y_pred=predictions_all) + export_report(config=config, + name="All Data Classification Report", + report=cr_full, + filename="classification_report_all_dataset", + train_class=class_name, + exports_path=exports_path) + + logger.info("The whole dataset has been evaluated successfully..") + print(colored("The whole dataset has been evaluated successfully..", "green")) + diff --git a/models/sklearn/classification/report_files_export.py b/models/sklearn/classification/report_files_export.py new file mode 100644 index 000000000..d946dbc47 --- /dev/null +++ b/models/sklearn/classification/report_files_export.py @@ -0,0 +1,26 @@ +import os +from datetime import datetime +from termcolor import colored +from utils import load_yaml, FindCreateDirectory, TrainingProcesses + + +def export_report(config, name, report, filename, train_class, exports_path): + exports_dir = "{}_{}".format(config.get("exports_directory"), train_class) + reports_path = FindCreateDirectory(exports_path, os.path.join(exports_dir, "reports")).inspect_directory() + # take current date and convert to string + now = datetime.now() + datetime_str = now.strftime("%Y-%m-%d") + datetime_str_verbose = now.strftime("%Y-%m-%d, %H:%M:%S") + print("Creating report file..") + with open(os.path.join(reports_path, "{}.txt".format(filename)), 'w+') as file: + file.write("{}".format(name)) + file.write('\n') + file.write('\n') + file.write(str(report)) + file.write('\n') + file.write('\n') + file.write('\n') + file.write("Date of execution: {}".format(datetime_str_verbose)) + file.close() + print(colored('{} file for class {} is created successfully.'.format(name, train_class), "cyan")) + diff --git a/models/sklearn/classification/train_class.py b/models/sklearn/classification/train_class.py new file mode 100644 index 000000000..04d95b029 --- /dev/null +++ b/models/sklearn/classification/train_class.py @@ -0,0 +1,68 @@ +import os +from termcolor import colored +from transformation.load_groung_truth import GroundTruthLoad +from 
classification.classification_task_manager import ClassificationTaskManager +from transformation.load_groung_truth import DatasetExporter +import yaml +from logging_tool import LoggerSetup + + +def train_class(config, gt_file, log_level): + exports_path = config["exports_path"] + gt_data = GroundTruthLoad(config, gt_file, exports_path, log_level) + # tracks shuffled and exported + tracks_listed_shuffled = gt_data.export_gt_tracks() + + # class to train + class_name = gt_data.export_train_class() + config["class_name"] = class_name + + logger = LoggerSetup(config=config, + exports_path=exports_path, + name="train_class_{}".format(class_name), + train_class=class_name, + mode="w", + level=log_level).setup_logger() + + logger.info("---- TRAINING FOR THE {} MODEL HAS JUST STARTED ----".format(class_name)) + + logger.debug("Type of exported GT data exported: {}".format(type(tracks_listed_shuffled))) + + # save project file + project_file_name_save = "{}_{}.yaml".format(config["project_file"], class_name) + project_file_save_path = os.path.join(exports_path, project_file_name_save) + with open(os.path.join(project_file_save_path), "w") as template_file: + template_data_write = yaml.dump(config, template_file) + + print("First N sample of shuffled tracks: \n{}".format(tracks_listed_shuffled[:4])) + + # create the exports with the features DF, labels, and tracks together + features, labels, tracks = DatasetExporter(config=config, + tracks_list=tracks_listed_shuffled, + train_class=class_name, + exports_path=exports_path, + log_level=log_level + ).create_df_tracks() + logger.debug("Types of exported files from GT:") + logger.debug("Type of features: {}".format(type(features))) + logger.debug("Type of labels: {}".format(type(labels))) + logger.debug("Type of Tracks: {}".format(type(tracks))) + + print(colored("Small previews:", "cyan")) + print(colored("FEATURES", "magenta")) + print(features.head(3)) + print(colored("LABELS", "magenta")) + print(labels[:10]) + print(colored("TRACKS:", "magenta")) + print(tracks[:10]) + + model_manage = ClassificationTaskManager(config=config, + train_class=class_name, + X=features, + y=labels, + tracks=tracks, + exports_path=exports_path, + log_level=log_level) + classification_time = model_manage.apply_processing() + print(colored("Classification ended in {} minutes.".format(classification_time), "green")) + logger.info("Classification ended in {} minutes.".format(classification_time)) diff --git a/models/sklearn/configuration_template.yaml b/models/sklearn/configuration_template.yaml new file mode 100644 index 000000000..036efe802 --- /dev/null +++ b/models/sklearn/configuration_template.yaml @@ -0,0 +1,111 @@ +# READ GROUND TRUTH +# the ground truth data directory +ground_truth_directory: +exports_path: +# classes with features locally: c, gender, genre_rosamerica, moods_claurier, moods_mirex, timbre_bright_dark +# classes with features locally: tonal_atonal, voice_instrumental +# classes with features online: genre_dortmund, genre_electronic, genre_tzanetakis, ismir04_rhythm, +class_dir: +class_name: +exports_directory: +logging_level: # logging level +seed: # set null to get the seed from the clock value, otherwise specify a number + +# PRE-PROCESSING +# List of parameters that have to be excluded before applying the transformation steps +excludedDescriptors: [ 'metadata.tags*' ] +# List of preprocessed datasets to build +processing: + # it is possible to not apply any processing, although this is of + # of little value in real-life tests and evaluations + 
    raw: []
+
+    basic:
+        - transfo: remove
+          params: { descriptorNames: &unusedDescs [ 'metadata.*', '*dmean*', '*dvar*',
+                    '*.min', '*.max', '*cov',
+                    'tonal.thpcp', # because of division by zero
+                    'lowlevel.spectral_energyband_high.*', # 0 for low samplerate
+                    'lowlevel.silence_rate*' # funky behavior in general
+                    ] }
+        - transfo: enumerate
+          params: { descriptorNames: &stringDescs [ # 'rhythm.perceptual_tempo', # removed from new extractor
+                    'tonal.chords_key', 'tonal.chords_scale',
+                    'tonal.key_key', 'tonal.key_scale' ] }
+
+    lowlevel:
+        # note that the order of the transformations is important!
+        - transfo: remove
+          params: { descriptorNames: *unusedDescs }
+        - transfo: enumerate
+          params: { descriptorNames: *stringDescs }
+        - transfo: select
+          params: { descriptorNames: ['lowlevel*'] }
+
+    nobands:
+        - transfo: remove
+          params: { descriptorNames: *unusedDescs }
+        - transfo: enumerate
+          params: { descriptorNames: *stringDescs }
+        - transfo: remove
+          params: { descriptorNames: [ 'barkbands*', '*energyband*', 'melbands*', 'erbbands*' ] }
+
+    normalized:
+        - transfo: remove
+          params: { descriptorNames: *unusedDescs }
+        - transfo: enumerate
+          params: { descriptorNames: *stringDescs }
+        - transfo: normalize # MinMax Scale
+
+    gaussianized:
+        - transfo: remove
+          params: { descriptorNames: *unusedDescs }
+        - transfo: enumerate
+          params: { descriptorNames: *stringDescs }
+        - transfo: normalize # MinMax Scale
+        - transfo: gaussianize # QuantileTransformer
+          params: { descriptorNames: ['lowlevel.*'] }
+
+    mfcc:
+        # an MFCC-only baseline
+        - transfo: remove
+          params: { descriptorNames: *unusedDescs }
+        - transfo: enumerate
+          params: { descriptorNames: *stringDescs }
+        - transfo: select
+          params: { descriptorNames: ['lowlevel.mfcc*'] }
+
+## ML SETTINGS
+# train kind: grid, svm, deep_learning, supervised_lb
+train_kind: grid
+k_fold_shuffle: False
+
+# GRID ML SETTINGS
+# PCA number of best components
+pca_n_components: .95
+parallel_jobs: # set to -1 to use all processors; set to null to use only 1 processor
+verbose: # 0: no messages, 1: simple information about the tasks completed, 2: full information on all the tasks
+
+# NEURAL NETWORK SETTINGS
+#
+
+# List of classifiers to be trained
+classifiers:
+    svm:
+        # first svm test combinations
+        - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized', 'mfcc' ]
+#        - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized']
+          type: [ 'C-SVC' ]
+          kernel: [ 'poly', 'RBF' ]
+          C: [ -5, -3, -1, 1, 3, 5, 7, 9, 11 ] # will actually be 2**x
+          gamma: [ 3, 1, -1, -3, -5, -7, -9, -11 ] # will actually be 2**x
+          # if True, weight classes based on the number of elements
+          balanceClasses: [False, True]
+          # descriptorNames: [ ['*.mean', '*.var'] ]
+          # more svm params combinations (see the expansion sketch below)
+          # ...
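The C and gamma values above are base-2 exponents, as the "2**x" comments state. A minimal sketch of the expansion they imply before the values reach an SVC parameter grid (expand_svm_grid is a hypothetical helper, not part of this patch, and assumes the exponent convention holds):

def expand_svm_grid(svm_params):
    # Map the config's exponent lists to concrete SVC arguments (2**x),
    # lowercase the kernel names ('RBF' -> 'rbf'), and translate the
    # balanceClasses flags into scikit-learn class_weight values.
    return {
        "kernel": [k.lower() for k in svm_params["kernel"]],
        "C": [2 ** x for x in svm_params["C"]],
        "gamma": [2 ** x for x in svm_params["gamma"]],
        "class_weight": ["balanced" if b else None for b in svm_params["balanceClasses"]],
    }

Note that classifierGRID.py in this patch passes the listed values to GridSearchCV unchanged; the commented-out log2 back-conversion in export_best_classifier hints at the same exponent convention.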
+ +# List of evaluations to be performed +evaluations: + nfoldcrossvalidation: + - nfold: [ 5 ] \ No newline at end of file diff --git a/models/sklearn/create_classification_project.py b/models/sklearn/create_classification_project.py new file mode 100644 index 000000000..cb9ab4e04 --- /dev/null +++ b/models/sklearn/create_classification_project.py @@ -0,0 +1,129 @@ +import os +import argparse +from pprint import pprint +from utils import load_yaml +import yaml +import time +from transformation.load_groung_truth import ListGroundTruthFiles +from classification.train_class import train_class + + +def create_classification_project(ground_truth_directory, class_dir, project_file, exports_directory, logging, seed, jobs, verbose, exports_path): + """ + + :param ground_truth_directory: + :param class_dir: + :param project_file: + :param exports_directory: + :param logging: + :param seed: + :param jobs: + :param verbose: + :param exports_path: + :return: + """ + try: + project_template = load_yaml("configuration_template.yaml") + except Exception as e: + print('Unable to open project configuration template:', e) + raise + + # print("BEFORE:") + # print("Type of congig template:", type(project_template)) + print("-------------------------------------------------------") + print() + if seed is None: + seed = time.time() + + print("Seed argument: {}".format(seed)) + + project_template["ground_truth_directory"] = ground_truth_directory + project_template["class_dir"] = class_dir + project_template["project_file"] = project_file + project_template["exports_directory"] = exports_directory + project_template["logging_level"] = logging + project_template["seed"] = seed + project_template["parallel_jobs"] = jobs + project_template["verbose"] = verbose + + # if empty, path is declared as the app's main directory + if exports_path is None: + exports_path = os.getcwd() + + print("Exports path: {}".format(exports_path)) + project_template["exports_path"] = exports_path + + print() + print() + print("-------------------------------------------------------") + # print("AFTER:") + # pprint(project_template) + + gt_files_list = ListGroundTruthFiles(project_template).list_gt_filenames() + print(gt_files_list) + print("LOAD GROUND TRUTH") + for gt_file in gt_files_list: + train_class(project_template, gt_file, logging) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generates a project configuration file given a filelist, a groundtruth file, ' + 'and the directories to store the datasets and the results files. ' + 'The script has a parameter to specify the project template to use. 
' + 'If it is not specified, it will try to guess the appropriated one from the ' + 'essentia version found on the descriptor files.') + + parser.add_argument('-g', '--groundtruth', + dest="ground_truth_directory", + default="datasets", + help='Name of the directory containing the datasets.') + + parser.add_argument('-c', '--classdir', + dest="class_dir", + help='Name of the directory containing the class or classes to train.', + required=True) + + parser.add_argument('-f', '--file', + dest="project_file", + default="project", + help='Name prefix of the project configuration file (.yaml) will be stored.') + + parser.add_argument('-e', '--exportsdir', + dest="exports_directory", + default="exports", + help='Path the exports of the project will be stored.') + + parser.add_argument('-l', '--logging', + default=1, + help='Path where the result files will be stored.', + type=int) + + parser.add_argument('-s', '--seed', + default=None, + help='Seed used to generate the shuffled dataset applied later to folding.', + type=int) + + parser.add_argument('-j', '--jobs', + default=-1, + help='Parallel jobs. Set to -1 to use all the available cores', + type=int) + parser.add_argument('-v', '--verbose', + default=1, + help="Controls the verbosity: the higher, the more messages.", + type=int) + parser.add_argument('-p', '--path', + dest='exports_path', + help='Path where the project results will be stored. If empty, the results will be saved in ' + 'app directory') + + # parser.add_argument('-t', '--template', + # default=None, + # help='classification project template file to use. ' + # 'If not specified, the script will try to detect it from the descriptors metadata.') + + args = parser.parse_args() + + create_classification_project(args.ground_truth_directory, args.class_dir, args.project_file, + args.exports_directory, logging=args.logging, seed=args.seed, jobs=args.jobs, + verbose=args.verbose, exports_path=args.exports_path) diff --git a/models/sklearn/gaia_best_models/jmp_results_danceability.param b/models/sklearn/gaia_best_models/jmp_results_danceability.param new file mode 100644 index 000000000..c2b7fdb37 --- /dev/null +++ b/models/sklearn/gaia_best_models/jmp_results_danceability.param @@ -0,0 +1,11 @@ +evaluation: + nfold: 5 + type: nfoldcrossvalidation +model: + C: 5 + balanceClasses: false + classifier: svm + gamma: -9 + kernel: RBF + preprocessing: gaussianized + type: C-SVC diff --git a/models/sklearn/gaia_best_models/jmp_results_danceability.results.html b/models/sklearn/gaia_best_models/jmp_results_danceability.results.html new file mode 100644 index 000000000..372493c7c --- /dev/null +++ b/models/sklearn/gaia_best_models/jmp_results_danceability.results.html @@ -0,0 +1,7 @@ +

+test_danceability (/data/project_danceability.yaml)
+
+Accuracy: 93.3333333333. Std: 3.00032966083.
+Normalized accuracy: 92.6993290685. Normalized std: 2.83367775302.
+
+Confusion matrix (rows: actual class, columns: predicted, in %):
+
+                danceable            not_danceable       Proportion
+danceable       95.14 (137 of 144)    4.86 (7 of 144)    64.00 %
+not_danceable    9.88 (8 of 81)      90.12 (73 of 81)    36.00 %
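For reference, a short sketch (not code from this patch) showing how the row percentages above can be reproduced from the raw counts with numpy, normalizing each actual-class row by its own support:

import numpy as np

# Raw counts from the table above: rows = actual class, columns = predicted.
cm = np.array([[137, 7],   # danceable (144 tracks)
               [8, 73]])   # not_danceable (81 tracks)
cm_percent = cm / cm.sum(axis=1, keepdims=True) * 100
print(np.round(cm_percent, 2))  # approx. [[95.14  4.86], [ 9.88 90.12]]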
\ No newline at end of file diff --git a/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param b/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param new file mode 100644 index 000000000..47c151556 --- /dev/null +++ b/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param @@ -0,0 +1,11 @@ +evaluation: + nfold: 5 + type: nfoldcrossvalidation +model: + C: 7 + balanceClasses: true + classifier: svm + gamma: -9 + kernel: RBF + preprocessing: basic + type: C-SVC diff --git a/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html b/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html new file mode 100644 index 000000000..ddf60ba27 --- /dev/null +++ b/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html @@ -0,0 +1,7 @@ +

+test_tonal_atonal (/data/project_tonal_atonal.yaml)
+
+Accuracy: 97.9651162791. Std: 1.48308195928.
+Normalized accuracy: 97.775862069. Normalized std: 1.65925323088.
+
+Confusion matrix (rows: actual class, columns: predicted, in %):
+
+          atonal               tonal                Proportion
+atonal    96.55 (140 of 145)    3.45 (5 of 145)     42.15 %
+tonal      1.01 (2 of 199)     98.99 (197 of 199)   57.85 %
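A sketch of how a .param file like the one above maps onto a scikit-learn estimator, following the conventions that gaia_imitation_best_model.py (further down in this patch) applies: C and gamma are base-2 exponents, the kernel name is lowercased, and balanceClasses selects the class_weight. The relative path assumes the script runs from models/sklearn:

import yaml
from sklearn.svm import SVC

with open("gaia_best_models/jmp_results_tonal_atonal.param") as f:
    model = yaml.safe_load(f)["model"]

clf = SVC(C=2 ** model["C"],               # 2**7 = 128
          gamma=2 ** model["gamma"],       # 2**-9
          kernel=model["kernel"].lower(),  # 'RBF' -> 'rbf'
          class_weight="balanced" if model["balanceClasses"] else None,
          probability=True)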
\ No newline at end of file diff --git a/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param b/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param new file mode 100644 index 000000000..1dff19f60 --- /dev/null +++ b/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param @@ -0,0 +1,11 @@ +evaluation: + nfold: 5 + type: nfoldcrossvalidation +model: + C: 9 + balanceClasses: false + classifier: svm + gamma: -11 + kernel: RBF + preprocessing: gaussianized + type: C-SVC diff --git a/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html b/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html new file mode 100644 index 000000000..5031b91e2 --- /dev/null +++ b/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html @@ -0,0 +1,7 @@ +

+test_voice_instrumental (/data/project_voice_instrumental.yaml)
+
+Accuracy: 93.2. Std: 1.72046505341.
+Normalized accuracy: 93.2. Normalized std: 1.72046505341.
+
+Confusion matrix (rows: actual class, columns: predicted, in %):
+
+              instrumental         voice                Proportion
+instrumental  93.20 (466 of 500)    6.80 (34 of 500)    50.00 %
+voice          6.80 (34 of 500)    93.20 (466 of 500)   50.00 %
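The Accuracy/Std pairs in these three reports are statistics over the per-fold scores. A minimal sketch of the aggregation, assuming the "normalized" variant is the average of per-class recalls (balanced accuracy); fold_summary is a hypothetical helper, not code from this patch:

import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def fold_summary(y_true_folds, y_pred_folds):
    # One (y_true, y_pred) pair per cross-validation fold; scores in percent.
    acc = [accuracy_score(t, p) * 100 for t, p in zip(y_true_folds, y_pred_folds)]
    bal = [balanced_accuracy_score(t, p) * 100 for t, p in zip(y_true_folds, y_pred_folds)]
    return {"accuracy": (np.mean(acc), np.std(acc)),
            "normalized_accuracy": (np.mean(bal), np.std(bal))}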
\ No newline at end of file diff --git a/models/sklearn/gaia_imitation_best_model.py b/models/sklearn/gaia_imitation_best_model.py new file mode 100644 index 000000000..8f9943f33 --- /dev/null +++ b/models/sklearn/gaia_imitation_best_model.py @@ -0,0 +1,103 @@ +from utils import load_yaml, FindCreateDirectory +from sklearn.model_selection import cross_val_score +from sklearn.model_selection import cross_validate +from sklearn.model_selection import cross_val_predict +from transformation.transform import Transform +from sklearn.model_selection import KFold +from sklearn.svm import SVC + + +def display_scores(scores): + """ + + :param scores: + :return: + """ + print("Display scores:") + print("Scores: {}".format(scores)) + print("Mean: {}".format(scores.mean())) + print("Standard Deviation: {}".format(scores.std())) + + +def evaluate_gaia_imitation_model(config, class_name, X, y): + """ + + :param config: + :param class_name: + :param X: + :param y: + :return: + """ + gaia_params = load_yaml("gaia_best_models/jmp_results_{}.param".format(class_name)) + print("Gaia best model params: {}".format(gaia_params)) + + # params data transformation + preprocessing = gaia_params["model"]["preprocessing"] + + # params SVC + C = 2 ** gaia_params["model"]["C"] + gamma = 2 ** gaia_params["model"]["gamma"] + kernel = gaia_params["model"]["kernel"].lower() + balance_classes = gaia_params["model"]["balanceClasses"] + # TODO: declare a dictionary for class weights via automated labels balancing (unresponsive dataset) + if balance_classes is True: + class_weights = "balanced" + elif balance_classes is False: + class_weights = None + else: + print("Define a correct class weight value") + class_weights = None + n_fold = gaia_params["evaluation"]["nfold"] + + # Transform dataset + # pre-processing: data cleaning/enumerating/selecting descriptors + # pre-processing: scaling + print("Exports path for the training:") + exports_dir = "{}_{}".format(config.get("exports_directory"), class_name) + exports_path = FindCreateDirectory(exports_dir).inspect_directory() + print(exports_path) + # transformation of the data + X_transformed = Transform(config=config, + df=X, + process=preprocessing, + exports_path=exports_path, + mode="train").post_processing() + + print(X_transformed.columns) + print(X_transformed.head()) + + X_array_transformed = X_transformed.values + + inner_cv = KFold(n_splits=n_fold, + shuffle=config["gaia_kfold_shuffle"], + random_state=config["gaia_kfold_random_state"] + ) + + svm = SVC( + C=C, + kernel=kernel, + gamma=gamma, + class_weight=class_weights, + probability=config.get("svc_probability") + ) + + print("Evaluate the classifier with cross_val_score:") + scores = cross_val_score(estimator=svm, + X=X_array_transformed, + y=y, + scoring="accuracy", + cv=inner_cv, + n_jobs=config.get("parallel_jobs"), + verbose=config.get("verbose") + ) + + print() + print("Score results:") + display_scores(scores) + print() + print() + + +if __name__ == '__main__': + + evaluate_gaia_imitation_model() diff --git a/models/sklearn/logging_tool.py b/models/sklearn/logging_tool.py new file mode 100644 index 000000000..3b2a87de9 --- /dev/null +++ b/models/sklearn/logging_tool.py @@ -0,0 +1,107 @@ +""" +This file consists of the LoggerSetup class that is used for logging. + +Here, the LoggerSetup and its embedded setup_logger() method set up a new logger object with the related configurations. 
+ + Typical usage example: + + logging_object = LoggerSetup(logger_name, logging_file_location, level_of_logging) + logger = logging_object.setup_logger() +""" +import logging +import os +from utils import load_yaml, FindCreateDirectory + +# # load yaml configuration file to a dict +# config_data = load_yaml() +# # If log directory does not exist, create one +# current_d = os.getcwd() +# if config_data["log_directory"] is None or config_data["log_directory"] is None: +# if not os.path.exists(os.path.join(current_d, "logs_dir")): +# os.makedirs(os.path.join(current_d, "logs_dir")) +# log_path = os.path.join(current_d, "logs_dir") +# else: +# log_path = FindCreateDirectory(config_data["log_directory"]).inspect_directory() + + +class LoggerSetup: + """It sets up a logging object. + + Attributes: + name: The name of the logger. + log_file: The path of the logging file export. + level: An integer that defines the logging level. + """ + def __init__(self, config, exports_path, name, train_class, mode, level=1): + """ + Inits the logger object with the corresponding parameters. + + Args: + name (str): The name of the logger. + log_file (str): The path the logging exports will be exported. + level (int): The level of the logging. Defaults to 1. + """ + self.config = config + self.exports_path = exports_path + self.name = name + self.train_class = train_class + self.mode = mode + self.level = level + + self.exports_dir = "" + self.logs_path = "" + + def setup_logger(self): + """ + Function to set up as many loggers as you want. It exports the logging results to a file + in the relevant path that is determined by the configuration file. + + :return: + """ + self.exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + self.logs_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "logs")).inspect_directory() + + # Create a custom logger + logger_object = logging.getLogger(self.name) + + # Create handlers + c_handler = logging.StreamHandler() + f_handler = logging.FileHandler(os.path.join(self.logs_path, "{}.log".format(self.name)), mode=self.mode) + + # Create formatters and add it to handlers + c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s') + f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + c_handler.setFormatter(c_format) + f_handler.setFormatter(f_format) + + # if handlers are already present and if so, clear them before adding new handlers. 
This is pretty convenient + # when debugging and the code includes the logger initialization + if logger_object.hasHandlers(): + logger_object.handlers.clear() + + # Add handlers to the logger + logger_object.addHandler(c_handler) + logger_object.addHandler(f_handler) + + if self.level is None: + logger_object.setLevel(logging.INFO) + elif self.level is 0: + logger_object.setLevel(logging.DEBUG) + elif self.level is 1: + logger_object.setLevel(logging.INFO) + elif self.level is 2: + logger_object.setLevel(logging.WARNING) + elif self.level is 3: + logger_object.setLevel(logging.ERROR) + elif self.level is 4: + logger_object.setLevel(logging.CRITICAL) + else: + print('Please define correct one of the Debug Levels:\n' + '0: DEBUG\n' + '1: INFO\n' + '2: WARNING\n' + '3: ERROR\n' + '4: CRITICAL') + + return logger_object diff --git a/models/sklearn/predict.py b/models/sklearn/predict.py new file mode 100644 index 000000000..2ecbd6e73 --- /dev/null +++ b/models/sklearn/predict.py @@ -0,0 +1,166 @@ +import os +import requests +import argparse +from pprint import pprint +import joblib +import json +import pandas as pd +from utils import load_yaml, FindCreateDirectory +from transformation.utils_preprocessing import flatten_dict_full +from transformation.transform_predictions import TransformPredictions +from logging_tool import LoggerSetup + + +class Predict: + def __init__(self, config, track_low_level, log_level): + self.config = config + self.track_low_level = track_low_level + self.log_level = log_level + + self.class_name = "" + self.exports_path = "" + self.exports_dir = "" + self.best_model = "" + self.track_feats = dict() + + self.load_best_model() + # self.setting_logger() + self.logger = "" + # self.flat_dict() + self.df_track = pd.DataFrame() + self.list_track = [] + + def load_best_model(self): + self.class_name = self.config["class_name"] + self.exports_path = self.config["exports_path"] + self.exports_dir = "{}_{}".format(self.config["exports_directory"], self.class_name) + + # self.exports_path = os.path.join(self.exports_path, "{}_{}".format(self.exports_dir, self.class_name)) + best_model_path = os.path.join(self.exports_path, + self.exports_dir, + "best_model_{}.json".format(self.class_name)) + # best_model_path = os.path.join(self.exports_dir, "models", "model_grid_{}.pkl".format[""]) + with open(best_model_path) as json_file: + self.best_model = json.load(json_file) + + def preprocessing(self): + # set up logger + self.logger = LoggerSetup(config=self.config, + exports_path=self.exports_path, + name="predict_{}".format(self.class_name), + train_class=self.class_name, + mode="w", + level=self.log_level).setup_logger() + + self.logger.info("Best model:") + self.logger.info(self.best_model) + + self.logger.info("FLATTENING:") + try: + if 'beats_position' in self.track_low_level['rhythm']: + del self.track_low_level['rhythm']['beats_position'] + except Exception as e: + self.logger.warning("There is no 'rhythm' key in the low level data. 
Exception:", e) + + # data dictionary transformed to a fully flattened dictionary + self.track_feats = dict(flatten_dict_full(self.track_low_level)) + list_track = [] + list_track.append(self.track_feats) + self.logger.debug("DICT TO DATAFRAME:") + self.df_track = pd.DataFrame(data=list_track, columns=list_track[0].keys()) + self.logger.debug("TYPE of track structure: {}".format(type(self.df_track))) + # print(self.df_track) + # print("Shape of DF", self.df_track.shape) + + self.logger.info("PROCESSING:") + features_prepared = TransformPredictions(config=self.config, + df_feats=self.df_track, + process=self.best_model["preprocessing"], + train_class=self.class_name, + exports_path=self.exports_path, + log_level=self.log_level + ).post_processing() + self.logger.debug("Features shape after preparation: {}".format(features_prepared.shape)) + models_path = FindCreateDirectory(self.exports_path, + os.path.join(self.exports_dir, "models")).inspect_directory() + best_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(self.best_model["preprocessing"])) + clf_loaded = joblib.load(best_model_path) + predicted = clf_loaded.predict(features_prepared) + predicted_prob = clf_loaded.predict_proba(features_prepared) + self.logger.info("Prediction: {}".format(predicted)) + self.logger.info("Classes: {}".format(clf_loaded.classes_)) + self.logger.info("Prediction probabilities: {}".format(predicted_prob)) + predict_list = [] + for pred, pred_probability in zip(predicted, predicted_prob): + predict_dict = dict() + predict_dict[self.class_name] = pred + predict_dict["score"] = max(pred_probability) + predict_dict["probabilities"] = dict(zip(clf_loaded.classes_, pred_probability)) + + predict_list.append(predict_dict) + + self.logger.info("Predictions for the track:") + self.logger.info("{}".format(predict_list)) + self.logger.debug("Output (Return) predict_list") + + return predict_list + + +def prediction(exports_path, project_file, track_api, log_level): + # if empty, path is declared as the app's main directory + if exports_path is None: + exports_path = os.getcwd() + try: + project_data = load_yaml("{}.yaml".format(project_file)) + except Exception as e: + print('Unable to open project configuration file:', e) + raise + + response = requests.get(track_api) + + track = response.json() + if track["metadata"]["tags"]["artist"][0]: + print("Artist:", track["metadata"]["tags"]["artist"][0]) + if track["metadata"]["tags"]["album"][0]: + print("Track:", track["metadata"]["tags"]["album"][0]) + if track["metadata"]["tags"]["title"][0]: + print("Track:", track["metadata"]["tags"]["album"][0]) + + prediction_track = Predict(config=project_data, + track_low_level=track, + log_level=log_level + ) + prediction_track.preprocessing() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description='Predictions.') + + parser.add_argument('-p', '--path', + dest="exports_path", + help='Path where the project file is stored if not in the same file where the app is.') + + parser.add_argument('-f', '--file', + dest="project_file", + help='Name prefix of the project configuration file (.yaml) that is stored.', + required=True) + + parser.add_argument('-t', '--track', + dest="track_api", + help='Low-level data link from the AcousticBrainz API.', + required=True) + + parser.add_argument('-l', '--logging', + dest='log_level', + default=1, + help='Path where the result files will be stored.', + type=int) + + args = parser.parse_args() + + prediction(exports_path=args.exports_path, + 
diff --git a/models/sklearn/transformation/__init__.py b/models/sklearn/transformation/__init__.py
new file mode 100644
index 000000000..7c68785e9
--- /dev/null
+++ b/models/sklearn/transformation/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
\ No newline at end of file
diff --git a/models/sklearn/transformation/load_groung_truth.py b/models/sklearn/transformation/load_groung_truth.py
new file mode 100644
index 000000000..fdd589551
--- /dev/null
+++ b/models/sklearn/transformation/load_groung_truth.py
@@ -0,0 +1,257 @@
+import os
+import yaml
+import pandas as pd
+from pprint import pprint
+from termcolor import colored
+import random
+from utils import FindCreateDirectory
+from transformation.load_low_level import FeaturesDf
+from logging_tool import LoggerSetup
+
+
+class ListGroundTruthFiles:
+    """
+    Lists the groundtruth yaml files that are found inside the dataset directory.
+    """
+    def __init__(self, config):
+        """
+        :param config: The configuration data (dict).
+        """
+        self.config = config
+        self.dataset_dir = ""
+        self.class_dir = ""
+
+    def list_gt_filenames(self):
+        """
+        :return: A list of the groundtruth yaml filenames.
+        """
+        self.dataset_dir = self.config.get("ground_truth_directory")
+        self.class_dir = self.config.get("class_dir")
+        path = os.path.join(os.getcwd(), self.dataset_dir, self.class_dir, "metadata")
+        ground_truth_list = [filename for filename in os.listdir(os.path.join(path))
+                             if filename.startswith("groundtruth")]
+        return ground_truth_list
+
+
+class GroundTruthLoad:
+    """
+    The ground truth data object which contains features to:
+        * load the ground truth data from the corresponding yaml file
+        * count the JSON low-level data files
+        * Todo: create logger object
+
+    Attributes:
+        config: The configuration data (dict).
+        gt_filename: The name (str) of the groundtruth yaml file.
+        exports_path: The path (str) the exported data will be stored to.
+        log_level: The logging level (int).
+    """
+    def __init__(self, config, gt_filename, exports_path, log_level):
+        """
+        :param config: The configuration data (dict).
+        :param gt_filename: The name (str) of the groundtruth yaml file.
+        :param exports_path: The path (str) the exported data will be stored to.
+        :param log_level: The logging level (int).
+        """
+        self.config = config
+        self.gt_filename = gt_filename
+        self.exports_path = exports_path
+        self.log_level = log_level
+
+        self.logger = ""
+        self.class_dir = ""
+        self.ground_truth_data = {}
+        self.labeled_tracks = {}
+        self.train_class = ""
+        self.dataset_dir = ""
+        self.tracks = []
+
+        self.load_local_ground_truth()
+
+    def load_local_ground_truth(self):
+        """
+        Loads the ground truth file.
+        * The directory with the dataset should be located inside the app folder location.
+        :return:
+        """
+
+        self.dataset_dir = self.config.get("ground_truth_directory")
+        self.class_dir = self.config.get("class_dir")
+        with open(os.path.join(os.getcwd(), "{}/{}/metadata/{}".format(
+                self.dataset_dir, self.class_dir, self.gt_filename)), "r") as stream:
+            try:
+                self.ground_truth_data = yaml.safe_load(stream)
+                print("Ground truth file loaded.")
+            except yaml.YAMLError as exc:
+                print("Error in loading the ground truth file.")
+                print(exc)
+
+    def export_train_class(self):
+        """
+        :return: The target class of the ground truth data.
+        """
+        self.train_class = self.ground_truth_data["className"]
+        print("EXPORT CLASS NAME: {}".format(self.train_class))
+        return self.train_class
+
+    def export_gt_tracks(self):
+        """
+        :return: A list of (track, label) tuples, shuffled with the configured seed.
+        """
+        self.labeled_tracks = self.ground_truth_data["groundTruth"]
+        tracks_list = []
+        for track, label in self.labeled_tracks.items():
+            tracks_list.append((track, label))
+        print(colored("SEED is set to: {}".format(self.config.get("seed")), "cyan"))
+        random.seed(a=self.config.get("seed"))
+        random.shuffle(tracks_list)
+        return tracks_list
+
+    def check_ground_truth_data(self):
+        """
+        Prints the whole ground truth data.
+        :return:
+        """
+        pprint(self.ground_truth_data)
+
+    def check_ground_truth_info(self):
+        """
+        Prints information about the ground truth data (target class, labeled tracks).
+        :return:
+        """
+        print("Ground truth data class/target: {}".format(self.ground_truth_data["className"]))
+        print("Label tracks: {}".format(type(self.labeled_tracks)))
+        print("Ground truth data keys - tracks: {}".format(len(self.ground_truth_data["groundTruth"].keys())))
+
+    def check_tracks_folders(self):
+        """
+        Prints the directories that contain the low-level JSON data.
+        :return:
+        """
+        if len(self.labeled_tracks.keys()) != 0:
+            folders = []
+            for key in self.labeled_tracks:
+                key = key.split('/')
+                path_sub_dir = '/'.join(key[:-1])
+                folders.append(path_sub_dir)
+            folders = set(folders)
+            folders = list(folders)
+            folders.sort()
+            print("Directories that contain the low-level JSON data:")
+            print("{}".format(folders))
+
+    def count_json_low_level_files(self):
+        """
+        Prints the number of JSON low-level files that are contained inside the dataset directory
+        (the dataset directory is declared in the configuration file).
+        :return:
+        """
+        counter = 0
+        for root, dirs, files in os.walk(os.path.join(os.getcwd(), self.dataset_dir)):
+            for file in files:
+                if file.endswith(".json"):
+                    counter += 1
+        print("counted json files: {}".format(counter))
+
+
+class DatasetExporter:
+    def __init__(self, config, tracks_list, train_class, exports_path, log_level):
+        self.config = config
+        self.tracks_list = tracks_list
+        self.train_class = train_class
+        self.exports_path = exports_path
+        self.log_level = log_level
+
+        self.dataset_dir = ""
+        self.class_dir = ""
+        self.df_tracks = pd.DataFrame()
+        self.df_feats = pd.DataFrame()
+        self.y = []
+        self.logger = ""
+
+        self.setting_logger()
+
+    def setting_logger(self):
+        # set up logger
+        self.logger = LoggerSetup(config=self.config,
+                                  exports_path=self.exports_path,
+                                  name="dataset_exports_transformations_{}".format(self.train_class),
+                                  train_class=self.train_class,
+                                  mode="w",
+                                  level=self.log_level).setup_logger()
+
+    def create_df_tracks(self):
+        """
+        Creates the pandas DataFrame with the tracks.
+        Todo: more comments
+        :return:
+        DataFrame or None: a DataFrame with the tracks included in the ground truth yaml file, containing
+        the track name, the path to load the JSON low-level data, the label, etc. Otherwise, it returns None.
+        """
+
+        self.logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----")
+        # the class name from the ground truth data that is the target
+        self.dataset_dir = self.config.get("ground_truth_directory")
+        self.class_dir = self.config.get("class_dir")
+        print('DATASET-DIR', self.dataset_dir)
+        print('CLASS NAME PATH', self.class_dir)
+        # the path to the "features" directory that contains the rest of the low-level data sub-directories
+        path_features = os.path.join(os.getcwd(), self.dataset_dir, self.class_dir, "features")
+        # check if the "features" directory is empty or contains the "mp3" or the "orig" sub-directory
+        low_level_dir = ""
+        if len(os.listdir(path_features)) == 0:
+            print("Directory is empty")
+            self.logger.warning("Directory is empty.")
+        else:
+            print("Directory is not empty")
+            self.logger.info("Directory is not empty")
+            directory_contents = os.listdir(path_features)
+            if "mp3" in directory_contents:
+                low_level_dir = "mp3"
+            elif "orig" in directory_contents:
+                low_level_dir = "orig"
+            else:
+                low_level_dir = ""
+                print("There is no valid low-level data inside the features directory")
+                self.logger.warning("There is no valid low-level data inside the features directory")
+        # log which directory contains the low-level sub-directories (if it exists)
+        self.logger.info("Low-level directory name that contains the data: {}".format(low_level_dir))
+        # path to the low-level data sub-directories
+        path_low_level = os.path.join(os.getcwd(), self.dataset_dir, self.class_dir, "features", low_level_dir)
+        self.logger.info("Path of low level data: {}".format(path_low_level))
+        # create a DataFrame with the information from each track included in the ground truth file
+        if low_level_dir != "":
+            self.df_tracks = pd.DataFrame(data=self.tracks_list, columns=["track", self.train_class])
+            self.logger.debug("Shape of tracks DF created before cleaning: {}".format(self.df_tracks.shape))
+            self.logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:")
+            self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape))
+
+            self.logger.debug("Drop rows with NULL values if they exist..")
+            if self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape[0] != 0:
+                self.df_tracks.dropna(inplace=True)
+                self.logger.debug("Check if there are NULL values after the cleaning process:")
+                self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape))
+                self.logger.debug("Re-index the tracks DF..")
+                self.df_tracks = self.df_tracks.reset_index(drop=True)
+            else:
+                self.logger.info("There are no NULL values found.")
+
+            # export shuffled tracks to CSV format
+            exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class)
+            tracks_path = FindCreateDirectory(self.exports_path,
+                                              os.path.join(exports_dir, "tracks_csv_format")).inspect_directory()
+            self.df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(self.train_class)))
+            self.logger.debug("DF INFO:")
+            self.logger.debug("{}".format(self.df_tracks.info()))
+            self.logger.debug("COLUMNS CONTAIN OBJECTS: {}".format(
+                self.df_tracks.select_dtypes(include=['object']).columns))
+
+            self.df_feats = FeaturesDf(df_tracks=self.df_tracks,
+                                       train_class=self.train_class,
+                                       path_low_level=path_low_level,
+                                       config=self.config,
+                                       exports_path=self.exports_path,
+                                       log_level=self.log_level,
+                                       ).create_low_level_df()
+
+            self.y = self.df_tracks[self.train_class].values
+            self.logger.info("Features, Labels, and Tracks are exported successfully..")
+            return self.df_feats, self.y, self.df_tracks["track"].values
+        else:
+            return None, None, None
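A minimal sketch of how these two classes chain together in the training flow; the config values and paths are hypothetical stand-ins for what the project `.yaml` file provides:

```python
from transformation.load_groung_truth import GroundTruthLoad, DatasetExporter

# Hypothetical config values; the real dict is the loaded project .yaml file.
config = {"ground_truth_directory": "dataset", "class_dir": "danceability",
          "seed": 42, "exports_directory": "exports"}

gt = GroundTruthLoad(config, "groundtruth.yaml", "/tmp/my_project", log_level=1)
tracks = gt.export_gt_tracks()        # shuffled (track, label) tuples
class_name = gt.export_train_class()  # the target class, e.g. "danceability"

df_feats, y, track_names = DatasetExporter(config=config,
                                           tracks_list=tracks,
                                           train_class=class_name,
                                           exports_path="/tmp/my_project",
                                           log_level=1).create_df_tracks()
```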
diff --git a/models/sklearn/transformation/load_low_level.py b/models/sklearn/transformation/load_low_level.py
new file mode 100644
index 000000000..dd5ab1d34
--- /dev/null
+++ b/models/sklearn/transformation/load_low_level.py
@@ -0,0 +1,106 @@
+import os
+import json
+import pandas as pd
+from transformation.utils_preprocessing import flatten_dict_full
+from logging_tool import LoggerSetup
+
+
+class FeaturesDf:
+    """
+    Builds the features DataFrame from the JSON low-level data.
+
+    Attributes:
+        df_tracks (pandas DataFrame): The tracks DataFrame that contains the track name, track low-level path,
+                                      label, etc.
+    """
+    def __init__(self, df_tracks, train_class, path_low_level, config, exports_path, log_level):
+        self.df_tracks = df_tracks
+        self.train_class = train_class
+        self.path_low_level = path_low_level
+        self.config = config
+        self.exports_path = exports_path
+        self.log_level = log_level
+        self.list_feats_tracks = []
+        self.counter_items_transformed = 0
+        self.df_feats_tracks = pd.DataFrame()
+        self.df_feats_label = pd.DataFrame()
+
+        self.logger = ""
+
+        self.setting_logger()
+
+    def setting_logger(self):
+        # set up logger
+        self.logger = LoggerSetup(config=self.config,
+                                  exports_path=self.exports_path,
+                                  name="dataset_exports_transformations_{}".format(self.train_class),
+                                  train_class=self.train_class,
+                                  mode="a",
+                                  level=self.log_level).setup_logger()
+
+    def create_low_level_df(self):
+        """
+        Creates the low-level DataFrame. It also cleans the low-level data from the unnecessary features
+        before creating the DF.
+
+        :return:
+        DataFrame: The low-level features DataFrame from all the tracks in the collection.
+        """
+        self.logger.info("---- CREATE LOW LEVEL DATAFRAME ----")
+        # clear the list if it is not empty
+        self.list_feats_tracks.clear()
+        for index, row in self.df_tracks.iterrows():
+            path_low_data = os.path.join(self.path_low_level, "{}.json".format(row["track"]))
+            try:
+                with open(path_low_data) as f:
+                    data_feats_item = json.load(f, strict=False)
+            except Exception as e:
+                print("Exception occurred in loading file:", e)
+                self.logger.warning("Exception occurred in loading file: {}".format(e))
+            # remove unnecessary features data
+            try:
+                if 'beats_position' in data_feats_item['rhythm']:
+                    del data_feats_item['rhythm']['beats_position']
+            except Exception as e:
+                print("There is no 'rhythm' key in the low level data. Exception:", e)
+
+            # data dictionary transformed to a fully flattened dictionary
+            data_feats_item = flatten_dict_full(data_feats_item)
+
+            # append to a full tracks features pandas df
+            self.list_feats_tracks.append(dict(data_feats_item))
+
+            self.counter_items_transformed += 1
+
+        # the dictionary's keys are transformed to type list for the DF columns
+        self.df_feats_tracks = pd.DataFrame(self.list_feats_tracks, columns=list(self.list_feats_tracks[0].keys()))
+        self.logger.info("COLUMNS CONTAIN OBJECTS: \n{}".format(
+            self.df_feats_tracks.select_dtypes(include=['object']).columns))
+        self.logger.info("Exporting low-level data (dataframe)")
+        return self.df_feats_tracks
+
+    def check_processing_info(self):
+        """
+        Prints some information about the low-level data to DataFrame transformation step and its
+        intermediate processes.
+ :return: + """ + self.logger.info('Items parsed and transformed: {}'.format(self.counter_items_transformed)) + # The type of the dictionary's keys list is: + self.logger.info('Type of the list of features keys: {}'.format(type(self.list_feats_tracks[0].keys()))) + # The dictionary's keys list is transformed to type + self.logger.info('Confirm the type of list transformation of features keys: {}' + .format(type(list(self.list_feats_tracks[0].keys())))) + + def export_tracks_feats_df(self): + """ + :return: + DataFrame: The tracks with all the ground truth data and the corresponding low-level data flattened. + """ + self.logger.info("Concatenating the tracks/labels data DataFrame with the features DataFrame.") + self.logger.info("TRACKS SHAPE: {}".format(self.df_tracks.shape)) + self.logger.info("LOW LEVEL: {}".format(self.df_feats_tracks.shape)) + + self.df_feats_label = pd.concat([self.df_tracks, self.df_feats_tracks], axis=1) + self.logger.info("FULL: {}".format(self.df_feats_label.shape)) + self.logger.info("COLUMNS CONTAIN OBJECTS: {}" + .format(self.df_feats_label.select_dtypes(include=['object']).columns)) + return self.df_feats_label diff --git a/models/sklearn/transformation/transform.py b/models/sklearn/transformation/transform.py new file mode 100644 index 000000000..c0ba992c2 --- /dev/null +++ b/models/sklearn/transformation/transform.py @@ -0,0 +1,265 @@ +import pandas as pd +from termcolor import colored +import collections +import joblib +import os + +from utils import FindCreateDirectory +from transformation.utils_preprocessing import list_descr_handler +from transformation.utils_preprocessing import feats_selector_list +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, QuantileTransformer +from sklearn.pipeline import FeatureUnion +from sklearn.pipeline import Pipeline +from logging_tool import LoggerSetup + + +# avoid the module's method call deprecation +try: + collectionsAbc = collections.abc +except AttributeError: + collectionsAbc = collections + + +class Transform: + def __init__(self, config, df_feats, process, train_class, exports_path, log_level): + self.config = config + self.df_feats = df_feats + self.process = process + self.train_class = train_class + self.exports_path = exports_path + self.log_level = log_level + + self.list_features = [] + self.feats_cat_list = [] + self.feats_num_list = [] + self.df_cat = pd.DataFrame() + self.df_num = pd.DataFrame() + + self.feats_prepared = [] + self.logger = "" + self.setting_logger() + + def setting_logger(self): + # set up logger + self.logger = LoggerSetup(config=self.config, + exports_path=self.exports_path, + name="dataset_exports_transformations_{}".format(self.train_class), + train_class=self.train_class, + mode="a", + level=self.log_level).setup_logger() + + def post_processing(self): + print(colored("PROCESS: {}".format(self.process), "cyan")) + self.logger.debug("PROCESS: {}".format(self.process)) + self.logger.debug("Process: {}".format(self.config["processing"][self.process])) + # list_preprocesses = [] + + self.list_features = list(self.df_feats.columns) + + exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + models_path = FindCreateDirectory(self.exports_path, + os.path.join(exports_dir, "models")).inspect_directory() + + # clean list + print(colored("Cleaning..", "yellow")) + self.logger.info("Cleaning..") + cleaning_conf_list = list_descr_handler(self.config["excludedDescriptors"]) + feats_clean_list 
= feats_selector_list(self.df_feats.columns, cleaning_conf_list) + self.list_features = [x for x in self.df_feats.columns if x not in feats_clean_list] + self.logger.debug("List after cleaning some feats: {}".format(len(self.list_features))) + + # remove list + print(colored("Removing unnecessary features..", "yellow")) + self.logger.info("Removing unnecessary features..") + if self.config["processing"][self.process][0]["transfo"] == "remove": + remove_list = list_descr_handler(self.config["processing"][self.process][0]["params"]["descriptorNames"]) + feats_remove_list = feats_selector_list(self.df_feats.columns, remove_list) + self.list_features = [x for x in self.list_features if x not in feats_remove_list] + self.logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features))) + + # enumerate list + print(colored("Split numerical / categorical features..", "yellow")) + if self.config["processing"][self.process][1]["transfo"] == "enumerate": + enumerate_list = list_descr_handler(self.config["processing"][self.process][1]["params"]["descriptorNames"]) + self.feats_cat_list = feats_selector_list(self.list_features, enumerate_list) + self.logger.debug("Enumerating feats: {}".format(self.feats_cat_list)) + self.feats_num_list = [x for x in self.list_features if x not in self.feats_cat_list] + self.logger.debug("List Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List Cat feats: {}".format(len(self.feats_cat_list), "blue")) + + # BASIC + if self.process == "basic": + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + + num_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_num_list)) + ]) + + cat_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_cat_list)), + ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)) + ]) + + full_pipeline = FeatureUnion(transformer_list=[ + ("num_pipeline", num_pipeline), + ("cat_pipeline", cat_pipeline) + ]) + + self.feats_prepared = full_pipeline.fit_transform(self.df_feats) + + # save pipeline + joblib.dump(full_pipeline, os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process))) + + # LOW-LEVEL or MFCC + if self.process == "lowlevel" or self.process == "mfcc": + sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"]) + self.feats_num_list = feats_selector_list(self.feats_num_list, sel_list) + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + + num_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_num_list)) + ]) + + cat_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_cat_list)), + ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)) + ]) + + full_pipeline = FeatureUnion(transformer_list=[ + ("num_pipeline", num_pipeline), + ("cat_pipeline", cat_pipeline) + ]) + + self.feats_prepared = full_pipeline.fit_transform(self.df_feats) + + # save pipeline + joblib.dump(full_pipeline, os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process))) + + # NOBANDS + if self.process == "nobands": + sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"]) + feats_rem_list = feats_selector_list(self.df_feats, sel_list) + self.feats_num_list = [x for x in self.feats_num_list if x not in feats_rem_list] + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + + num_pipeline = Pipeline([ + ('selector', 
DataFrameSelector(self.feats_num_list)) + ]) + + cat_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_cat_list)), + ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)) + ]) + + full_pipeline = FeatureUnion(transformer_list=[ + ("num_pipeline", num_pipeline), + ("cat_pipeline", cat_pipeline) + ]) + + self.feats_prepared = full_pipeline.fit_transform(self.df_feats) + + # save pipeline + joblib.dump(full_pipeline, os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process))) + + # NORMALIZED + if self.process == "normalized": + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + num_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_num_list)), + ('minmax_scaler', MinMaxScaler()), + ]) + + cat_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_cat_list)), + ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)) + ]) + + full_pipeline = FeatureUnion(transformer_list=[ + ("num_pipeline", num_pipeline), + ("cat_pipeline", cat_pipeline) + ]) + + self.feats_prepared = full_pipeline.fit_transform(self.df_feats) + + # save pipeline + joblib.dump(full_pipeline, os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process))) + + # GAUSSIANIZED + if self.process == "gaussianized": + gauss_list = list_descr_handler(self.config["processing"][self.process][3]["params"]["descriptorNames"]) + feats_num_gauss_list = feats_selector_list(self.feats_num_list, gauss_list) + feats_num_no_gauss_list = [x for x in self.feats_num_list if x not in feats_num_gauss_list] + + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list))) + self.logger.debug("List post-Num-No-Gauss feats: {}".format(len(feats_num_no_gauss_list))) + + num_norm_pipeline = Pipeline([ + ("selector_num", DataFrameSelector(self.feats_num_list)), + ("minmax_scaler", MinMaxScaler()) + ]) + + cat_pipeline = Pipeline([ + ('selector', DataFrameSelector(self.feats_cat_list)), + ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)) + ]) + + full_normalize_pipeline = FeatureUnion(transformer_list=[ + ("num_pipeline", num_norm_pipeline), + ("cat_pipeline", cat_pipeline) + ]) + + self.feats_prepared = full_normalize_pipeline.fit_transform(self.df_feats) + self.logger.debug("Feats prepared normalized shape: {}".format(self.feats_prepared.shape)) + # save pipeline + joblib.dump(full_normalize_pipeline, + os.path.join(models_path, "full_normalize_pipeline_{}.pkl".format(self.process))) + self.df_feats = pd.DataFrame(data=self.feats_prepared) + columns = list(self.df_feats.columns) + # print(columns) + select_rename_list = columns[:len(self.feats_num_list)] + select_rename_list = self.feats_num_list + select_no_rename_list = columns[len(self.feats_num_list):] + print(select_no_rename_list) + new_feats_columns = select_rename_list + select_no_rename_list + self.df_feats.columns = new_feats_columns + self.logger.debug("Normalized Features DF:") + self.logger.debug("\n{}".format(self.df_feats)) + self.logger.debug("Shape: {}".format(self.df_feats.shape)) + + feats_no_gauss_list = [x for x in new_feats_columns if x not in feats_num_gauss_list] + + num_gauss_pipeline = Pipeline([ + ("gauss_sel_num", DataFrameSelector(feats_num_gauss_list)), + ("gauss_scaler", QuantileTransformer(n_quantiles=1000)) + ]) + + num_no_gauss_pipeline = Pipeline([ + ("gauss_sel_num", DataFrameSelector(feats_no_gauss_list)) + ]) + + 
full_gauss_pipeline = FeatureUnion(transformer_list=[
+                ("num_gauss_pipeline", num_gauss_pipeline),
+                ("num_no_gauss_pipeline", num_no_gauss_pipeline)
+            ])
+
+            self.feats_prepared = full_gauss_pipeline.fit_transform(self.df_feats)
+
+            # save pipeline
+            joblib.dump(full_gauss_pipeline,
+                        os.path.join(models_path, "full_gauss_pipeline_{}.pkl".format(self.process)))
+
+        return self.feats_prepared
+
+
+# Create a class to select numerical or categorical columns
+class DataFrameSelector(BaseEstimator, TransformerMixin):
+    def __init__(self, attribute_names):
+        self.attribute_names = attribute_names
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        return X[self.attribute_names].values
\ No newline at end of file
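A minimal sketch of how the `Transform` class above would be driven for one preprocessing step; `config` and `df_feats` are assumed to be the loaded project configuration dict and the flattened features DataFrame:

```python
from transformation.transform import Transform

# config / df_feats assumed available; the process name selects one branch above.
feats_prepared = Transform(config=config,
                           df_feats=df_feats,
                           process="normalized",  # one of: basic, lowlevel, mfcc, nobands, normalized, gaussianized
                           train_class="danceability",
                           exports_path="/tmp/my_project",
                           log_level=1).post_processing()
```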
diff --git a/models/sklearn/transformation/transform_predictions.py b/models/sklearn/transformation/transform_predictions.py
new file mode 100644
index 000000000..514c4e616
--- /dev/null
+++ b/models/sklearn/transformation/transform_predictions.py
@@ -0,0 +1,181 @@
+import pandas as pd
+from termcolor import colored
+import collections
+import joblib
+import os
+
+from utils import FindCreateDirectory
+from transformation.utils_preprocessing import list_descr_handler
+from transformation.utils_preprocessing import feats_selector_list
+from sklearn.base import BaseEstimator, TransformerMixin
+from logging_tool import LoggerSetup
+
+# avoid the module's method call deprecation
+try:
+    collectionsAbc = collections.abc
+except AttributeError:
+    collectionsAbc = collections
+
+
+class TransformPredictions:
+    def __init__(self, config, df_feats, process, train_class, exports_path, log_level):
+        self.config = config
+        self.df_feats = df_feats
+        self.process = process
+        self.train_class = train_class
+        self.exports_path = exports_path
+        self.log_level = log_level
+
+        self.logger = ""
+        self.list_features = []
+        self.feats_cat_list = []
+        self.feats_num_list = []
+
+        self.feats_prepared = []
+
+        self.setting_logger()
+
+    def setting_logger(self):
+        # set up logger
+        self.logger = LoggerSetup(config=self.config,
+                                  exports_path=self.exports_path,
+                                  name="predict_{}".format(self.train_class),
+                                  train_class=self.train_class,
+                                  mode="a",
+                                  level=self.log_level).setup_logger()
+
+    def post_processing(self):
+        print(colored("PROCESS: {}".format(self.process), "cyan"))
+
+        self.logger.debug("Track Features - Low Level: {}".format(self.df_feats))
+        self.logger.debug("Shape of DF: {}".format(self.df_feats.shape))
+
+        self.list_features = list(self.df_feats.columns)
+
+        exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class)
+        models_path = FindCreateDirectory(self.exports_path,
+                                          os.path.join(exports_dir, "models")).inspect_directory()
+
+        # clean list
+        print(colored("Cleaning..", "yellow"))
+        cleaning_conf_list = list_descr_handler(self.config["excludedDescriptors"])
+        self.logger.debug("cleaning list: {}".format(cleaning_conf_list))
+        feats_clean_list = feats_selector_list(self.df_feats.columns, cleaning_conf_list)
+        self.list_features = [x for x in self.df_feats.columns if x not in feats_clean_list]
+        self.logger.debug("List after cleaning some feats: {}".format(len(self.list_features)))
+
+        # remove list
+        print(colored("Removing unnecessary features..", "yellow"))
+        if self.config["processing"][self.process][0]["transfo"] == "remove":
+            remove_list = list_descr_handler(self.config["processing"][self.process][0]["params"]["descriptorNames"])
+            feats_remove_list = feats_selector_list(self.df_feats.columns, remove_list)
+            self.list_features = [x for x in self.list_features if x not in feats_remove_list]
+            self.logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features)))
+
+        # enumerate list
+        print(colored("Split numerical / categorical features..", "yellow"))
+        if self.config["processing"][self.process][1]["transfo"] == "enumerate":
+            enumerate_list = list_descr_handler(self.config["processing"][self.process][1]["params"]["descriptorNames"])
+            self.feats_cat_list = feats_selector_list(self.list_features, enumerate_list)
+            self.logger.debug("Enumerating feats: {}".format(self.feats_cat_list))
+            self.feats_num_list = [x for x in self.list_features if x not in self.feats_cat_list]
+            self.logger.debug("List Num feats: {}".format(len(self.feats_num_list)))
+            self.logger.debug("List Cat feats: {}".format(len(self.feats_cat_list)))
+
+        # BASIC
+        if self.process == "basic":
+            print(colored("Process doing: {}".format(self.process), "green"))
+            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
+
+            # load pipeline
+            full_pipeline = joblib.load(os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process)))
+
+            self.feats_prepared = full_pipeline.transform(self.df_feats)
+
+        # LOW-LEVEL or MFCC
+        if self.process == "lowlevel" or self.process == "mfcc":
+            print(colored("Process doing: {}".format(self.process), "green"))
+            sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"])
+            self.feats_num_list = feats_selector_list(self.feats_num_list, sel_list)
+            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
+
+            # load pipeline
+            full_pipeline = joblib.load(os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process)))
+
+            self.feats_prepared = full_pipeline.transform(self.df_feats)
+
+        # NOBANDS
+        if self.process == "nobands":
+            print(colored("Process doing: {}".format(self.process), "green"))
+            sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"])
+            feats_rem_list = feats_selector_list(self.df_feats.columns, sel_list)
+            self.feats_num_list = [x for x in self.feats_num_list if x not in feats_rem_list]
+            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
+
+            # load pipeline
+            full_pipeline = joblib.load(os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process)))
+
+            self.feats_prepared = full_pipeline.transform(self.df_feats)
+
+        # NORMALIZED
+        if self.process == "normalized":
+            print(colored("Process doing: {}".format(self.process), "green"))
+            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
+
+            # load pipeline
+            full_pipeline = joblib.load(os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process)))
+
+            self.feats_prepared = full_pipeline.transform(self.df_feats)
+
+        # GAUSSIANIZED
+        if self.process == "gaussianized":
+            print(colored("Process doing: {}".format(self.process), "green"))
+            gauss_list = list_descr_handler(self.config["processing"][self.process][3]["params"]["descriptorNames"])
+            feats_num_gauss_list = feats_selector_list(self.feats_num_list, gauss_list)
+            feats_num_no_gauss_list = [x for x in self.feats_num_list if x not in feats_num_gauss_list]
+
+            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
+            self.logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list)))
+
+            # load normalization pipeline
"full_pipeline_{}.pkl".format(self.process))) + full_normalize_pipeline = joblib.load(os.path.join(models_path, + "full_normalize_pipeline_{}.pkl".format(self.process))) + # normalize + self.feats_prepared = full_normalize_pipeline.transform(self.df_feats) + + # transform numpy array to pandas DF for guassianizing + self.df_feats = pd.DataFrame(data=self.feats_prepared) + columns = list(self.df_feats.columns) + # print(columns) + select_rename_list = columns[:len(self.feats_num_list)] + select_rename_list = self.feats_num_list + select_no_rename_list = columns[len(self.feats_num_list):] + self.logger.debug("Selected no rename list: {}".format(select_no_rename_list)) + new_feats_columns = select_rename_list + select_no_rename_list + self.df_feats.columns = new_feats_columns + self.logger.debug("Normalized Features DF:") + self.logger.debug("\n{}".format(self.df_feats)) + self.logger.debug("Shape: {}".format(self.df_feats.shape)) + # feats_no_gauss_list = [x for x in new_feats_columns if x not in feats_num_gauss_list] + + # load guassianization pipeline + full_gauss_pipeline = joblib.load(os.path.join(models_path, + "full_gauss_pipeline_{}.pkl".format(self.process))) + + self.feats_prepared = full_gauss_pipeline.transform(self.df_feats) + + return self.feats_prepared + + +# Create a class to select numerical or categorical columns +class DataFrameSelector(BaseEstimator, TransformerMixin): + def __init__(self, attribute_names): + self.attribute_names = attribute_names + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X[self.attribute_names].values \ No newline at end of file diff --git a/models/sklearn/transformation/utils_preprocessing.py b/models/sklearn/transformation/utils_preprocessing.py new file mode 100644 index 000000000..9375e739f --- /dev/null +++ b/models/sklearn/transformation/utils_preprocessing.py @@ -0,0 +1,69 @@ +import os +import re +import pandas as pd +import collections +from sklearn.preprocessing import OneHotEncoder +import joblib +from utils import load_yaml, FindCreateDirectory, TrainingProcesses + + +def flatten_dict_full(dictionary, sep="_"): + """ + + :param dictionary: + :param sep: + :return: + """ + obj = collections.OrderedDict() + + def recurse(t, parent_key=""): + if isinstance(t, list): + for i in range(len(t)): + recurse(t[i], parent_key + sep + str(i) if parent_key else str(i)) + elif isinstance(t, dict): + for k, v in t.items(): + recurse(v, parent_key + sep + k if parent_key else k) + else: + obj[parent_key] = t + + recurse(dictionary) + + return obj + + +def list_descr_handler(descr_list): + """ + + :param descr_list: + :return: + """ + keys_list_handle = [] + for item in descr_list: + if item.endswith(".*"): + item = item.replace(".*", "_") + elif item.startswith("*."): + item = item.replace("*.", "_") + else: + item = item.replace("*", "") + item = item.replace(".", "_") + keys_list_handle.append(item) + return keys_list_handle + + +def feats_selector_list(df_feats_columns, feats_select_list): + """ + + :param df_feats_columns: + :param feats_select_list: + :return: + """ + columns_list = list(df_feats_columns) + columns_select_list = [] + counter_feats = 0 + for item in feats_select_list: + for sel_item in columns_list: + if re.search(item, sel_item): + columns_select_list.append(sel_item) + counter_feats += 1 + print("features selected: {}".format(counter_feats)) + return columns_select_list diff --git a/models/sklearn/utils.py b/models/sklearn/utils.py new file mode 100644 index 000000000..932eaa02d --- /dev/null +++ 
diff --git a/models/sklearn/utils.py b/models/sklearn/utils.py
new file mode 100644
index 000000000..932eaa02d
--- /dev/null
+++ b/models/sklearn/utils.py
@@ -0,0 +1,166 @@
+import os
+
+
+def load_yaml(path_file):
+    """
+    Loads a yaml file into a Python dict.
+
+    :param path_file: The path (str) of the yaml file, relative to the app directory.
+    :return: The configuration data (dict), or None if the file does not parse into a dict.
+    """
+    try:
+        import yaml
+        with open(os.path.join(os.path.abspath(os.getcwd()), path_file)) as file:
+            config_data = yaml.load(file, Loader=yaml.FullLoader)
+            if isinstance(config_data, dict):
+                return config_data
+            else:
+                return None
+    except ImportError:
+        print('WARNING: could not import yaml module')
+        return None
+
+
+class DfChecker:
+    """
+    Prints inspection information about a pandas DataFrame.
+    """
+    def __init__(self, df_check):
+        """
+        :param df_check: The DataFrame to be checked.
+        """
+        self.df_check = df_check
+
+    def check_df_info(self):
+        """
+        Prints information about the pandas DataFrame that is generated from the relevant process.
+        :return:
+        """
+        print('Features DataFrame head:')
+        print(self.df_check.head())
+        print()
+        print('Information:')
+        print(self.df_check.info())
+        print()
+        print('Shape:', self.df_check.shape)
+        print('Number of columns:', len(list(self.df_check.columns)))
+
+        if 'category' in self.df_check.columns:
+            print('Track categories distribution:')
+            print(self.df_check['category'].value_counts())
+
+
+class FindCreateDirectory:
+    """
+    Finds or creates a directory inside the exports path.
+    """
+    def __init__(self, exports_path, directory):
+        """
+        :param exports_path: The path (str) the directory will be located in.
+        :param directory: The directory (str) to be found or created.
+        """
+        self.exports_path = exports_path
+        self.directory = directory
+
+    def inspect_directory(self):
+        """
+        :return: The full path (str) of the directory; it is created first if it does not exist.
+        """
+        full_path = os.path.join(self.exports_path, self.directory)
+        # create the path directories if they do not exist --> else return the path
+        if not os.path.exists(full_path):
+            os.makedirs(full_path)
+        return full_path
+
+
+class LogsDeleter:
+    def __init__(self, config, train_class):
+        self.config = config
+        self.train_class = train_class
+
+    def delete_logs(self):
+        # delete logs for a specific model and class on a new run
+        if self.config["delete_logs"] is True:
+            print("Evaluation logs deletion is turned to ON.")
+            dir_name = os.path.join(os.getcwd(), "evaluations")
+            evaluations_list = os.listdir(dir_name)
+            for item in evaluations_list:
+                if item.endswith(".txt"):
+                    if item.startswith("{}_{}".format(self.train_class, self.config["train_kind"])):
+                        os.remove(os.path.join(dir_name, item))
+            print("Previous evaluation logs deleted successfully.")
+        else:
+            print("Evaluation logs deletion is turned to OFF.")
+
+
+def change_weights_values(i):
+    if i is True:
+        return "balanced"
+    elif i is False:
+        return None
+    return i
+
+
+class TrainingProcesses:
+    def __init__(self, config):
+        self.config = config
+
+    def training_processes(self):
+        """
+        Extracts the training processes from the configuration file.
+
+        :return:
+        processes: A list of the processes that have been identified with the corresponding parameter grid.
+        """
+        evaluations = self.config["evaluations"]["nfoldcrossvalidation"]
+        print("Evaluations counted: {}".format(len(evaluations)))
+        evaluation_counter = 0
+        trainings_counted = 0
+        processes = []
+        for evaluation in evaluations:
+            for nfold_number in evaluation["nfold"]:
+                classifiers = self.config["classifiers"]["svm"]
+                for classifier in classifiers:
+                    for pre_processing in classifier["preprocessing"]:
+                        for clf_type in classifier["type"]:
+                            if clf_type == "C-SVC":
+                                process_dict = dict()
+                                process_dict["Evaluation"] = evaluation_counter
+                                # classifier
+                                process_dict["classifier"] = clf_type
+                                # pre-processing
+                                process_dict["preprocess"] = pre_processing
+                                # kernel
+                                kernel = classifier["kernel"]
+                                process_dict["kernel"] = [i.lower() for i in kernel]
+                                # C
+                                c = classifier["C"]
+                                process_dict["C"] = [2 ** x for x in c]  # 2 ** C
+                                # gamma
+                                gamma = classifier["gamma"]
+                                process_dict["gamma"] = [2 ** x for x in gamma]  # 2 ** gamma
+                                # class weights
+                                balance_classes = classifier["balanceClasses"]
+                                process_dict["balanceClasses"] = [change_weights_values(i) for i in balance_classes]
+                                # n_fold
+                                process_dict["n_fold"] = nfold_number
+                                processes.append(process_dict)
+                                # increase the trainings counter by 1
+                                trainings_counted += 1
+            # increase the evaluation counter by 1
+            evaluation_counter += 1
+
+        print("Trainings to be applied: {}".format(trainings_counted))
+
+        return processes
+
+
+if __name__ == '__main__':
+    conf_data = load_yaml("configuration_template.yaml")
+    print(conf_data)
+
+    test = FindCreateDirectory(os.getcwd(), 'exports').inspect_directory()

From 3e5ec508fdda2c9e0b221f50a40d27a25e4a0fca Mon Sep 17 00:00:00 2001
From: Pantelis Tzamalis
Date: Mon, 3 Aug 2020 15:35:49 +0300
Subject: [PATCH 02/64] PEP 8 issues fix

PEP 8 issues fix 02
remove gaia best models
params issue fixed - tracks in gt file do not exist in low-level
predict with MBID only (not the whole API low-level url)
add sklearn requirements.txt
create gaia evaluation method
change name of the model directory
load updated files
change the path saves - reduce print/logging messages in evaluation
requirements.txt --> lowercase
remove requirements.txt
add requirements.txt with lowercase
use of now.isoformat()
reports creation - datetime to start of the report, code improvements
review updates 01
syntax for documentation strings as in AB
fixed ground_truth file typo
use of os.makedirs(full_path, exist_ok=True)
create results_dict = {....} in a single call
simplify processing step dict creation - documentation added
add lower-case in all process steps params
save best model after training to the whole data - add README
split evaluation in separate methods
split grid classification into separate methods
single logger setup
change logging set from int to str
dynamic project yaml saving - update readme predict section
add new arguments in readme
readme - add predict invoking
readme MBID in prediction
create project - default values declaration - documentation
relative imports
update predict readme and script arguments
import classification_project
readme path file required
readme, how it works session - remove requirements jupyter notebook
requirements.txt - add requests, remove tensorflow
readme - add how training and predicting modes work
add new dockerfile for ML tool
readme - update doc only for parameters
add exports_directory in config when specified in arguments
sklearn model inserted in AB
---
 dataset_eval/evaluate.py | 42 ++-
 docker/Dockerfile.py3 | 120 ++++++
 models/sklearn/README.md | 232 ++++++++++++
 models/sklearn/__init__.py | 2 +-
 models/sklearn/classification/__init__.py | 2 +-
 .../classification/classification_task.py | 73 ++--
 .../classification_task_manager.py | 59 +--
 .../sklearn/classification/classifierGRID.py | 153 --------
 ...classifierBASIC.py => classifier_basic.py} | 18 +-
 .../sklearn/classification/classifier_grid.py | 177 +++++++++
 .../classification/confusion_matrix.py | 20 -
 models/sklearn/classification/evaluation.py | 352 ++++++++++--------
 .../classification/report_files_export.py | 30 +-
 models/sklearn/classification/train_class.py | 42 ++-
 .../sklearn/create_classification_project.py | 129 -------
 .../jmp_results_danceability.param | 11 -
 .../jmp_results_danceability.results.html | 7 -
.../jmp_results_tonal_atonal.param | 11 - .../jmp_results_tonal_atonal.results.html | 7 - .../jmp_results_voice_instrumental.param | 11 - ...mp_results_voice_instrumental.results.html | 7 - models/sklearn/gaia_imitation_best_model.py | 103 ----- models/sklearn/helper_functions/__init__.py | 1 + .../{ => helper_functions}/logging_tool.py | 38 +- .../sklearn/{ => helper_functions}/utils.py | 120 +++--- models/sklearn/model/__init__.py | 1 + .../sklearn/model/classification_project.py | 138 +++++++ .../{ => model}/configuration_template.yaml | 25 +- models/sklearn/{ => model}/predict.py | 80 ++-- .../{REQUIREMENTS.txt => requirements.txt} | 6 +- models/sklearn/transformation/__init__.py | 2 +- ...d_groung_truth.py => load_ground_truth.py} | 157 ++++---- .../sklearn/transformation/load_low_level.py | 31 +- models/sklearn/transformation/transform.py | 18 +- .../transformation/transform_predictions.py | 16 +- .../transformation/utils_preprocessing.py | 27 +- requirements.txt | 2 +- 37 files changed, 1278 insertions(+), 992 deletions(-) create mode 100644 docker/Dockerfile.py3 create mode 100644 models/sklearn/README.md delete mode 100644 models/sklearn/classification/classifierGRID.py rename models/sklearn/classification/{classifierBASIC.py => classifier_basic.py} (53%) create mode 100644 models/sklearn/classification/classifier_grid.py delete mode 100644 models/sklearn/classification/confusion_matrix.py delete mode 100644 models/sklearn/create_classification_project.py delete mode 100644 models/sklearn/gaia_best_models/jmp_results_danceability.param delete mode 100644 models/sklearn/gaia_best_models/jmp_results_danceability.results.html delete mode 100644 models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param delete mode 100644 models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html delete mode 100644 models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param delete mode 100644 models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html delete mode 100644 models/sklearn/gaia_imitation_best_model.py create mode 100644 models/sklearn/helper_functions/__init__.py rename models/sklearn/{ => helper_functions}/logging_tool.py (75%) rename models/sklearn/{ => helper_functions}/utils.py (54%) create mode 100644 models/sklearn/model/__init__.py create mode 100644 models/sklearn/model/classification_project.py rename models/sklearn/{ => model}/configuration_template.yaml (88%) rename models/sklearn/{ => model}/predict.py (64%) rename models/sklearn/{REQUIREMENTS.txt => requirements.txt} (76%) rename models/sklearn/transformation/{load_groung_truth.py => load_ground_truth.py} (59%) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 4691c0d46..f4ce5d594 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -18,7 +18,7 @@ import utils.path from dataset_eval import artistfilter from dataset_eval import gaia_wrapper - +from models.sklearn.model.classification_project import create_classification_project SLEEP_DURATION = 30 # number of seconds to wait between runs @@ -60,21 +60,9 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): with open(groundtruth_path, "w") as f: yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) - logging.info("Training model...") - results = gaia_wrapper.train_model( - project_dir=eval_location, - groundtruth_file=groundtruth_path, - filelist_file=filelist_path, - ) - logging.info("Saving results...") - save_history_file(storage_dir, results["history_path"], 
eval_job["id"]) - db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ - "project_path": eval_location, - "parameters": results["parameters"], - "accuracy": results["accuracy"], - "confusion_matrix": results["confusion_matrix"], - "history_path": results["history_path"], - })) + logging.info("Training GAIA model...") + evaluate_gaia(eval_location, groundtruth_path, filelist_path, storage_dir, eval_job) + db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE) logging.info("Evaluation job %s has been completed." % eval_job["id"]) @@ -95,6 +83,28 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): shutil.rmtree(temp_dir) +def evaluate_gaia(eval_location, groundtruth_path, filelist_path, storage_dir, eval_job): + results = gaia_wrapper.train_model( + project_dir=eval_location, + groundtruth_file=groundtruth_path, + filelist_file=filelist_path, + ) + logging.info("Saving results...") + save_history_file(storage_dir, results["history_path"], eval_job["id"]) + db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ + "project_path": eval_location, + "parameters": results["parameters"], + "accuracy": results["accuracy"], + "confusion_matrix": results["confusion_matrix"], + "history_path": results["history_path"], + })) + + +def evaluate_sklearn(eval_location, groundtruth_path, filelist_path, storage_dir, eval_job): + # create_classification_project(ground_truth_directory=groundtruth_path) + pass + + def create_groundtruth_dict(name, datadict): groundtruth = { "type": "unknown", # TODO: See if that needs to be modified. diff --git a/docker/Dockerfile.py3 b/docker/Dockerfile.py3 new file mode 100644 index 000000000..689ae742a --- /dev/null +++ b/docker/Dockerfile.py3 @@ -0,0 +1,120 @@ +FROM metabrainz/python:3.7 AS acousticbrainz-sklearn + +# Dockerize +ENV DOCKERIZE_VERSION v0.6.1 +RUN wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \ + && tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz + +# Install dependencies +# Hadolint DL4006 +SHELL ["/bin/bash", "-o", "pipefail", "-c"] +# Node +RUN wget -q -O - https://deb.nodesource.com/setup_12.x | bash - && apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + git \ + ipython \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ + libavresample-dev \ + libffi-dev \ + libfftw3-dev \ + libpq-dev \ + libsamplerate0-dev \ + libqt4-dev \ + libssl-dev \ + libtag1-dev \ + libxml2-dev \ + libxslt1-dev \ + libyaml-dev \ + nodejs \ + pkg-config \ + pxz \ + python-dev \ + python-numpy-dev \ + python-numpy \ + swig2.0 \ + && rm -rf /var/lib/apt/lists/* + +RUN mkdir /code +RUN mkdir /data +WORKDIR /code + +RUN groupadd --gid 901 acousticbrainz +RUN useradd --create-home --shell /bin/bash --uid 901 --gid 901 acousticbrainz + +RUN chown acousticbrainz:acousticbrainz /code + +# Python dependencies +RUN mkdir /code/docs/ && chown acousticbrainz:acousticbrainz /code/docs/ +COPY --chown=acousticbrainz:acousticbrainz docs/requirements.txt /code/docs/requirements.txt +COPY --chown=acousticbrainz:acousticbrainz requirements.txt /code/requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + + +FROM acousticbrainz-base AS acousticbrainz-dev + +COPY --chown=acousticbrainz:acousticbrainz requirements_development.txt /code/requirements_development.txt +RUN pip install --no-cache-dir -r requirements_development.txt + +# install sklearn ML tool requirements +RUN 
mkdir /code/models/sklearn/ && chown acousticbrainz:acousticbrainz /code/models/sklearn/
+COPY --chown=acousticbrainz:acousticbrainz models/sklearn/requirements.txt /code/models/sklearn/requirements.txt
+RUN pip install --no-cache-dir -r models/sklearn/requirements.txt
+
+
+# We don't copy code to the dev image because it's added with a volume mount
+# during development, however it's needed for tests. Add it here.
+FROM acousticbrainz-dev AS acousticbrainz-test
+
+COPY . /code
+
+
+FROM acousticbrainz-base AS acousticbrainz-prod
+USER root
+
+RUN pip install --no-cache-dir uWSGI==2.0.17.1
+
+RUN mkdir /cache_namespaces && chown -R acousticbrainz:acousticbrainz /cache_namespaces
+
+# Consul template service is already set up, just need to copy the configuration
+COPY ./docker/consul-template.conf /etc/consul-template.conf
+
+# runit service files
+# All services are created with a `down` file, preventing them from starting
+# rc.local removes the down file for the specific service we want to run in a container
+# http://smarden.org/runit/runsv.8.html
+
+# uwsgi service files
+COPY ./docker/uwsgi/uwsgi.service /etc/service/uwsgi/run
+COPY ./docker/uwsgi/uwsgi.ini /etc/uwsgi/uwsgi.ini
+RUN touch /etc/service/uwsgi/down
+
+# hl_extractor service files
+COPY ./docker/hl_extractor/hl_extractor.service /etc/service/hl_extractor/run
+RUN touch /etc/service/hl_extractor/down
+
+# dataset evaluator service files
+COPY ./docker/dataset_eval/dataset_eval.service /etc/service/dataset_eval/run
+RUN touch /etc/service/dataset_eval/down
+
+# Add cron jobs
+COPY docker/crontab /etc/cron.d/acousticbrainz
+RUN chmod 0644 /etc/cron.d/acousticbrainz
+RUN touch /etc/service/cron/down
+
+COPY ./docker/rc.local /etc/rc.local
+
+COPY --chown=acousticbrainz:acousticbrainz package.json /code
+
+USER acousticbrainz
+RUN npm install
+
+COPY --chown=acousticbrainz:acousticbrainz . /code
+
+RUN npm run build:prod
+
+# Our entrypoint runs as root
+USER root
diff --git a/models/sklearn/README.md b/models/sklearn/README.md
new file mode 100644
index 000000000..b5b4ae443
--- /dev/null
+++ b/models/sklearn/README.md
@@ -0,0 +1,232 @@
+# Machine Learning Infrastructure with scikit-learn (GSoC 2020)
+
+This repository contains the tool that is built for training SVM models on
+AcousticBrainz's datasets, as well as for predicting the class of a single
+AcousticBrainz track based on the trained models. It is part of *Google Summer
+of Code 2020*, in collaboration with the **MetaBrainz** open-source organization.
+
+Given a dataset, a Grid Search algorithm using n-fold cross-validation is executed
+for an exhaustive search over specified parameter values for an estimator.
+
+A final model is trained on all the data (without a validation set), featuring
+the best parameter combination in terms of accuracy.
+
+Finally, a prediction functionality is part of the tool, which gives the user the
+capability of predicting the class of a track instance based on a trained model.
+
+
+## Functionalities
+
+### Train
+The main model training function is `create_classification_project`, which is located in
+the `model.classification_project.py` Python script. It can be imported as a module.
+It requires the path of the dataset directory that contains sub-folders
+composed of the groundtruth yaml file/s (tracks, tracks' paths, labels, target class) and
+the features (low-level data) in JSON format.
+
+```
+create_classification_project()
+
+Generates a model trained using the descriptor files specified in the groundtruth yaml file.
+
+positional parameters:
+groundtruth     Path of the main dataset directory containing the
+                groundtruth yaml file/s. (required)
+
+file            Name of the project configuration file (.yaml) that will be stored.
+                If not specified, it is named automatically.
+
+exportsdir      Name of the exports directory in which the project's results
+                will be stored (best model, grid models, transformation
+                pipelines, folded and shuffled dataset).
+
+path            Path where the project results will be stored. If empty,
+                the results will be saved in the main app directory.
+
+optional parameters:
+
+logging         The logging level (int) that will be printed (0: DEBUG, 1: INFO,
+                2: WARNING, 3: ERROR, 4: CRITICAL). Can be set only to the
+                prescribed integer values (0, 1, 2, 3, 4).
+
+seed            Seed (int) used to generate the random shuffled dataset
+                applied later to folding. If no seed is specified, it
+                will be set automatically to the current clock value.
+
+jobs            Parallel jobs (int). Set a value of cores to be used.
+                The default is -1, which means that all the available cores
+                will be used.
+
+verbose         Controls the verbosity (int) of the Grid Search print messages
+                on the console: the higher, the more messages.
+```
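For instance, a call from Python could look like the following sketch. The keyword names mirror the documented parameters above and are assumptions, apart from `ground_truth_directory`, which appears in the `dataset_eval/evaluate.py` hunk earlier in this patch:

```python
from model.classification_project import create_classification_project

# Hypothetical call; argument names mirror the documented parameters above.
create_classification_project(ground_truth_directory="dataset",
                              project_file="project_danceability",
                              exports_directory="exports_danceability",
                              exports_path="/path/to/exports",
                              logging=1,
                              seed=None,
                              jobs=-1,
                              verbose=1)
```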
+
+For example, a path directory structure could be like this one:
+
+    dataset (e.g. danceability)
+    |- features
+    |  |-happy
+    |  |  |- 1.json
+    |  |  |- 2.json
+    |  |  |- 3.json
+    |  |  |- 4.json
+    |  |-sad
+    |  |  |- 1.json
+    |  |  |- 2.json
+    |  |  |- 3.json
+    |- metadata
+    |  |- groundtruth.yaml
+
+The tool will train a model with 2 classes (happy, sad), with 4 and 3 files in each class, respectively.
+
+The tool generates a `.yaml` project file in the path and exports directory specified either by the
+arguments or automatically by the tool itself. This project file contains information about the
+preprocessing steps that are followed through the training process, as well as the path and directory
+where the results of the model training will be stored.
+
+
+### How the Training mode works
+
+There are several steps which are followed in the training phase. First of all, the project
+configuration template file is loaded. Then, based on the arguments that are specified via the
+`create_classification_project` function invocation, the `ListGroundTruthFiles` class searches for
+the available `.yaml` file/s which contain the target class and the *groundtruth* data. These files
+are inside the specified dataset directory.
+
+Afterwards, for each target class, the following actions take place inside the
+`train_class` function:
+
+1. It starts with the `GroundTruthLoad` class that loads the *groundtruth* data from the related `.yaml` file. By
+   using its included methods, the shuffled (track, label) tuples are exported, as well as the target class, by
+   exploiting the `export_gt_tracks()` and the `export_train_class()` accordingly. The shuffled
+   dataset is also exported and saved locally in `.csv` format. A logger object is also set up and the logging
+   results are exported into the relevant `.log` file.
+
+2. It creates a project configuration file based on the specified paths for the exported results, as well as
+   a relevant directory that these results will be stored to.
+
+3. The `DatasetExporter` class is then used to load the tracks' features and export them in a `pandas DataFrame`.
+   The tracks and the labels are also exported in separate `NumPy arrays` too.
+
+
+### How the Training mode works
+
+There are several steps that are followed in the training phase. First of all, the project
+configuration template file is loaded. Then, based on the arguments specified when the
+`create_classification_project` function is invoked, the `ListGroundTruthFiles` class searches for
+the available `.yaml` file/s which contain the target class and the *groundtruth* data. These files
+are located inside the specified dataset directory.
+
+Afterwards, for each target class, the following actions take place inside the
+`train_class` function:
+
+1. It starts with the `GroundTruthLoad` class, which loads the *groundtruth* data from the related `.yaml` file.
+   Its methods export the shuffled tracks with their labels (as tuples), as well as the target class, using
+   `export_gt_tracks()` and `export_train_class()` respectively. The shuffled dataset is also exported and
+   saved locally in `.csv` format. A logger object is also set up, and the logging results are exported into
+   the relevant `.log` file.
+
+2. It creates a project configuration file based on the specified paths for the exported results, as well as
+   a relevant directory that these results will be stored in.
+
+3. The `DatasetExporter` class is then used to load the tracks' features and export them as a pandas
+   `DataFrame`. The tracks and the labels are also exported as separate NumPy arrays.
+
+4. The `ClassificationTaskManager` class is invoked, which is used for extracting the different classification
+   tasks that are specified in the configuration file. This is done by calling the `TrainingProcesses` class,
+   which reads the configuration file and extracts the available training processes into a list. Each item of
+   the list is a Python dictionary that describes the evaluation that will take place: a) the classifier used,
+   b) the preprocessing steps (feature selection, scaling type, etc.) and the k-fold cross-validation (number
+   of folds), and c) the parameter combinations that a Grid Search algorithm will use to find the best model
+   that will be assigned to the classifier.
+
+5. For each evaluation, the `ClassificationTask` class is used. The class loads the list of process
+   dictionaries with their corresponding training steps as described above, the features with their labels,
+   as well as the classifier specified in the configuration file that will be used for training the model.
+
+6. The whole specified classification task (i.e. the preprocessing, the training of the model for the selected
+   features, and the evaluation) takes place inside the `ClassificationTask` class. The `TrainGridClassifier`
+   is responsible for training the classifier using a Grid Search algorithm which, in our case, loads a
+   Support Vector Machines model from sklearn with a grid of parameters (see the sketch after this list).
+
+7. For each preprocessing step, the `Transform` class is responsible for the appropriate preprocessing, such
+   as data cleaning, feature selection, enumeration, and scaling, where applicable. For each preprocessing
+   step, the corresponding transformation pipeline (in `.pkl` format) is extracted and saved locally for later
+   use in the predictions mode.
+
+8. The transformed features data is then loaded into the `train_grid` function, where the training of the
+   model takes place. The results of the training phase are extracted by the `save_grid_results` function.
+   Such results are the best parameters of each training step, as well as the best model from that step,
+   which is saved locally in `.pkl` format. Finally, the best models extracted from each training process are
+   compared and the best one is chosen. The information about the best model's parameters, together with the
+   preprocessing step that was followed, is exported and saved in a `.json` file locally, and includes:
+   * the best model's score, its parameters, the preprocessing (data cleaning, feature selection, enumeration,
+     scaling), and the number of folds that the dataset was split into through the cross-validation training
+     procedure.
+
+9. The `evaluation` function is used to evaluate the best model, and the relevant reports are exported. The
+   best model and the corresponding preprocessing pipeline are loaded, and a k-fold cross-validation training
+   takes place. The results from this process are:
+   * A `.yaml` file that contains the tracks' instances and the fold in which each one was classified.
+   * A `.csv` file that includes the tracks, the prediction that took place in the relevant fold, the true
+     label, and the probability of the classifier's decision function for each class prediction.
+   * A plot that depicts the accuracy score delivered by each fold's training.
+   * A `.txt` file that contains detailed information about each fold's training score, the *mean* of all the
+     accuracies exported from each fold, as well as the *standard deviation* of these accuracies.
+   * The `.txt` files that contain the confusion matrix and the classification report of the cross-validation
+     training.
+
+10. Finally, the `evaluation` function executes a training on the whole dataset using the best model that is
+    extracted from the grid search algorithm. After applying predictions to the whole dataset, the related
+    `.txt` files with the confusion matrix and the classification report are exported and saved locally to
+    disk. The trained model from this training phase is saved locally in `.pkl` format for later use by the
+    predictions mode of the tool.
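+The Grid Search core of steps 6 to 8 can be pictured with a minimal sketch. It assumes an
+SVM estimator, as in this tool, and an illustrative parameter grid; the real grid values
+come from the project configuration file:
+
+```python
+# A minimal sketch of the Grid Search training described above; the grid
+# values below are illustrative assumptions, not the configuration defaults.
+from sklearn.model_selection import GridSearchCV, KFold
+from sklearn.svm import SVC
+
+def sketch_train_grid(features_prepared, y, n_fold=5):
+    # the estimator the tool uses for SVM training
+    grid_clf = SVC(gamma="auto", probability=True)
+    parameters_grid = {"kernel": ["rbf", "poly"],            # assumed values
+                       "C": [1, 10, 100],                    # assumed values
+                       "gamma": [0.001, 0.01],               # assumed values
+                       "class_weight": [None, "balanced"]}   # assumed values
+    # inner K-Fold cross-validation, as in the training step
+    inner_cv = KFold(n_splits=n_fold, shuffle=True, random_state=1)
+    gsvc = GridSearchCV(estimator=grid_clf, param_grid=parameters_grid,
+                        cv=inner_cv, n_jobs=-1)
+    gsvc.fit(features_prepared, y)
+    # the best estimator is what gets serialized to .pkl in the real tool
+    return gsvc.best_score_, gsvc.best_params_, gsvc.best_estimator_
+```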
+
+
+### Predict
+
+The `model.predict.py` script contains the `prediction` function. This function can be invoked by
+importing it in a separate script and calling it with its corresponding parameters. The project
+`.yaml` file with the project's configuration metadata is a required parameter of the function, as
+well as the **MBID** of the track whose class is to be predicted by the trained model. The MBID is
+the MusicBrainz ID, which is the track's unique identifier stored in the MusicBrainz and
+AcousticBrainz databases. For example, the following link:
+* https://acousticbrainz.org/232b8e6e-0aa5-4310-8df3-583047af3126
+has the MBID: `232b8e6e-0aa5-4310-8df3-583047af3126`
+
+This is the only necessary information for the related argument of the `prediction` function to
+make the relevant classification.
+
+```
+$ python predict.py --help
+usage: predict.py [-h] [--path] [--file] [--track] [--logging]
+
+positional arguments:
+path     Path where the project file (.yaml) is stored (required).
+
+file     Name of the project configuration file (.yaml) that
+         is to be loaded. (required)
+         The .yaml extension at the end of the file name is not
+         necessary; just give the name of the file.
+
+track    MBID of the low-level data from the AcousticBrainz API.
+         (required)
+
+optional arguments:
+
+logging  The logging level (int) that will be printed (0: DEBUG, 1: INFO,
+         2: WARNING, 3: ERROR, 4: CRITICAL). Only the prescribed integer
+         values (0, 1, 2, 3, 4) can be set.
+```
+
+### How the Predictions mode works
+
+The function and the class that are used in this phase are `prediction` and `Predict` respectively;
+a short invocation sketch follows the list below. The steps that are followed in this mode are:
+
+1. The `prediction` function loads the project configuration file that was created by the training of
+   the corresponding model. This `.yaml` file includes all the relevant information about the paths
+   where the trained model and the preprocessing pipelines were saved (in `.pkl` format).
+
+2. Then, using the MBID that was passed as an argument, it downloads the low-level data from the
+   AcousticBrainz API, using the `requests` library.
+
+3. The data, which is in JSON format, is then loaded into the `Predict` class, together with the built
+   model's configuration data (training results' location, etc.).
+
+4. `Predict` loads the best model's JSON file that was saved in the training mode, and checks which
+   preprocessing step resulted in the best model.
+
+5. Based on the preprocessing step that is specified inside the best model's metadata, the
+   `TransformPredictions` class is invoked and does the necessary data transformation by loading the
+   corresponding preprocessing pipeline that was saved in `.pkl` format during the training mode.
+
+6. After that, it loads the best trained model that was saved in `.pkl` format.
+
+7. It makes the prediction.
+
+8. It returns a dictionary that includes:
+   * the predicted class
+   * the score of the predicted class
+   * the probabilities for each class that the model used to decide which class the track belongs to.
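+As promised above, a minimal invocation sketch. The keyword names mirror the arguments
+documented here (`path`, `file`, `track`, `logging`) and are assumptions, since the exact
+function signature may differ between patches in this series:
+
+```python
+# A minimal sketch, assuming the prediction function accepts the documented
+# arguments; the names and paths below are illustrative, not confirmed defaults.
+from model.predict import prediction
+
+result = prediction(
+    path="exports",                                # hypothetical project path
+    file="project_danceability",                   # project .yaml, extension optional
+    track="232b8e6e-0aa5-4310-8df3-583047af3126",  # MBID from the example above
+    logging=1,                                     # INFO level
+)
+print(result)  # predicted class, its score, and the per-class probabilities
+```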
+
+
+
diff --git a/models/sklearn/__init__.py b/models/sklearn/__init__.py
index 7c68785e9..40a96afc6 100644
--- a/models/sklearn/__init__.py
+++ b/models/sklearn/__init__.py
@@ -1 +1 @@
-# -*- coding: utf-8 -*- \ No newline at end of file
+# -*- coding: utf-8 -*-
diff --git a/models/sklearn/classification/__init__.py b/models/sklearn/classification/__init__.py
index 7c68785e9..40a96afc6 100644
--- a/models/sklearn/classification/__init__.py
+++ b/models/sklearn/classification/__init__.py
@@ -1 +1 @@
-# -*- coding: utf-8 -*- \ No newline at end of file
+# -*- coding: utf-8 -*-
diff --git a/models/sklearn/classification/classification_task.py b/models/sklearn/classification/classification_task.py
index 229860878..660f3e87c 100644
--- a/models/sklearn/classification/classification_task.py
+++ b/models/sklearn/classification/classification_task.py
@@ -1,14 +1,35 @@
 import os
-from classification.classifierGRID import TrainGridClassifier
 import json
-from termcolor import colored
-from classification.classifierBASIC import TrainClassifier
-from classification.evaluation import fold_evaluation
-from logging_tool import LoggerSetup
+from ..classification.classifier_grid import TrainGridClassifier
+from ..classification.evaluation import evaluation
+from ..helper_functions.logging_tool import LoggerSetup
 
 
 class ClassificationTask:
+    """
+    This class is the core of the model classification. It loads the relevant classifier to
+    be used for training, the features, the labels, and the tracks. It uses the class that is
+    declared in the configuration file to train the model, and then it uses that model
+    for evaluation.
+    """
     def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks, log_level):
+        """
+        Args:
+            config: The configuration data that contains the settings from the configuration
+                template with the parsed arguments in classification project.
+            classifier: The classifier name (e.g. svm) that is declared in the classifiers
+                list of the configuration data.
+            train_class: The class name that is defined in the groundtruth yaml file. It is
+                actually the model that will be trained.
+            training_processes: The training processes (list) where each item of the list
+                contains the set of parameters that will be used in the classifier:
+                (Evaluation, classifier, preprocess, kernel, C, gamma, balanceClasses, n_fold)
+            X: The features (pandas DataFrame) of the exported data from the DatasetExporter class
+            y: The labels (NumPy array) of the target class
+            exports_path: Path where the classification project's results will be stored.
+            tracks: The tracks (numpy.ndarray) that are exported from the Groundtruth file.
+            log_level: The logging level (0-4).
+ """ self.config = config self.classifier = classifier self.train_class = train_class @@ -24,10 +45,9 @@ def __init__(self, config, classifier, train_class, training_processes, X, y, ex self.setting_logger() def setting_logger(self): - # set up logger self.logger = LoggerSetup(config=self.config, exports_path=self.exports_path, - name="train_class_{}".format(self.train_class), + name="train_model_{}".format(self.train_class), train_class=self.train_class, mode="a", level=self.log_level).setup_logger() @@ -47,30 +67,25 @@ def run(self): ) grid_svm_train.train_grid_search_clf() grid_svm_train.export_best_classifier() - elif self.classifier == "NN": - self.logger.info("Train Classifier: Neural Networks") - pass + else: + self.logger.error("Use a valid classifier in the configuration file.") + self.logger.info("Training the classifier is completed successfully.") - self.logger.info("Training is completed successfully..") - - # load best model - self.logger.info("Loading Best Model..") - exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + # load best model to check its parameters + self.logger.debug("Loading the Best Model..") + exports_dir = self.config.get("exports_directory") best_model_name = "best_model_{}.json".format(self.train_class) with open(os.path.join(self.exports_path, exports_dir, best_model_name)) as best_model_file: best_model = json.load(best_model_file) - print(colored("BEST MODEL:", "cyan")) - print(best_model) - self.logger.info("Best Model loaded successfully.") + self.logger.debug("BEST MODEL: {}".format(best_model)) - # clf_model = TrainClassifier(classifier=self.classifier, params=best_model["params"]).model() - print("Best model loaded..") - fold_evaluation(config=self.config, - n_fold=best_model["n_fold"], - X=self.X, y=self.y, - class_name=self.train_class, - tracks=self.tracks, - process=best_model["preprocessing"], - exports_path=self.exports_path, - log_level=self.log_level - ) + # evaluation + evaluation(config=self.config, + n_fold=best_model["n_fold"], + X=self.X, y=self.y, + class_name=self.train_class, + tracks=self.tracks, + process=best_model["preprocessing"], + exports_path=self.exports_path, + log_level=self.log_level + ) diff --git a/models/sklearn/classification/classification_task_manager.py b/models/sklearn/classification/classification_task_manager.py index b64a84025..935bce944 100644 --- a/models/sklearn/classification/classification_task_manager.py +++ b/models/sklearn/classification/classification_task_manager.py @@ -1,27 +1,33 @@ import os from time import time from termcolor import colored -from utils import load_yaml, FindCreateDirectory, TrainingProcesses -from classification.classification_task import ClassificationTask from datetime import datetime -from logging_tool import LoggerSetup +from ..helper_functions.utils import FindCreateDirectory, TrainingProcesses +from ..classification.classification_task import ClassificationTask +from ..helper_functions.logging_tool import LoggerSetup -validClassifiers = ['NN', 'svm'] -validEvaluations = ['nfoldcrossvalidation'] + +validClassifiers = ["svm", "NN"] +validEvaluations = ["nfoldcrossvalidation"] class ClassificationTaskManager: """ - + It manages the tasks to be done based on the configuration file. It checks if the + config keys exist in the template and are specified correctly, as well as it creates + the relevant directories (if not exist) where the classification results will be + stored to. 
Then, it extracts a list with the evaluation steps that will be followed + with their corresponding preprocessing steps and parameters declaration for the + classifier, and executes the classification task for each step. """ def __init__(self, config, train_class, X, y, tracks, exports_path, log_level): """ - - :param yaml_file: The configuration file name - :param train_class: The class that will be trained - :param X: The already shuffled data that contain the features - :param y: The already shuffled data that contain the labels + Args: + config: The configuration file name. + train_class: The class that will be trained. + X: The already shuffled data that contain the features. + y: The already shuffled data that contain the labels. """ self.config = config self.train_class = train_class @@ -48,18 +54,17 @@ def __init__(self, config, train_class, X, y, tracks, exports_path, log_level): def setting_logger(self): self.logger = LoggerSetup(config=self.config, exports_path=self.exports_path, - name="train_class_{}".format(self.train_class), + name="train_model_{}".format(self.train_class), train_class=self.train_class, mode="a", level=self.log_level).setup_logger() def files_existence(self): """ - Ensure that all the folders will exist before the training process starts - :return: + Ensure that all the folders will exist before the training process starts. """ # main exports - self.exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + self.exports_dir = self.config.get("exports_directory") # train results exports self.results_path = FindCreateDirectory(self.exports_path, os.path.join(self.exports_dir, "results")).inspect_directory() @@ -83,27 +88,33 @@ def files_existence(self): os.path.join(self.exports_dir, "reports")).inspect_directory() def config_file_analysis(self): + """ + Check the keys of the configuration template file if they are set up correctly. + """ self.logger.info("---- CHECK FOR INAPPROPRIATE CONFIG FILE FORMAT ----") - if 'processing' not in self.config: - self.logger.error('No preprocessing defined in config.') + if "processing" not in self.config: + self.logger.error("No preprocessing defined in config.") - if 'evaluations' not in self.config: - self.logger.error('No evaluations defined in config.') - self.logger.error('Setting default evaluation to 10-fold cross-validation') - self.config['evaluations'] = {'nfoldcrossvalidation': [{'nfold': [10]}]} + if "evaluations" not in self.config: + self.logger.error("No evaluations defined in config.") + self.logger.error("Setting default evaluation to 10-fold cross-validation") + self.config["evaluations"] = {"nfoldcrossvalidation": [{"nfold": [10]}]} for classifier in self.config['classifiers'].keys(): if classifier not in validClassifiers: - self.logger.error('Not a valid classifier: {}'.format(classifier)) - raise ValueError('The classifier name must be valid.') + self.logger.error("Not a valid classifier: {}".format(classifier)) + raise ValueError("The classifier name must be valid.") for evaluation in self.config['evaluations'].keys(): if evaluation not in validEvaluations: - self.logger.error('Not a valid evaluation: {}'.format(evaluation)) + self.logger.error("Not a valid evaluation: {}".format(evaluation)) raise ValueError("The evaluation must be valid.") self.logger.info("No errors in config file format found.") def apply_processing(self): + """ + Evaluation steps extraction and classification task execution for each step. 
+ """ start_time = time() training_processes = TrainingProcesses(self.config).training_processes() self.logger.info("Classifiers detected: {}".format(self.config["classifiers"].keys())) diff --git a/models/sklearn/classification/classifierGRID.py b/models/sklearn/classification/classifierGRID.py deleted file mode 100644 index b8556a75f..000000000 --- a/models/sklearn/classification/classifierGRID.py +++ /dev/null @@ -1,153 +0,0 @@ -import os -import json -import math -from pprint import pprint -from termcolor import colored -import joblib -from sklearn.model_selection import GridSearchCV -from sklearn.svm import SVC -from sklearn.model_selection import KFold - -from transformation.transform import Transform -from utils import load_yaml, FindCreateDirectory, TrainingProcesses -from logging_tool import LoggerSetup - - -class TrainGridClassifier: - def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, log_level): - self.config = config - self.classifier = classifier - self.class_name = class_name - self.X = X - self.y = y - self.tr_processes = tr_processes - self.exports_path = exports_path - self.log_level = log_level - - self.logger = "" - self.best_models_list = [] - # self.train_grid_search_clf() - - self.setting_logger() - - def setting_logger(self): - # set up logger - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="train_class_{}".format(self.class_name), - train_class=self.class_name, - mode="a", - level=self.log_level).setup_logger() - - def train_grid_search_clf(self): - process_counter = 1 - for tr_process in self.tr_processes: - print(colored("Train process {} - {}".format(process_counter, tr_process), "green")) - self.logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process)) - # initiate SVM classifier object - if self.classifier == "svm": - grid_clf = SVC(gamma="auto", probability=True) - # TODO: different classifier object (e.g. 
random forests, knn, etc) can be initiated here - else: - raise ValueError('The classifier name must be valid.') - - print("CLASSIFIER", tr_process["classifier"]) - # transformation of the data - features_prepared = Transform(config=self.config, - df_feats=self.X, - process=tr_process["preprocess"], - train_class=self.class_name, - exports_path=self.exports_path, - log_level=self.log_level).post_processing() - - # define the length of parameters - parameters_grid = {'kernel': tr_process["kernel"], - 'C': tr_process["C"], - 'gamma': tr_process["gamma"], - 'class_weight': tr_process["balanceClasses"] - } - - # inner with K-Fold cross-validation declaration - random_seed = None - shuffle = self.config["k_fold_shuffle"] - if shuffle is True: - random_seed = self.config["seed"] - elif shuffle is False: - random_seed = None - self.logger.info("Fitting the data to the classifier with K-Fold cross-validation..") - inner_cv = KFold(n_splits=tr_process["n_fold"], - shuffle=shuffle, - random_state=random_seed - ) - # initiate GridSearch Object - gsvc = GridSearchCV(estimator=grid_clf, - param_grid=parameters_grid, - cv=inner_cv, - n_jobs=self.config["parallel_jobs"], - verbose=self.config["verbose"] - ) - - self.logger.debug("Shape of X before train: {}".format(features_prepared.shape)) - self.logger.info("Fitting the data to the model..") - gsvc.fit(features_prepared, self.y) - - # print(gsvc.cv_results_["params"]) - self.logger.info("Results from each best preprocess training:") - self.logger.info("a) Best score: {}".format(gsvc.best_score_)) - self.logger.info("b) Best estimator: {}".format(gsvc.best_estimator_)) - self.logger.info("c) Best parameters: {}".format(gsvc.best_params_)) - self.logger.info("Counted evaluations in this GridSearch process: {}".format(len(gsvc.cv_results_["params"]))) - - # save best results for each train process - exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.class_name) - results_path = FindCreateDirectory(self.exports_path, - os.path.join(exports_dir, "results")).inspect_directory() - results_best_dict_name = "result_{}_{}_best_{}.json"\ - .format(self.class_name, tr_process["preprocess"], gsvc.best_score_) - - results_dict = dict() - results_dict["score"] = gsvc.best_score_ - results_dict["params"] = gsvc.best_params_ - results_dict["n_fold"] = tr_process['n_fold'] - results_dict["preprocessing"] = tr_process["preprocess"] - with open(os.path.join(results_path, results_best_dict_name), 'w') as grid_best_json: - json.dump(results_dict, grid_best_json, indent=4) - - # export parameters that the - results_params_dict_name = "result_{}_{}_params_{}.json"\ - .format(self.class_name, tr_process["preprocess"], gsvc.best_score_) - with open(os.path.join(results_path, results_params_dict_name), 'w') as grid_params_json: - json.dump(gsvc.cv_results_["params"], grid_params_json, indent=0) - - models_path = FindCreateDirectory(self.exports_path, - os.path.join(exports_dir, "models")).inspect_directory() - best_process_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(tr_process["preprocess"])) - joblib.dump(gsvc.best_estimator_, best_process_model_path) - self.logger.info("Grid Best model for the {} process saved.".format(tr_process["preprocess"])) - - # return a list that includes the best models exported from each processing - self.best_models_list.append(results_dict) - - print(colored("Next train process..", "yellow")) - process_counter += 1 - print() - print() - print(colored("Finishing training processes..", "blue")) - print() - - 
def export_best_classifier(self): - # gather best scores from the exported grid clf models - scores = [x["score"] for x in self.best_models_list] - self.logger.info("This is the max score of all the training processes: {}".format(max(scores))) - for model in self.best_models_list: - if model["score"] == max(scores): - self.logger.info("Best {} model parameters:".format(self.class_name)) - # log2 --> convert values to initial parameters' values - # model["params"]["C"] = math.log2(model["params"]["C"]) - # model["params"]["gamma"] = math.log2(model["params"]["gamma"]) - self.logger.info("{}".format(model)) - best_model_name = "best_model_{}.json".format(self.class_name) - exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.class_name) - with open(os.path.join(self.exports_path, exports_dir, best_model_name), "w") as best_model: - json.dump(model, best_model, indent=4) - self.logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) diff --git a/models/sklearn/classification/classifierBASIC.py b/models/sklearn/classification/classifier_basic.py similarity index 53% rename from models/sklearn/classification/classifierBASIC.py rename to models/sklearn/classification/classifier_basic.py index 1edeee76f..86e7b9b24 100644 --- a/models/sklearn/classification/classifierBASIC.py +++ b/models/sklearn/classification/classifier_basic.py @@ -2,14 +2,28 @@ class TrainClassifier: + """ + This class initiates a simple classifier. It is used for initiating a simple model from + sklearn or other APIs in the future, like TensorFlow. + TODO: Initiating other models from sklearn (e.g. Random Forests, Decision Tree, etc.) + """ def __init__(self, classifier, params): + """ + Args: + classifier: the classifier name (str) to be set. A string that is among the valid + classifiers list. + params: The parameters of the classifier (dictionary). 
+ + Returns: + The model object that is initiated (including its set of parameters) + """ self.classifier = classifier self.params = params def model(self): - validClassifiers = ['NN', 'svm'] + validClassifiers = ["NN", "svm", "rf"] if self.classifier not in validClassifiers: - raise ValueError('The classifier name must be valid.') + raise ValueError("The classifier name must be valid.") if self.classifier == "svm": param_C = self.params["C"] diff --git a/models/sklearn/classification/classifier_grid.py b/models/sklearn/classification/classifier_grid.py new file mode 100644 index 000000000..3821533df --- /dev/null +++ b/models/sklearn/classification/classifier_grid.py @@ -0,0 +1,177 @@ +import os +import json +from termcolor import colored +import joblib +from sklearn.model_selection import GridSearchCV +from sklearn.svm import SVC +from sklearn.model_selection import KFold + +from ..transformation.transform import Transform +from ..helper_functions.utils import FindCreateDirectory +from ..helper_functions.logging_tool import LoggerSetup + + +class TrainGridClassifier: + def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, log_level): + self.config = config + self.classifier = classifier + self.class_name = class_name + self.X = X + self.y = y + self.tr_processes = tr_processes + self.exports_path = exports_path + self.log_level = log_level + + self.logger = "" + self.best_models_list = [] + # self.train_grid_search_clf() + + self.setting_logger() + + def setting_logger(self): + self.logger = LoggerSetup(config=self.config, + exports_path=self.exports_path, + name="train_model_{}".format(self.class_name), + train_class=self.class_name, + mode="a", + level=self.log_level).setup_logger() + + def train_grid_search_clf(self): + process_counter = 1 + for tr_process in self.tr_processes: + print(colored("Train process {} - {}".format(process_counter, tr_process), "green")) + self.logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process)) + # initiate SVM classifier object + if self.classifier == "svm": + grid_clf = SVC(gamma="auto", probability=True) + # TODO: different classifier object (e.g. 
random forests, knn, etc) can be initiated here + else: + raise ValueError('The classifier name must be valid.') + + print("CLASSIFIER", tr_process["classifier"]) + # transformation of the data + features_prepared = Transform(config=self.config, + df_feats=self.X, + process=tr_process["preprocess"], + train_class=self.class_name, + exports_path=self.exports_path, + log_level=self.log_level).post_processing() + + # train the grid classifier and return the trained model + gsvc = train_grid(tr_process=tr_process, + grid_clf=grid_clf, + features_prepared=features_prepared, + y=self.y, + config=self.config, + logger=self.logger) + + # save best results for each train process + exports_dir = self.config.get("exports_directory") + # paths declaration for saving the grid training results + results_path = FindCreateDirectory(self.exports_path, + os.path.join(exports_dir, "results")).inspect_directory() + models_path = FindCreateDirectory(self.exports_path, + os.path.join(exports_dir, "models")).inspect_directory() + best_process_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(tr_process["preprocess"])) + + # save the results from each train process step and return the results from that train in a dictionary + # that contains: the best score, the best params, the number of folds, and the preprocessing step + results_dict = save_grid_results(gsvc=gsvc, + class_name=self.class_name, + tr_process=tr_process, + results_path=results_path, + best_process_model_path=best_process_model_path, + logger=self.logger) + + # return a list that includes the best models exported from each processing + self.best_models_list.append(results_dict) + + print(colored("Next train process..", "yellow")) + process_counter += 1 + print() + print() + print(colored("Finishing training processes..", "blue")) + print() + + def export_best_classifier(self): + # Gather the best scores from the exported grid clf models + scores = [x["score"] for x in self.best_models_list] + self.logger.info("This is the max score of all the training processes: {}".format(max(scores))) + for model in self.best_models_list: + if model["score"] == max(scores): + self.logger.info("Best {} model parameters:".format(self.class_name)) + # log2 --> convert values to initial parameters' values + # model["params"]["C"] = math.log2(model["params"]["C"]) + # model["params"]["gamma"] = math.log2(model["params"]["gamma"]) + self.logger.info("{}".format(model)) + best_model_name = "best_model_{}.json".format(self.class_name) + exports_dir = self.config.get("exports_directory") + with open(os.path.join(self.exports_path, exports_dir, best_model_name), "w") as best_model: + json.dump(model, best_model, indent=4) + self.logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) + + +def train_grid(tr_process, grid_clf, features_prepared, y, config, logger): + # define the length of parameters + parameters_grid = {'kernel': tr_process["kernel"], + 'C': tr_process["C"], + 'gamma': tr_process["gamma"], + 'class_weight': tr_process["balance_classes"] + } + + # inner with K-Fold cross-validation declaration + random_seed = None + shuffle = config["k_fold_shuffle"] + if shuffle is True: + random_seed = config["seed"] + elif shuffle is False: + random_seed = None + logger.info("Fitting the data to the classifier with K-Fold cross-validation..") + inner_cv = KFold(n_splits=tr_process["n_fold"], + shuffle=shuffle, + random_state=random_seed + ) + # initiate GridSearch Object + gsvc = GridSearchCV(estimator=grid_clf, + 
param_grid=parameters_grid, + cv=inner_cv, + n_jobs=config["parallel_jobs"], + verbose=config["verbose"] + ) + + logger.debug("Shape of X before train: {}".format(features_prepared.shape)) + logger.info("Fitting the data to the model..") + gsvc.fit(features_prepared, y) + + logger.info("Results from each best preprocess training:") + logger.info("a) Best score: {}".format(gsvc.best_score_)) + logger.info("b) Best estimator: {}".format(gsvc.best_estimator_)) + logger.info("c) Best parameters: {}".format(gsvc.best_params_)) + logger.info("Counted evaluations in this GridSearch process: {}".format(len(gsvc.cv_results_["params"]))) + + return gsvc + + +def save_grid_results(gsvc, class_name, tr_process, results_path, best_process_model_path, logger): + results_best_dict_name = "result_{}_{}_best_{}.json" \ + .format(class_name, tr_process["preprocess"], gsvc.best_score_) + + results_dict = { + "score": gsvc.best_score_, + "params": gsvc.best_params_, + "n_fold": tr_process['n_fold'], + "preprocessing": tr_process["preprocess"] + } + with open(os.path.join(results_path, results_best_dict_name), 'w') as grid_best_json: + json.dump(results_dict, grid_best_json, indent=4) + + # export the parameters that the best model has from each training step + results_params_dict_name = "result_{}_{}_params_{}.json" \ + .format(class_name, tr_process["preprocess"], gsvc.best_score_) + with open(os.path.join(results_path, results_params_dict_name), 'w') as grid_params_json: + json.dump(gsvc.cv_results_["params"], grid_params_json, indent=0) + + joblib.dump(gsvc.best_estimator_, best_process_model_path) + logger.info("Grid Best model for the {} process saved.".format(tr_process["preprocess"])) + + return results_dict diff --git a/models/sklearn/classification/confusion_matrix.py b/models/sklearn/classification/confusion_matrix.py deleted file mode 100644 index a7af4f4ae..000000000 --- a/models/sklearn/classification/confusion_matrix.py +++ /dev/null @@ -1,20 +0,0 @@ -class ConfusionMatrix: - - def __init__(self, matrix, classes): - self.matrix = matrix - self.classes = classes - - def toHtml(self): - html = '' - html += '' - html += '' - html += '' - html += '' - html += '' - html += ' + ); From d4d31ef4d5f80019e13825257b586e3e9ea9a75f Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 Jul 2021 15:47:03 +0530 Subject: [PATCH 42/64] Try fixing react styling error --- webserver/static/scripts/datasets/eval-jobs-viewer.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webserver/static/scripts/datasets/eval-jobs-viewer.js b/webserver/static/scripts/datasets/eval-jobs-viewer.js index 03411d5c8..574a4b6f3 100644 --- a/webserver/static/scripts/datasets/eval-jobs-viewer.js +++ b/webserver/static/scripts/datasets/eval-jobs-viewer.js @@ -285,7 +285,7 @@ class JobRow extends React.Component { - + ); From d4982bb3bd6703902cfeb9fbfe12ebd13f473d68 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 Jul 2021 15:51:52 +0530 Subject: [PATCH 43/64] Add missing table header --- webserver/static/scripts/datasets/eval-jobs-viewer.js | 1 + 1 file changed, 1 insertion(+) diff --git a/webserver/static/scripts/datasets/eval-jobs-viewer.js b/webserver/static/scripts/datasets/eval-jobs-viewer.js index 574a4b6f3..2932ee61a 100644 --- a/webserver/static/scripts/datasets/eval-jobs-viewer.js +++ b/webserver/static/scripts/datasets/eval-jobs-viewer.js @@ -207,6 +207,7 @@ class JobList extends React.Component { + From 680b509bd026370c7be05ec85065f28e3d19fd35 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 
Jul 2021 19:29:13 +0530 Subject: [PATCH 44/64] Use eval_location as ground truth directory --- .../sklearn/model/classification_project.py | 15 +++-------- .../transformation/load_ground_truth.py | 27 ------------------- dataset_eval/evaluate.py | 2 +- 3 files changed, 5 insertions(+), 39 deletions(-) diff --git a/acousticbrainz/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py index 340b077e6..1bbb3a5b9 100644 --- a/acousticbrainz/models/sklearn/model/classification_project.py +++ b/acousticbrainz/models/sklearn/model/classification_project.py @@ -2,7 +2,6 @@ import argparse from ..helper_functions.utils import load_yaml import time -from ..transformation.load_ground_truth import ListGroundTruthFiles from ..classification.train_class import train_class @@ -67,15 +66,10 @@ def create_classification_project(ground_truth_directory, project_file=None, exp print() print() print("-------------------------------------------------------") - # print("AFTER:") - # pprint(project_template) - gt_files_list = ListGroundTruthFiles(project_template).list_gt_filenames() - print("List GroundTruth yaml files found:") - print(gt_files_list) - print("LOAD GROUND TRUTH") - for gt_file in gt_files_list: - train_class(project_template, gt_file, exports_directory, c_values, gamma_values, preprocessing_values, logging) + ground_truth_file = os.path.join(ground_truth_directory, "groundtruth.yaml") + print("Loading GroundTruth yaml file:", ground_truth_file) + train_class(project_template, ground_truth_file, exports_directory, c_values, gamma_values, preprocessing_values, logging) if __name__ == '__main__': @@ -88,8 +82,7 @@ def create_classification_project(ground_truth_directory, project_file=None, exp parser.add_argument("-g", "--groundtruth", dest="ground_truth_directory", - default="datasets", - help="Path of the main dataset directory containing the groundtruth file/s.", + help="Path of the dataset directory containing the groundtruth file/s.", required=True) parser.add_argument("-f", "--file", diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index c26b596d1..3ab179f45 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -9,33 +9,6 @@ from ..helper_functions.logging_tool import LoggerSetup -class ListGroundTruthFiles: - """ - Lists the groundtruth yaml files that are detected in a folder specified in - the configuration file. The yaml files contain the target class and the tracks - to be analyzed. - """ - def __init__(self, config): - """ - Args: - config: The configuration data - """ - self.config = config - self.dataset_dir = "" - - def list_gt_filenames(self): - """ - Returns: - A list of the groundtruth detected yaml files. 
- """ - self.dataset_dir = self.config.get("ground_truth_directory") - ground_truth_list = list() - dirpath = os.path.join(os.getcwd(), self.dataset_dir) - for (dirpath, dirnames, filenames) in os.walk(dirpath): - ground_truth_list += [os.path.join(dirpath, file) for file in filenames if file.startswith("groundtruth")] - return ground_truth_list - - class GroundTruthLoad: """ The Ground Truth data which contains the tracks and the corresponding diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index eed1f0c42..94f57190b 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -126,7 +126,7 @@ def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, stora def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job): - create_classification_project(ground_truth_directory=dataset_dir, + create_classification_project(ground_truth_directory=eval_location, project_file=eval_job["id"], exports_directory=eval_job["id"], exports_path=eval_location, From 54e61e90dbcf62940a0c1a46fb0d9fd92cc51394 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 Jul 2021 20:12:00 +0530 Subject: [PATCH 45/64] Try passing eval_job_id separately --- acousticbrainz/models/sklearn/model/classification_project.py | 4 ++-- dataset_eval/evaluate.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/acousticbrainz/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py index 1bbb3a5b9..204c11ac4 100644 --- a/acousticbrainz/models/sklearn/model/classification_project.py +++ b/acousticbrainz/models/sklearn/model/classification_project.py @@ -5,7 +5,7 @@ from ..classification.train_class import train_class -def create_classification_project(ground_truth_directory, project_file=None, exports_directory=None, exports_path=None, +def create_classification_project(ground_truth_directory, eval_job_id=None, project_file=None, exports_directory=None, exports_path=None, c_values=None, gamma_values=None, preprocessing_values=None, seed=None, jobs=-1, verbose=1, logging="logging.INFO"): """ @@ -67,7 +67,7 @@ def create_classification_project(ground_truth_directory, project_file=None, exp print() print("-------------------------------------------------------") - ground_truth_file = os.path.join(ground_truth_directory, "groundtruth.yaml") + ground_truth_file = os.path.join(ground_truth_directory, eval_job_id, "groundtruth.yaml") print("Loading GroundTruth yaml file:", ground_truth_file) train_class(project_template, ground_truth_file, exports_directory, c_values, gamma_values, preprocessing_values, logging) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 94f57190b..070eddab3 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -126,7 +126,8 @@ def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, stora def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job): - create_classification_project(ground_truth_directory=eval_location, + create_classification_project(ground_truth_directory=dataset_dir, + eval_job_id=eval_job["id"], project_file=eval_job["id"], exports_directory=eval_job["id"], exports_path=eval_location, From 836d2eab0a57b71bf7d0fa3dd10ce231bc8124c1 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 Jul 2021 22:03:46 +0530 Subject: [PATCH 46/64] Separate datasets dir from groundtruth file path --- .../sklearn/model/classification_project.py | 24 ++++++++++++------- 
.../transformation/load_ground_truth.py | 5 +--- dataset_eval/evaluate.py | 7 +++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/acousticbrainz/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py index 204c11ac4..5cc45a851 100644 --- a/acousticbrainz/models/sklearn/model/classification_project.py +++ b/acousticbrainz/models/sklearn/model/classification_project.py @@ -5,13 +5,13 @@ from ..classification.train_class import train_class -def create_classification_project(ground_truth_directory, eval_job_id=None, project_file=None, exports_directory=None, exports_path=None, +def create_classification_project(ground_truth_file, dataset_dir, project_file=None, exports_directory=None, exports_path=None, c_values=None, gamma_values=None, preprocessing_values=None, seed=None, jobs=-1, verbose=1, logging="logging.INFO"): """ Args: - ground_truth_directory: The path (str) to the dataset directory where the - groundtruth yaml file is located. It is required. + ground_truth_file: The path (str) to the groundtruth yaml file of the dataset. It is required. + dataset_dir: The path to main datasets_dir containing the .json files. project_file: The name (str) of the project configuration yaml file that will be created. Default: None. If None, the tool will create automatically a project file name in form of "project_CLASS_NAME", @@ -47,7 +47,8 @@ def create_classification_project(ground_truth_directory, eval_job_id=None, proj print("Seed argument: {}".format(seed)) - project_template["ground_truth_directory"] = ground_truth_directory + project_template["ground_truth_file"] = ground_truth_file + project_template["dataset_dir"] = datasets_dir project_template["project_file"] = project_file project_template["logging_level"] = logging project_template["seed"] = seed @@ -67,7 +68,6 @@ def create_classification_project(ground_truth_directory, eval_job_id=None, proj print() print("-------------------------------------------------------") - ground_truth_file = os.path.join(ground_truth_directory, eval_job_id, "groundtruth.yaml") print("Loading GroundTruth yaml file:", ground_truth_file) train_class(project_template, ground_truth_file, exports_directory, c_values, gamma_values, preprocessing_values, logging) @@ -81,8 +81,13 @@ def create_classification_project(ground_truth_directory, eval_job_id=None, proj 'essentia version found on the descriptor files.') parser.add_argument("-g", "--groundtruth", - dest="ground_truth_directory", - help="Path of the dataset directory containing the groundtruth file/s.", + dest="ground_truth_file", + help="Path of the dataset's groundtruth file/s.", + required=True) + + parser.add_argument("-d", "--datasetsdir", + dest="dataset_dir", + help="Path of the main datasets dir containing .json file/s.", required=True) parser.add_argument("-f", "--file", @@ -90,7 +95,7 @@ def create_classification_project(ground_truth_directory, eval_job_id=None, proj help="Name of the project configuration file (.yaml) will be stored. 
If not specified " "it takes automatically the name .") - parser.add_argument("-d", "--exportsdir", + parser.add_argument("-e", "--exportsdir", dest="exports_directory", help="Name of the exports directory that the project's results will be stored.") @@ -122,7 +127,8 @@ def create_classification_project(ground_truth_directory, eval_job_id=None, proj args = parser.parse_args() - create_classification_project(ground_truth_directory=args.ground_truth_directory, + create_classification_project(ground_truth_file=args.ground_truth_file, + dataset_dir=args.dataset_dir, project_file=args.project_file, exports_directory=args.exports_directory, exports_path=args.exports_path, diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index 3ab179f45..dabc181b2 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -164,11 +164,8 @@ def create_df_tracks(self): """ self.logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----") - # the class name from the ground truth data that is the target - self.dataset_dir = self.config.get("ground_truth_directory") - # self.class_dir = self.config.get("class_dir") + self.dataset_dir = self.config.get("dataset_dir") print('DATASET-DIR', self.dataset_dir) - # print('CLASS NAME PATH', self.class_dir) dirpath = os.path.join(os.getcwd(), self.dataset_dir) low_level_list = list() for (dirpath, dirnames, filenames) in os.walk(dirpath): diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 070eddab3..3db9f67ff 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -87,6 +87,7 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): logging.info("Training SKLEARN model...") evaluate_sklearn(options=eval_job["options"], eval_location=eval_location, + ground_truth_file=groundtruth_path, dataset_dir=dataset_dir, storage_dir=storage_dir, eval_job=eval_job) @@ -125,9 +126,9 @@ def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, stora })) -def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job): - create_classification_project(ground_truth_directory=dataset_dir, - eval_job_id=eval_job["id"], +def evaluate_sklearn(options, eval_location, ground_truth_file, dataset_dir, storage_dir, eval_job): + create_classification_project(ground_truth_file=ground_truth_file, + dataset_dir=dataset_dir, project_file=eval_job["id"], exports_directory=eval_job["id"], exports_path=eval_location, From dd390a82e871f95bdbf24beb991c8194f3eec4bd Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 Jul 2021 22:06:31 +0530 Subject: [PATCH 47/64] fix typo --- acousticbrainz/models/sklearn/model/classification_project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acousticbrainz/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py index 5cc45a851..0907436f6 100644 --- a/acousticbrainz/models/sklearn/model/classification_project.py +++ b/acousticbrainz/models/sklearn/model/classification_project.py @@ -48,7 +48,7 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N print("Seed argument: {}".format(seed)) project_template["ground_truth_file"] = ground_truth_file - project_template["dataset_dir"] = datasets_dir + project_template["dataset_dir"] = dataset_dir project_template["project_file"] = project_file 
 project_template["logging_level"] = logging
 project_template["seed"] = seed

From ab4cb530cebb763965b5b6918ca64fc5c77e62ae Mon Sep 17 00:00:00 2001
From: Kartik Ohri
Date: Mon, 5 Jul 2021 22:42:17 +0530
Subject: [PATCH 48/64] Remove redundant exports_dir

The export_path parameter should contain the complete path where the results
should be stored. Everywhere we use exports_dir, we append it to the
export_path. If it's a matter of user convenience, we can add it back to the
command line, but merge it with export_path asap.
---
 .../classification/classification_task.py     |  3 +-
 .../classification_task_manager.py            | 23 ++++---------
 .../sklearn/classification/classifier_grid.py | 10 ++----
 .../sklearn/classification/evaluation.py      | 33 +++++++------------
 .../classification/report_files_export.py     |  3 +-
 .../sklearn/classification/train_class.py     |  9 +----
 .../sklearn/helper_functions/logging_tool.py  |  5 +--
 .../sklearn/model/classification_project.py   | 20 +++--------
 .../models/sklearn/model/predict.py           |  6 +---
 .../transformation/load_ground_truth.py       |  4 +--
 .../sklearn/transformation/transform.py       |  4 +--
 .../transformation/transform_predictions.py   |  4 +--
 dataset_eval/evaluate.py                      |  9 ++---
 13 files changed, 36 insertions(+), 97 deletions(-)

diff --git a/acousticbrainz/models/sklearn/classification/classification_task.py b/acousticbrainz/models/sklearn/classification/classification_task.py
index 660f3e87c..87b845ab0 100644
--- a/acousticbrainz/models/sklearn/classification/classification_task.py
+++ b/acousticbrainz/models/sklearn/classification/classification_task.py
@@ -73,9 +73,8 @@ def run(self):
 
         # load best model to check its parameters
         self.logger.debug("Loading the Best Model..")
-        exports_dir = self.config.get("exports_directory")
         best_model_name = "best_model_{}.json".format(self.train_class)
-        with open(os.path.join(self.exports_path, exports_dir, best_model_name)) as best_model_file:
+        with open(os.path.join(self.exports_path, best_model_name)) as best_model_file:
             best_model = json.load(best_model_file)
         self.logger.debug("BEST MODEL: {}".format(best_model))
 
diff --git a/acousticbrainz/models/sklearn/classification/classification_task_manager.py b/acousticbrainz/models/sklearn/classification/classification_task_manager.py
index 94f78ce02..34067608a 100644
--- a/acousticbrainz/models/sklearn/classification/classification_task_manager.py
+++ b/acousticbrainz/models/sklearn/classification/classification_task_manager.py
@@ -37,7 +37,6 @@ def __init__(self, config, train_class, X, y, tracks, exports_path, log_level):
         self.exports_path = exports_path
         self.log_level = log_level
 
-        self.exports_dir = ""
         self.results_path = ""
         self.logs_path = ""
         self.tracks_path = ""
@@ -64,28 +63,20 @@ def files_existence(self):
         """
         Ensure that all the folders will exist before the training process starts.
""" # main exports - self.exports_dir = self.config.get("exports_directory") # train results exports - self.results_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "results")).inspect_directory() + self.results_path = FindCreateDirectory(self.exports_path, "results").inspect_directory() # logs - self.logs_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "logs")).inspect_directory() + self.logs_path = FindCreateDirectory(self.exports_path, "logs").inspect_directory() # tracks - self.tracks_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "tracks_csv_format")).inspect_directory() + self.tracks_path = FindCreateDirectory(self.exports_path, "tracks_csv_format").inspect_directory() # datasets - self.dataset_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "dataset")).inspect_directory() + self.dataset_path = FindCreateDirectory(self.exports_path, "dataset").inspect_directory() # models - self.models_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "models")).inspect_directory() + self.models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory() # images - self.images_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "images")).inspect_directory() + self.images_path = FindCreateDirectory(self.exports_path, "images").inspect_directory() # reports - self.reports_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "reports")).inspect_directory() + self.reports_path = FindCreateDirectory(self.exports_path, "reports").inspect_directory() def config_file_analysis(self): """ diff --git a/acousticbrainz/models/sklearn/classification/classifier_grid.py b/acousticbrainz/models/sklearn/classification/classifier_grid.py index 3821533df..155d37af6 100644 --- a/acousticbrainz/models/sklearn/classification/classifier_grid.py +++ b/acousticbrainz/models/sklearn/classification/classifier_grid.py @@ -66,12 +66,9 @@ def train_grid_search_clf(self): logger=self.logger) # save best results for each train process - exports_dir = self.config.get("exports_directory") # paths declaration for saving the grid training results - results_path = FindCreateDirectory(self.exports_path, - os.path.join(exports_dir, "results")).inspect_directory() - models_path = FindCreateDirectory(self.exports_path, - os.path.join(exports_dir, "models")).inspect_directory() + results_path = FindCreateDirectory(self.exports_path, "results").inspect_directory() + models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory() best_process_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(tr_process["preprocess"])) # save the results from each train process step and return the results from that train in a dictionary @@ -105,8 +102,7 @@ def export_best_classifier(self): # model["params"]["gamma"] = math.log2(model["params"]["gamma"]) self.logger.info("{}".format(model)) best_model_name = "best_model_{}.json".format(self.class_name) - exports_dir = self.config.get("exports_directory") - with open(os.path.join(self.exports_path, exports_dir, best_model_name), "w") as best_model: + with open(os.path.join(self.exports_path, best_model_name), "w") as best_model: json.dump(model, best_model, indent=4) self.logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) diff --git a/acousticbrainz/models/sklearn/classification/evaluation.py 
b/acousticbrainz/models/sklearn/classification/evaluation.py index 3cc78881c..2a2e7d772 100644 --- a/acousticbrainz/models/sklearn/classification/evaluation.py +++ b/acousticbrainz/models/sklearn/classification/evaluation.py @@ -34,17 +34,13 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, logger.debug("Tracks list length: {}".format(len(tracks))) # load project directory and the corresponding save paths - exports_dir = config.get("exports_directory") - dataset_path = FindCreateDirectory(exports_path, - os.path.join(exports_dir, "dataset")).inspect_directory() - models_path = FindCreateDirectory(exports_path, - os.path.join(exports_dir, "models")).inspect_directory() - images_path = FindCreateDirectory(exports_path, - os.path.join(exports_dir, "images")).inspect_directory() + dataset_path = FindCreateDirectory(exports_path, "dataset").inspect_directory() + models_path = FindCreateDirectory(exports_path, "models").inspect_directory() + images_path = FindCreateDirectory(exports_path, "images").inspect_directory() # load best model params and score data - load_best_model_params_score_path = os.path.join(exports_path, exports_dir, "best_model_{}.json".format(class_name)) + load_best_model_params_score_path = os.path.join(exports_path, "best_model_{}.json".format(class_name)) with open(load_best_model_params_score_path) as model_params_score_file: best_params_score_data = json.load(model_params_score_file) @@ -98,13 +94,12 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, pred_folded_list = df_predictions["predictions"].to_list() # export the matrix dictionary from the folded dataset - folded_results_matrix_path = os.path.join(exports_path, exports_dir) folded_matrix_dict = matrix_creation(classes=clf.classes_, tracks=tracks_folded_list, y_actual=y_folded_list, y_hat=pred_folded_list, logger=logger, - export_save_path=folded_results_matrix_path, + export_save_path=exports_path, export_name="folded_dataset_results_matrix.json") # ACCURACIES for each fold @@ -124,13 +119,12 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, concat_save_model_instances_matrix_json(instances_dict=folded_instances_dict, cm_dict=folded_matrix_dict, exports_path=exports_path, - exports_dir=exports_dir, logger=logger, export_name="folded_dataset_instances_cm.json") simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", logger=logger, - export_save_path=folded_results_matrix_path, + export_save_path=exports_path, export_name="folded_simplified_matrix.json", write_mode=True) @@ -152,23 +146,22 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, # prediction for the whole dataset predictions_all = clf.predict(features_prepared) # save the model that is trained to the whole dataset - best_model_path = os.path.join(exports_path, exports_dir, "best_clf_model.pkl") + best_model_path = os.path.join(exports_path, "best_clf_model.pkl") joblib.dump(clf, best_model_path) logger.info("Best model saved.") # export the matrix dictionary from the whole dataset - whole_results_matrix_path = os.path.join(exports_path, exports_dir) whole_matrix_dict = matrix_creation(classes=clf.classes_, tracks=tracks, y_actual=predictions_all, y_hat=y, logger=logger, - export_save_path=whole_results_matrix_path, + export_save_path=exports_path, export_name="whole_dataset_results_matrix.json") simplified_cm_whole = 
simplified_matrix_export(best_result_file="whole_dataset_results_matrix.json", logger=logger, - export_save_path=whole_results_matrix_path, + export_save_path=exports_path, export_name="whole_dataset_cm_dict.json", write_mode=True) @@ -177,7 +170,6 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, concat_save_model_instances_matrix_json(instances_dict=None, cm_dict=whole_matrix_dict, exports_path=exports_path, - exports_dir=exports_dir, logger=logger, export_name="whole_dataset_instances_cm.json") @@ -192,7 +184,7 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, ) -def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, exports_dir, logger, export_name): +def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, logger, export_name): """ Save the best model's folded instances and confusion matrix dictionary merged into one dictionary @@ -200,15 +192,12 @@ def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_pat instances_dict: cm_dict: exports_path: - exports_dir: logger: export_name: Returns: """ - best_folds_cm_merge_dict_path = os.path.join(exports_path, exports_dir) - if instances_dict: # in case of the folded dataset where folds exist best_folds_cm_merge_dict = {**instances_dict, **cm_dict} @@ -219,7 +208,7 @@ def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_pat # Serializing json json_object_folds_cm = json.dumps(best_folds_cm_merge_dict, indent=4) # Writing to json - load_file_path = os.path.join(best_folds_cm_merge_dict_path, export_name) + load_file_path = os.path.join(exports_path, export_name) with open(load_file_path, "w") as outfile: outfile.write(json_object_folds_cm) logger.info("Whole folded instaces and matrix dictionary stored successfully.") diff --git a/acousticbrainz/models/sklearn/classification/report_files_export.py b/acousticbrainz/models/sklearn/classification/report_files_export.py index 52c4fc51b..d481e09d6 100644 --- a/acousticbrainz/models/sklearn/classification/report_files_export.py +++ b/acousticbrainz/models/sklearn/classification/report_files_export.py @@ -6,8 +6,7 @@ def export_report(config, name, report, filename, train_class, exports_path): - exports_dir = config.get("exports_directory") - reports_path = FindCreateDirectory(exports_path, os.path.join(exports_dir, "reports")).inspect_directory() + reports_path = FindCreateDirectory(exports_path, "reports").inspect_directory() # take current datetime now = datetime.now() datetime_str_verbose = now.isoformat() diff --git a/acousticbrainz/models/sklearn/classification/train_class.py b/acousticbrainz/models/sklearn/classification/train_class.py index 74c7eaba3..afb871577 100644 --- a/acousticbrainz/models/sklearn/classification/train_class.py +++ b/acousticbrainz/models/sklearn/classification/train_class.py @@ -8,7 +8,7 @@ from ..helper_functions.logging_tool import LoggerSetup -def train_class(config, gt_file, exports_directory, c_values, gamma_values, preprocessing_values, log_level): +def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, log_level): exports_path = config["exports_path"] gt_data = GroundTruthLoad(config, gt_file, exports_path, log_level) # tracks shuffled and exported @@ -18,13 +18,6 @@ def train_class(config, gt_file, exports_directory, c_values, gamma_values, prep class_name = gt_data.export_train_class() config["class_name"] = class_name - # project directory where the models and outputs will 
be saved - if exports_directory is None: - prefix_exports_dir = "exports" - config["exports_directory"] = "{}_{}".format(prefix_exports_dir, class_name) - else: - config["exports_directory"] = exports_directory - config = update_parameters(config=config, c_values=c_values, gamma_values=gamma_values, diff --git a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py index d11040467..c4d597dab 100644 --- a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py +++ b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py @@ -51,7 +51,6 @@ def __init__(self, config, exports_path, name, train_class, mode, level=1): self.mode = mode self.level = level - self.exports_dir = "" self.logs_path = "" def setup_logger(self): @@ -62,9 +61,7 @@ def setup_logger(self): Returns: The logger object. """ - self.exports_dir = self.config.get("exports_directory") - self.logs_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "logs")).inspect_directory() + self.logs_path = FindCreateDirectory(self.exports_path, "logs").inspect_directory() # Create a custom logger logger_object = logging.getLogger(self.name) diff --git a/acousticbrainz/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py index 0907436f6..e38b7b193 100644 --- a/acousticbrainz/models/sklearn/model/classification_project.py +++ b/acousticbrainz/models/sklearn/model/classification_project.py @@ -5,7 +5,7 @@ from ..classification.train_class import train_class -def create_classification_project(ground_truth_file, dataset_dir, project_file=None, exports_directory=None, exports_path=None, +def create_classification_project(ground_truth_file, dataset_dir, project_file=None, exports_path=None, c_values=None, gamma_values=None, preprocessing_values=None, seed=None, jobs=-1, verbose=1, logging="logging.INFO"): """ @@ -16,13 +16,8 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N will be created. Default: None. If None, the tool will create automatically a project file name in form of "project_CLASS_NAME", where CLASS_NAME is the target class as referred to the groundtruth data. - exports_directory: The name (str) of the directory that the results - of the classification project will be save to. Default: None. If None, - the tool will automatically create a directory with the name - "exports_CLASS_NAME", where CLASS_NAME is the target class as referred - to the groundtruth data. - exports_path: The path (str) to the exports directory. Default: None. If - None, the exports directory will be saved inside the app folder. + exports_path: The path (str) where the results of the classification project will be saved to. + Default: None. If None, the exports directory will be saved inside the app folder. seed: The seed (int) of the random shuffle generator. Default: 1 jobs: The cores (int) that will be exploited during the training phase. Default: -1. If -1, all the available cores will be used. 
@@ -62,14 +57,12 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N print("Exports path: {}".format(exports_path)) project_template["exports_path"] = exports_path - print("Exports directory: {}".format(exports_directory)) - print() print() print("-------------------------------------------------------") print("Loading GroundTruth yaml file:", ground_truth_file) - train_class(project_template, ground_truth_file, exports_directory, c_values, gamma_values, preprocessing_values, logging) + train_class(project_template, ground_truth_file, c_values, gamma_values, preprocessing_values, logging) if __name__ == '__main__': @@ -95,10 +88,6 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N help="Name of the project configuration file (.yaml) will be stored. If not specified " "it takes automatically the name .") - parser.add_argument("-e", "--exportsdir", - dest="exports_directory", - help="Name of the exports directory that the project's results will be stored.") - parser.add_argument("-p", "--path", dest="exports_path", help="Path where the project results will be stored. If empty, the results will be saved in " @@ -130,7 +119,6 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N create_classification_project(ground_truth_file=args.ground_truth_file, dataset_dir=args.dataset_dir, project_file=args.project_file, - exports_directory=args.exports_directory, exports_path=args.exports_path, seed=args.seed, jobs=args.jobs, diff --git a/acousticbrainz/models/sklearn/model/predict.py b/acousticbrainz/models/sklearn/model/predict.py index c34e4f0a4..257d7149f 100644 --- a/acousticbrainz/models/sklearn/model/predict.py +++ b/acousticbrainz/models/sklearn/model/predict.py @@ -19,7 +19,6 @@ def __init__(self, config, track_low_level, log_level): self.class_name = "" self.exports_path = "" - self.exports_dir = "" self.best_model = "" self.track_feats = dict() @@ -33,11 +32,9 @@ def __init__(self, config, track_low_level, log_level): def load_best_model(self): self.class_name = self.config["class_name"] self.exports_path = self.config["exports_path"] - self.exports_dir = self.config["exports_directory"] # self.exports_path = os.path.join(self.exports_path, "{}_{}".format(self.exports_dir, self.class_name)) best_model_path = os.path.join(self.exports_path, - self.exports_dir, "best_model_{}.json".format(self.class_name)) # best_model_path = os.path.join(self.exports_dir, "models", "model_grid_{}.pkl".format[""]) with open(best_model_path) as json_file: @@ -87,8 +84,7 @@ def preprocessing(self): # best_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(self.best_model["preprocessing"])) # load the best model that is trained to the whole dataset - models_path = FindCreateDirectory(self.exports_path, self.exports_dir).inspect_directory() - best_model_path = os.path.join(models_path, "best_clf_model.pkl") + best_model_path = os.path.join(self.exports_path, "best_clf_model.pkl") clf_loaded = joblib.load(best_model_path) predicted = clf_loaded.predict(features_prepared) predicted_prob = clf_loaded.predict_proba(features_prepared) diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index dabc181b2..62c065fa2 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -207,9 +207,7 @@ def create_df_tracks(self): 
self.logger.info("There are no NULL values found.") # export shuffled tracks to CSV format - exports_dir = self.config.get("exports_directory") - tracks_path = FindCreateDirectory(self.exports_path, - os.path.join(exports_dir, "tracks_csv_format")).inspect_directory() + tracks_path = FindCreateDirectory(self.exports_path, "tracks_csv_format").inspect_directory() self.df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(self.train_class))) self.logger.debug("DF INFO:") self.logger.debug("{}".format(self.df_tracks.info())) diff --git a/acousticbrainz/models/sklearn/transformation/transform.py b/acousticbrainz/models/sklearn/transformation/transform.py index 814f30e77..3e209f3a9 100644 --- a/acousticbrainz/models/sklearn/transformation/transform.py +++ b/acousticbrainz/models/sklearn/transformation/transform.py @@ -57,9 +57,7 @@ def post_processing(self): self.list_features = list(self.df_feats.columns) - exports_dir = self.config.get("exports_directory") - models_path = FindCreateDirectory(self.exports_path, - os.path.join(exports_dir, "models")).inspect_directory() + models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory() # clean list print(colored("Cleaning..", "yellow")) diff --git a/acousticbrainz/models/sklearn/transformation/transform_predictions.py b/acousticbrainz/models/sklearn/transformation/transform_predictions.py index ec749db97..e9a50d44e 100644 --- a/acousticbrainz/models/sklearn/transformation/transform_predictions.py +++ b/acousticbrainz/models/sklearn/transformation/transform_predictions.py @@ -53,9 +53,7 @@ def post_processing(self): self.list_features = list(self.df_feats.columns) - exports_dir = self.config.get("exports_directory") - models_path = FindCreateDirectory(self.exports_path, - os.path.join(exports_dir, "models")).inspect_directory() + models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory() # clean list print(colored("Cleaning..", "yellow")) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 3db9f67ff..b699df69a 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -130,7 +130,6 @@ def evaluate_sklearn(options, eval_location, ground_truth_file, dataset_dir, sto create_classification_project(ground_truth_file=ground_truth_file, dataset_dir=dataset_dir, project_file=eval_job["id"], - exports_directory=eval_job["id"], exports_path=eval_location, c_values=options.get("c_values", []), gamma_values=options.get("gamma_values", []), @@ -139,8 +138,7 @@ def evaluate_sklearn(options, eval_location, ground_truth_file, dataset_dir, sto logging.info("Saving results...") results = load_best_results_sklearn(exported_path=eval_location, - project_file=eval_job["id"], - exports_directory=eval_job["id"]) + project_file=eval_job["id"]) db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ "project_path": eval_location, "parameters": results["parameters"], @@ -150,7 +148,7 @@ def evaluate_sklearn(options, eval_location, ground_truth_file, dataset_dir, sto })) -def load_best_results_sklearn(exported_path, project_file, exports_directory): +def load_best_results_sklearn(exported_path, project_file): project_conf_file_path = os.path.join(exported_path, "{}.yaml".format(project_file)) logging.info("Config file path: {}".format(project_conf_file_path)) with open(project_conf_file_path) as fp: @@ -176,10 +174,9 @@ def load_best_results_sklearn(exported_path, project_file, exports_directory): # data_fold_simplified_matrix = json.load(json_file_simple_cm) # export the 
matrix dictionary from the folded dataset
-    folded_results_matrix_path = os.path.join(exported_path, exports_directory)
     simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json",
                                              logger=logging,
-                                             export_save_path=folded_results_matrix_path,
+                                             export_save_path=exported_path,
                                              export_name="simplified_cm.json",
                                              write_mode=False)

From 900207dd72c331fa68237b563f57fb62c295be00 Mon Sep 17 00:00:00 2001
From: Kartik Ohri
Date: Mon, 5 Jul 2021 22:58:37 +0530
Subject: [PATCH 49/64] Fix more references to duplicate uuid in export path

---
 dataset_eval/evaluate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py
index b699df69a..0934afa3f 100644
--- a/dataset_eval/evaluate.py
+++ b/dataset_eval/evaluate.py
@@ -156,13 +156,13 @@ def load_best_results_sklearn(exported_path, project_file):
     logging.info("Model: {}".format(project_data['class_name']))

     # load the best model dictionary
-    best_model_path = os.path.join(exported_path, project_file, "best_model_{}.json".format(project_data['class_name']))
+    best_model_path = os.path.join(exported_path, "best_model_{}.json".format(project_data['class_name']))
     logging.info("Best model path: {}".format(best_model_path))
     with open(best_model_path) as json_file:
         data_best_model = json.load(json_file)

     # load the best model's instances and matrix dictionary
-    fold_matrix_path = os.path.join(exported_path, project_file, "folded_dataset_instances_cm.json")
+    fold_matrix_path = os.path.join(exported_path, "folded_dataset_instances_cm.json")
     logging.info("Best Instances and Matrix JSON path: {}".format(fold_matrix_path))
     with open(fold_matrix_path) as json_file_cm:
         data_fold_matrix = json.load(json_file_cm)

From fda2d9dfb72d39de60a912a51c3c8adb476db78a Mon Sep 17 00:00:00 2001
From: Kartik Ohri
Date: Tue, 6 Jul 2021 06:33:52 +0530
Subject: [PATCH 50/64] Simplify FindCreateDirectory to a function

Making a class just to call a util method does not seem useful. Also, the
directories are always created by the ClassificationTaskManager and
DatasetExplorer, so we can just refer to their paths with os.path.join
elsewhere. We already have a similar function in AB utils, but that one is
used on Python 2.7, which does not have the exist_ok arg available. Hence,
keeping this.
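For reference, a minimal sketch of the resulting pattern (the helper below
mirrors the one added in this patch; the example call sites and paths are
hypothetical):

    import os

    def create_directory(exports_path, directory):
        # build the target path, create any missing directories, and return it
        full_path = os.path.join(exports_path, directory)
        os.makedirs(full_path, exist_ok=True)
        return full_path

    # created once up front (e.g. by the task manager):
    results_path = create_directory("/some/exports/path", "results")
    # elsewhere the directory is known to exist, so a plain join suffices:
    results_path = os.path.join("/some/exports/path", "results")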
--- .../classification_task_manager.py | 16 ++++++++-------- .../sklearn/classification/classifier_grid.py | 5 ++--- .../models/sklearn/classification/evaluation.py | 7 +++---- .../classification/report_files_export.py | 4 +--- .../sklearn/helper_functions/logging_tool.py | 3 +-- .../models/sklearn/helper_functions/utils.py | 17 ++++++----------- acousticbrainz/models/sklearn/model/predict.py | 2 +- .../sklearn/transformation/load_ground_truth.py | 4 ++-- .../models/sklearn/transformation/transform.py | 3 +-- .../transformation/transform_predictions.py | 3 +-- 10 files changed, 26 insertions(+), 38 deletions(-) diff --git a/acousticbrainz/models/sklearn/classification/classification_task_manager.py b/acousticbrainz/models/sklearn/classification/classification_task_manager.py index 34067608a..ff5dc280e 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task_manager.py +++ b/acousticbrainz/models/sklearn/classification/classification_task_manager.py @@ -3,7 +3,7 @@ from termcolor import colored from datetime import datetime -from ..helper_functions.utils import FindCreateDirectory, TrainingProcesses +from ..helper_functions.utils import create_directory, TrainingProcesses from ..classification.classification_task import ClassificationTask from ..helper_functions.logging_tool import LoggerSetup @@ -64,19 +64,19 @@ def files_existence(self): """ # main exports # train results exports - self.results_path = FindCreateDirectory(self.exports_path, "results").inspect_directory() + self.results_path = create_directory(self.exports_path, "results") # logs - self.logs_path = FindCreateDirectory(self.exports_path, "logs").inspect_directory() + self.logs_path = create_directory(self.exports_path, "logs") # tracks - self.tracks_path = FindCreateDirectory(self.exports_path, "tracks_csv_format").inspect_directory() + self.tracks_path = create_directory(self.exports_path, "tracks_csv_format") # datasets - self.dataset_path = FindCreateDirectory(self.exports_path, "dataset").inspect_directory() + self.dataset_path = create_directory(self.exports_path, "dataset") # models - self.models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory() + self.models_path = create_directory(self.exports_path, "models") # images - self.images_path = FindCreateDirectory(self.exports_path, "images").inspect_directory() + self.images_path = create_directory(self.exports_path, "images") # reports - self.reports_path = FindCreateDirectory(self.exports_path, "reports").inspect_directory() + self.reports_path = create_directory(self.exports_path, "reports") def config_file_analysis(self): """ diff --git a/acousticbrainz/models/sklearn/classification/classifier_grid.py b/acousticbrainz/models/sklearn/classification/classifier_grid.py index 155d37af6..405b5f8a7 100644 --- a/acousticbrainz/models/sklearn/classification/classifier_grid.py +++ b/acousticbrainz/models/sklearn/classification/classifier_grid.py @@ -7,7 +7,6 @@ from sklearn.model_selection import KFold from ..transformation.transform import Transform -from ..helper_functions.utils import FindCreateDirectory from ..helper_functions.logging_tool import LoggerSetup @@ -67,8 +66,8 @@ def train_grid_search_clf(self): # save best results for each train process # paths declaration for saving the grid training results - results_path = FindCreateDirectory(self.exports_path, "results").inspect_directory() - models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory() + results_path = os.path.join(self.exports_path, 
"results") + models_path = os.path.join(self.exports_path, "models") best_process_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(tr_process["preprocess"])) # save the results from each train process step and return the results from that train in a dictionary diff --git a/acousticbrainz/models/sklearn/classification/evaluation.py b/acousticbrainz/models/sklearn/classification/evaluation.py index 2a2e7d772..576ae24f1 100644 --- a/acousticbrainz/models/sklearn/classification/evaluation.py +++ b/acousticbrainz/models/sklearn/classification/evaluation.py @@ -11,7 +11,6 @@ from sklearn.metrics import confusion_matrix, classification_report import joblib -from ..helper_functions.utils import FindCreateDirectory from ..transformation.transform import Transform from ..classification.report_files_export import export_report from ..helper_functions.logging_tool import LoggerSetup @@ -35,9 +34,9 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, # load project directory and the corresponding save paths - dataset_path = FindCreateDirectory(exports_path, "dataset").inspect_directory() - models_path = FindCreateDirectory(exports_path, "models").inspect_directory() - images_path = FindCreateDirectory(exports_path, "images").inspect_directory() + dataset_path = os.path.join(exports_path, "dataset") + models_path = os.path.join(exports_path, "models") + images_path = os.path.join(exports_path, "images") # load best model params and score data load_best_model_params_score_path = os.path.join(exports_path, "best_model_{}.json".format(class_name)) diff --git a/acousticbrainz/models/sklearn/classification/report_files_export.py b/acousticbrainz/models/sklearn/classification/report_files_export.py index d481e09d6..9fef07aad 100644 --- a/acousticbrainz/models/sklearn/classification/report_files_export.py +++ b/acousticbrainz/models/sklearn/classification/report_files_export.py @@ -2,11 +2,9 @@ from datetime import datetime from termcolor import colored -from ..helper_functions.utils import FindCreateDirectory - def export_report(config, name, report, filename, train_class, exports_path): - reports_path = FindCreateDirectory(exports_path, "reports").inspect_directory() + reports_path = os.path.join(exports_path, "reports") # take current datetime now = datetime.now() datetime_str_verbose = now.isoformat() diff --git a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py index c4d597dab..a0779ad15 100644 --- a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py +++ b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py @@ -10,7 +10,6 @@ """ import logging import os -from ..helper_functions.utils import FindCreateDirectory # # load yaml configuration file to a dict # config_data = load_yaml() @@ -61,7 +60,7 @@ def setup_logger(self): Returns: The logger object. 
""" - self.logs_path = FindCreateDirectory(self.exports_path, "logs").inspect_directory() + self.logs_path = os.path.join(self.exports_path, "logs") # Create a custom logger logger_object = logging.getLogger(self.name) diff --git a/acousticbrainz/models/sklearn/helper_functions/utils.py b/acousticbrainz/models/sklearn/helper_functions/utils.py index 7cca3b1ba..6c892e289 100644 --- a/acousticbrainz/models/sklearn/helper_functions/utils.py +++ b/acousticbrainz/models/sklearn/helper_functions/utils.py @@ -51,17 +51,12 @@ def check_df_info(self): print(self.df_check["category"].value_counts()) -class FindCreateDirectory: - def __init__(self, exports_path, directory): - self.exports_path = exports_path - self.directory = directory - - def inspect_directory(self): - # find dynamically the current script directory - full_path = os.path.join(self.exports_path, self.directory) - # create path directories if not exist --> else return the path - os.makedirs(full_path, exist_ok=True) - return full_path +def create_directory(exports_path, directory): + # find dynamically the current script directory + full_path = os.path.join(exports_path, directory) + # create path directories if not exist --> else return the path + os.makedirs(full_path, exist_ok=True) + return full_path class LogsDeleter: diff --git a/acousticbrainz/models/sklearn/model/predict.py b/acousticbrainz/models/sklearn/model/predict.py index 257d7149f..1e3a1c8d0 100644 --- a/acousticbrainz/models/sklearn/model/predict.py +++ b/acousticbrainz/models/sklearn/model/predict.py @@ -5,7 +5,7 @@ import joblib import json import pandas as pd -from ..helper_functions.utils import load_yaml, FindCreateDirectory +from ..helper_functions.utils import load_yaml from ..transformation.utils_preprocessing import flatten_dict_full from ..transformation.transform_predictions import TransformPredictions from ..helper_functions.logging_tool import LoggerSetup diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index 62c065fa2..a474f77ef 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -4,7 +4,7 @@ from pprint import pprint from termcolor import colored import random -from ..helper_functions.utils import load_yaml, FindCreateDirectory +from ..helper_functions.utils import create_directory from ..transformation.load_low_level import FeaturesDf from ..helper_functions.logging_tool import LoggerSetup @@ -207,7 +207,7 @@ def create_df_tracks(self): self.logger.info("There are no NULL values found.") # export shuffled tracks to CSV format - tracks_path = FindCreateDirectory(self.exports_path, "tracks_csv_format").inspect_directory() + tracks_path = create_directory(self.exports_path, "tracks_csv_format") self.df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(self.train_class))) self.logger.debug("DF INFO:") self.logger.debug("{}".format(self.df_tracks.info())) diff --git a/acousticbrainz/models/sklearn/transformation/transform.py b/acousticbrainz/models/sklearn/transformation/transform.py index 3e209f3a9..33b3a3e41 100644 --- a/acousticbrainz/models/sklearn/transformation/transform.py +++ b/acousticbrainz/models/sklearn/transformation/transform.py @@ -5,7 +5,6 @@ import os import six -from ..helper_functions.utils import FindCreateDirectory from ..transformation.utils_preprocessing import list_descr_handler from 
..transformation.utils_preprocessing import list_descr_handler
 from ..transformation.utils_preprocessing import feats_selector_list
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -57,7 +56,7 @@ def post_processing(self):

         self.list_features = list(self.df_feats.columns)

-        models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory()
+        models_path = os.path.join(self.exports_path, "models")

         # clean list
         print(colored("Cleaning..", "yellow"))
diff --git a/acousticbrainz/models/sklearn/transformation/transform_predictions.py b/acousticbrainz/models/sklearn/transformation/transform_predictions.py
index e9a50d44e..c24a567aa 100644
--- a/acousticbrainz/models/sklearn/transformation/transform_predictions.py
+++ b/acousticbrainz/models/sklearn/transformation/transform_predictions.py
@@ -6,7 +6,6 @@
 import six
 from sklearn.base import BaseEstimator, TransformerMixin

-from ..helper_functions.utils import FindCreateDirectory
 from ..transformation.utils_preprocessing import list_descr_handler
 from ..transformation.utils_preprocessing import feats_selector_list
@@ -53,7 +52,7 @@ def post_processing(self):

         self.list_features = list(self.df_feats.columns)

-        models_path = FindCreateDirectory(self.exports_path, "models").inspect_directory()
+        models_path = os.path.join(self.exports_path, "models")

         # clean list
         print(colored("Cleaning..", "yellow"))

From 4f9588cee2ab4ee9474f4b07555f60285562f600 Mon Sep 17 00:00:00 2001
From: Kartik Ohri
Date: Tue, 6 Jul 2021 06:48:54 +0530
Subject: [PATCH 51/64] Always check and create directory for logging if needed

Revert the change from the previous commit for the logging directory, because
we set up logging before trying to create all other output directories.

---
 acousticbrainz/models/sklearn/helper_functions/logging_tool.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py
index a0779ad15..547ce7278 100644
--- a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py
+++ b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py
@@ -21,6 +21,7 @@
 #         log_path = os.path.join(current_d, "logs_dir")
 #     else:
 #         log_path = FindCreateDirectory(config_data["log_directory"]).inspect_directory()
+from acousticbrainz.models.sklearn.helper_functions.utils import create_directory


 class LoggerSetup:
@@ -60,7 +61,7 @@ def setup_logger(self):
         Returns:
             The logger object.
         """
-        self.logs_path = os.path.join(self.exports_path, "logs")
+        self.logs_path = create_directory(self.exports_path, "logs")

         # Create a custom logger
         logger_object = logging.getLogger(self.name)

From cb14dcdbb347ba5116e58880dc46d1138215fb37 Mon Sep 17 00:00:00 2001
From: Kartik Ohri
Date: Tue, 6 Jul 2021 16:25:01 +0530
Subject: [PATCH 52/64] Remove unused classes

---
 .../classification/classifier_basic.py       | 40 -------------------
 .../models/sklearn/helper_functions/utils.py | 20 ----------
 2 files changed, 60 deletions(-)
 delete mode 100644 acousticbrainz/models/sklearn/classification/classifier_basic.py

diff --git a/acousticbrainz/models/sklearn/classification/classifier_basic.py b/acousticbrainz/models/sklearn/classification/classifier_basic.py
deleted file mode 100644
index 86e7b9b24..000000000
--- a/acousticbrainz/models/sklearn/classification/classifier_basic.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from sklearn.svm import SVC
-
-
-class TrainClassifier:
-    """
-    This class initiates a simple classifier. It is used for initiating a simple model from
-    sklearn or other APIs in the future, like TensorFlow.
-    TODO: Initiating other models from sklearn (e.g. Random Forests, Decision Tree, etc.)
-    """
-    def __init__(self, classifier, params):
-        """
-        Args:
-            classifier: the classifier name (str) to be set. A string that is among the valid
-                classifiers list.
-            params: The parameters of the classifier (dictionary).
-
-        Returns:
-            The model object that is initiated (including its set of parameters)
-        """
-        self.classifier = classifier
-        self.params = params
-
-    def model(self):
-        validClassifiers = ["NN", "svm", "rf"]
-        if self.classifier not in validClassifiers:
-            raise ValueError("The classifier name must be valid.")
-
-        if self.classifier == "svm":
-            param_C = self.params["C"]
-            param_gamma = self.params["gamma"]
-            param_class_weight = self.params["class_weight"]
-            param_kernel = self.params["kernel"]
-            model = SVC(C=param_C,  # 2 ** param_C
-                        gamma=param_gamma,  # 2 ** param_gamma
-                        kernel=param_kernel,
-                        class_weight=param_class_weight,
-                        probability=True)
-            return model
-        else:
-            return None
diff --git a/acousticbrainz/models/sklearn/helper_functions/utils.py b/acousticbrainz/models/sklearn/helper_functions/utils.py
index 6c892e289..ebd82dc9c 100644
--- a/acousticbrainz/models/sklearn/helper_functions/utils.py
+++ b/acousticbrainz/models/sklearn/helper_functions/utils.py
@@ -59,26 +59,6 @@ def create_directory(exports_path, directory):
     return full_path


-class LogsDeleter:
-    def __init__(self, config, train_class):
-        self.config = config
-        self.train_class = train_class
-
-    def delete_logs(self):
-        # delete logs for specific model and class on a new run
-        if self.config["delete_logs"] is True:
-            print("Evaluation logs deletion is turned to ON.")
-            dir_name = os.path.join(os.getcwd(), "evaluations")
-            evaluations_list = os.listdir(dir_name)
-            for item in evaluations_list:
-                if item.endswith(".txt"):
-                    if item.startswith("{}_{}".format(self.train_class, self.config["train_kind"])):
-                        os.remove(os.path.join(dir_name, item))
-            print("Previous evaluation logs deleted successfully.")
-        else:
-            print("Evaluation logs deletion is turned to OFF.")
-
-
 def change_weights_val(i):
     """
     It is used in the TrainingProcesses class. It is used to transform each value of

From a8a629591fb073f723b6ff307461f09160a6551e Mon Sep 17 00:00:00 2001
From: Kartik Ohri
Date: Tue, 6 Jul 2021 17:50:33 +0530
Subject: [PATCH 53/64] First step in refactoring logging

Instead of setting up the logger again and again, we pass the logger around
manually. In subsequent steps, this will be replaced entirely to utilise
Python's hierarchical logging. We can't do this just now because the file
handler requires the dataset name.
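As a rough sketch, the hierarchical setup this refactoring is working towards
(the package logger name matches a later patch in this series; the example
module name is an assumption):

    import logging

    # one-time configuration on the package-level logger
    _logger = logging.getLogger("acousticbrainz.models")
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s'))
    _logger.addHandler(_handler)
    _logger.setLevel(logging.INFO)

    # a module under the package would then simply do
    # `logger = logging.getLogger(__name__)`; its records propagate up
    # to the handler configured above, so no per-module handlers are needed
    logger = logging.getLogger("acousticbrainz.models.sklearn.example")
    logger.info("handled by the package-level handler")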
--- .../classification/classification_task.py | 19 +-- .../classification_task_manager.py | 16 +-- .../sklearn/classification/classifier_grid.py | 17 +-- .../sklearn/classification/evaluation.py | 11 +- .../sklearn/classification/train_class.py | 19 +-- .../sklearn/helper_functions/logging_tool.py | 127 ++++++------------ .../sklearn/model/classification_project.py | 14 +- .../models/sklearn/model/predict.py | 16 +-- .../transformation/load_ground_truth.py | 17 +-- .../sklearn/transformation/load_low_level.py | 16 +-- .../sklearn/transformation/transform.py | 16 +-- .../transformation/transform_predictions.py | 16 +-- 12 files changed, 85 insertions(+), 219 deletions(-) diff --git a/acousticbrainz/models/sklearn/classification/classification_task.py b/acousticbrainz/models/sklearn/classification/classification_task.py index 87b845ab0..455230a92 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task.py +++ b/acousticbrainz/models/sklearn/classification/classification_task.py @@ -2,7 +2,6 @@ import json from ..classification.classifier_grid import TrainGridClassifier from ..classification.evaluation import evaluation -from ..helper_functions.logging_tool import LoggerSetup class ClassificationTask: @@ -12,7 +11,7 @@ class ClassificationTask: to the configuration file declared class to train the model and then it uses that model for evaluation. """ - def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks, log_level): + def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks, logger): """ Args: config: The configuration data that contain the settings from the configuration @@ -33,24 +32,14 @@ def __init__(self, config, classifier, train_class, training_processes, X, y, ex self.config = config self.classifier = classifier self.train_class = train_class - self.log_level = log_level self.X = X self.y = y self.training_processes = training_processes self.exports_path = exports_path self.tracks = tracks - self.logger = "" + self.logger = logger - self.setting_logger() - - def setting_logger(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="train_model_{}".format(self.train_class), - train_class=self.train_class, - mode="a", - level=self.log_level).setup_logger() def run(self): # grid search train @@ -63,7 +52,7 @@ def run(self): y=self.y, tr_processes=self.training_processes, exports_path=self.exports_path, - log_level=self.log_level + logger=logger ) grid_svm_train.train_grid_search_clf() grid_svm_train.export_best_classifier() @@ -86,5 +75,5 @@ def run(self): tracks=self.tracks, process=best_model["preprocessing"], exports_path=self.exports_path, - log_level=self.log_level + logger=self.logger ) diff --git a/acousticbrainz/models/sklearn/classification/classification_task_manager.py b/acousticbrainz/models/sklearn/classification/classification_task_manager.py index ff5dc280e..d5c49ee98 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task_manager.py +++ b/acousticbrainz/models/sklearn/classification/classification_task_manager.py @@ -5,7 +5,6 @@ from ..helper_functions.utils import create_directory, TrainingProcesses from ..classification.classification_task import ClassificationTask -from ..helper_functions.logging_tool import LoggerSetup validClassifiers = ["svm", "NN"] @@ -21,7 +20,7 @@ class ClassificationTaskManager: with their corresponding preprocessing steps and parameters declaration for the classifier, and 
executes the classification task for each step. """ - def __init__(self, config, train_class, X, y, tracks, exports_path, log_level): + def __init__(self, config, train_class, X, y, tracks, exports_path, logger): """ Args: config: The configuration file name. @@ -35,7 +34,7 @@ def __init__(self, config, train_class, X, y, tracks, exports_path, log_level): self.y = y self.tracks = tracks self.exports_path = exports_path - self.log_level = log_level + self.logger = logger self.results_path = "" self.logs_path = "" @@ -45,18 +44,9 @@ def __init__(self, config, train_class, X, y, tracks, exports_path, log_level): self.images_path = "" self.reports_path = "" - self.logger = "" - self.setting_logger() self.files_existence() self.config_file_analysis() - def setting_logger(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="train_model_{}".format(self.train_class), - train_class=self.train_class, - mode="a", - level=self.log_level).setup_logger() def files_existence(self): """ @@ -119,7 +109,7 @@ def apply_processing(self): y=self.y, exports_path=self.exports_path, tracks=self.tracks, - log_level=self.log_level + logger=self.logger ) try: task.run() diff --git a/acousticbrainz/models/sklearn/classification/classifier_grid.py b/acousticbrainz/models/sklearn/classification/classifier_grid.py index 405b5f8a7..5c36ded43 100644 --- a/acousticbrainz/models/sklearn/classification/classifier_grid.py +++ b/acousticbrainz/models/sklearn/classification/classifier_grid.py @@ -7,11 +7,10 @@ from sklearn.model_selection import KFold from ..transformation.transform import Transform -from ..helper_functions.logging_tool import LoggerSetup class TrainGridClassifier: - def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, log_level): + def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, logger): self.config = config self.classifier = classifier self.class_name = class_name @@ -19,21 +18,11 @@ def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_p self.y = y self.tr_processes = tr_processes self.exports_path = exports_path - self.log_level = log_level - self.logger = "" + self.logger = logger self.best_models_list = [] # self.train_grid_search_clf() - self.setting_logger() - - def setting_logger(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="train_model_{}".format(self.class_name), - train_class=self.class_name, - mode="a", - level=self.log_level).setup_logger() def train_grid_search_clf(self): process_counter = 1 @@ -54,7 +43,7 @@ def train_grid_search_clf(self): process=tr_process["preprocess"], train_class=self.class_name, exports_path=self.exports_path, - log_level=self.log_level).post_processing() + logger=self.logger).post_processing() # train the grid classifier and return the trained model gsvc = train_grid(tr_process=tr_process, diff --git a/acousticbrainz/models/sklearn/classification/evaluation.py b/acousticbrainz/models/sklearn/classification/evaluation.py index 576ae24f1..a26f5bf88 100644 --- a/acousticbrainz/models/sklearn/classification/evaluation.py +++ b/acousticbrainz/models/sklearn/classification/evaluation.py @@ -13,18 +13,11 @@ from ..transformation.transform import Transform from ..classification.report_files_export import export_report -from ..helper_functions.logging_tool import LoggerSetup from ..classification.matrix_creation import matrix_creation, simplified_matrix_export -def evaluation(config, n_fold, X, 
y, class_name, tracks, process, exports_path, log_level): +def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, logger): print(colored("------ EVALUATION and FOLDING ------", "yellow")) - logger = LoggerSetup(config=config, - exports_path=exports_path, - name="train_model_{}".format(class_name), - train_class=class_name, - mode="a", - level=log_level).setup_logger() logger.info("---- Folded evaluation of the model in the dataset ----") logger.info("number of folds set to config: {}".format(n_fold)) @@ -66,7 +59,7 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, process=process, train_class=class_name, exports_path=exports_path, - log_level=log_level).post_processing() + logger=logger).post_processing() logger.debug("Features prepared shape: {}".format(features_prepared.shape)) # Starting Training, Predictions for each fold diff --git a/acousticbrainz/models/sklearn/classification/train_class.py b/acousticbrainz/models/sklearn/classification/train_class.py index afb871577..8e92a0f1f 100644 --- a/acousticbrainz/models/sklearn/classification/train_class.py +++ b/acousticbrainz/models/sklearn/classification/train_class.py @@ -2,10 +2,10 @@ from termcolor import colored import yaml +from ..helper_functions.logging_tool import setup_logger from ..transformation.load_ground_truth import GroundTruthLoad from ..classification.classification_task_manager import ClassificationTaskManager from ..transformation.load_ground_truth import DatasetExporter -from ..helper_functions.logging_tool import LoggerSetup def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, log_level): @@ -23,12 +23,13 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l gamma_values=gamma_values, preprocessing_values=preprocessing_values) - logger = LoggerSetup(config=config, - exports_path=exports_path, - name="train_model_{}".format(class_name), - train_class=class_name, - mode="w", - level=log_level).setup_logger() + logger = setup_logger( + exports_path=exports_path, + name="train_model_{}".format(class_name), + mode="w", + level=log_level + ) + logger.info("---- TRAINING FOR THE {} MODEL HAS JUST STARTED ----".format(class_name)) logger.debug("Type of exported GT data exported: {}".format(type(tracks_listed_shuffled))) @@ -51,7 +52,7 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l tracks_list=tracks_listed_shuffled, train_class=class_name, exports_path=exports_path, - log_level=log_level + logger=logger ).create_df_tracks() logger.debug("Types of exported files from GT:") logger.debug("Type of features: {}".format(type(features))) @@ -64,7 +65,7 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l y=labels, tracks=tracks, exports_path=exports_path, - log_level=log_level) + logger=logger) classification_time = model_manage.apply_processing() print(colored("Classification ended successfully in {} minutes.".format(classification_time), "green")) logger.info("Classification ended successfully in {} minutes.".format(classification_time)) diff --git a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py index 547ce7278..0c764de0e 100644 --- a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py +++ b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py @@ -1,108 +1,57 @@ """ -This file consists of the LoggerSetup class that is used for logging. 
+This file consists of the setup_logger method that is used for logging. setup_logger()
+method sets up a new logger object with the related configurations.
 
-Here, the LoggerSetup and its embedded setup_logger() method set up a new logger object with the related configurations.
-
-    Typical usage example:
-
-    logging_object = LoggerSetup(logger_name, logging_file_location, level_of_logging)
-    logger = logging_object.setup_logger()
+Typical usage example:
+    logger = setup_logger(logger_name, logging_file_location, level_of_logging)
 """
 import logging
 import os
 
-# # load yaml configuration file to a dict
-# config_data = load_yaml()
-# # If log directory does not exist, create one
-# current_d = os.getcwd()
-# if config_data["log_directory"] is None or config_data["log_directory"] is None:
-#     if not os.path.exists(os.path.join(current_d, "logs_dir")):
-#         os.makedirs(os.path.join(current_d, "logs_dir"))
-#     log_path = os.path.join(current_d, "logs_dir")
-# else:
-#     log_path = FindCreateDirectory(config_data["log_directory"]).inspect_directory()
 from acousticbrainz.models.sklearn.helper_functions.utils import create_directory


-class LoggerSetup:
-    """It sets up a logging object.
-
-    Attributes:
-        name: The name of the logger.
-        log_file: The path of the logging file export.
-        level: An integer that defines the logging level.
+def setup_logger(exports_path, name, mode, level=logging.INFO):
     """
-    def __init__(self, config, exports_path, name, train_class, mode, level=1):
-        """
-        Inits the logger object with the corresponding parameters.
+    Function to set up as many loggers as you want. It exports the logging results to a file
+    in the relevant path that is determined by the configuration file.

-        Args:
-            config: The configuration data (dict).
-            exports_path: The path (str) the logging exports will be exported.
-            name: The name (str) of the logger.
-            train_class: The name of the target class (str)
-            level: The level (int) of the logging. Defaults to 1.
-            mode: The mode (str) translated in write, append. Valid values ("w", "a")
-        """
-        self.config = config
-        self.exports_path = exports_path
-        self.name = name
-        self.train_class = train_class
-        self.mode = mode
-        self.level = level
+    Args:
+        exports_path: The path (str) the logging exports will be exported.
+        name: The name (str) of the logger.
+        level: The level (int) of the logging. Defaults to logging.INFO.
+        mode: The mode (str) translated in write, append. Valid values ("w", "a")

-        self.logs_path = ""
-
-    def setup_logger(self):
-        """
-        Function to set up as many loggers as you want. It exports the logging results to a file
-        in the relevant path that is determined by the configuration file.
-
-        Returns:
-            The logger object.
+    Returns:
+        The logger object.
+ """ + logs_path = create_directory(exports_path, "logs") - # Create a custom logger - logger_object = logging.getLogger(self.name) + # Create a custom logger + logger = logging.getLogger(name) - # Create handlers - c_handler = logging.StreamHandler() - f_handler = logging.FileHandler(os.path.join(self.logs_path, "{}.log".format(self.name)), mode=self.mode) + # Create handlers + c_handler = logging.StreamHandler() + f_handler = logging.FileHandler(os.path.join(logs_path, "{}.log".format(name)), mode=mode) - # Create formatters and add it to handlers - c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s') - f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - c_handler.setFormatter(c_format) - f_handler.setFormatter(f_format) + # Create formatters and add it to handlers + c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s') + f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + c_handler.setFormatter(c_format) + f_handler.setFormatter(f_format) - # if handlers are already present and if so, clear them before adding new handlers. This is pretty convenient - # when debugging and the code includes the logger initialization - if logger_object.hasHandlers(): - logger_object.handlers.clear() + # if handlers are already present and if so, clear them before adding new handlers. This is pretty convenient + # when debugging and the code includes the logger initialization + if logger.hasHandlers(): + logger.handlers.clear() - # Add handlers to the logger - logger_object.addHandler(c_handler) - logger_object.addHandler(f_handler) + # Add handlers to the logger + logger.addHandler(c_handler) + logger.addHandler(f_handler) - if self.level is None: - logger_object.setLevel(logging.INFO) - elif self.level == "logging.DEBUG": - logger_object.setLevel(logging.DEBUG) - elif self.level == "logging.INFO": - logger_object.setLevel(logging.INFO) - elif self.level == "logging.WARNING": - logger_object.setLevel(logging.WARNING) - elif self.level == "logging.ERROR": - logger_object.setLevel(logging.ERROR) - elif self.level == "logging.CRITICAL": - logger_object.setLevel(logging.CRITICAL) - else: - print("Please define correct one of the Debug Levels:\n" - "logging.DEBUG: DEBUG\n" - "logging.INFO: INFO\n" - "logging.WARNING: WARNING\n" - "logging.ERROR: ERROR\n" - "logging.CRITICAL: CRITICAL") + if level is None: + logger.setLevel(logging.INFO) + else: + logger.setLevel(level) - return logger_object + return logger diff --git a/acousticbrainz/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py index e38b7b193..8e4c583b4 100644 --- a/acousticbrainz/models/sklearn/model/classification_project.py +++ b/acousticbrainz/models/sklearn/model/classification_project.py @@ -7,7 +7,7 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=None, exports_path=None, c_values=None, gamma_values=None, preprocessing_values=None, - seed=None, jobs=-1, verbose=1, logging="logging.INFO"): + seed=None, jobs=-1, verbose=1, logging="INFO"): """ Args: ground_truth_file: The path (str) to the groundtruth yaml file of the dataset. It is required. @@ -24,9 +24,8 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N verbose: The verbosity (int) of the printed messages where this function is available (for example in sklearn's GridSearch algorithm). Default: 1. The higher the number the higher the verbosity. 
- logging: The level (str) of the logging prints. Default: "logging.INFO". - Available values: logging.DEBUG, logging.INFO, logging.WARNING, - logging.ERROR, logging.CRITICAL. + logging: The level (str) of the logging prints. Default: "INFO". + Available values: DEBUG, INFO, WARNING, ERROR, CRITICAL. """ try: path_template = os.path.dirname(os.path.realpath(__file__)) @@ -109,9 +108,10 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N type=int) parser.add_argument("-l", "--logging", - default="logging.INFO", - help="The logging level that will be printed logging.DEBUG, logging.INFO, logging.WARNING, " - "logging.ERROR, logging.CRITICAL).", + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="The logging level that will be printed DEBUG, INFO, WARNING, " + "ERROR, CRITICAL", type=str) args = parser.parse_args() diff --git a/acousticbrainz/models/sklearn/model/predict.py b/acousticbrainz/models/sklearn/model/predict.py index 1e3a1c8d0..46c750587 100644 --- a/acousticbrainz/models/sklearn/model/predict.py +++ b/acousticbrainz/models/sklearn/model/predict.py @@ -8,7 +8,7 @@ from ..helper_functions.utils import load_yaml from ..transformation.utils_preprocessing import flatten_dict_full from ..transformation.transform_predictions import TransformPredictions -from ..helper_functions.logging_tool import LoggerSetup +from ..helper_functions.logging_tool import setup_logger class Predict: @@ -41,12 +41,12 @@ def load_best_model(self): self.best_model = json.load(json_file) def preprocessing(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="predict_{}".format(self.class_name), - train_class=self.class_name, - mode="w", - level=self.log_level).setup_logger() + self.logger = setup_logger( + exports_path=self.exports_path, + name="predict_{}".format(self.class_name), + mode="w", + level=self.log_level + ) self.logger.info("Best model:") self.logger.info(self.best_model) @@ -74,7 +74,7 @@ def preprocessing(self): process=self.best_model["preprocessing"], train_class=self.class_name, exports_path=self.exports_path, - log_level=self.log_level + logger=self.logger ).post_processing() self.logger.debug("Features shape after preparation: {}".format(features_prepared.shape)) diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index a474f77ef..78922830f 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -6,7 +6,6 @@ import random from ..helper_functions.utils import create_directory from ..transformation.load_low_level import FeaturesDf -from ..helper_functions.logging_tool import LoggerSetup class GroundTruthLoad: @@ -132,29 +131,19 @@ class DatasetExporter: """ TODO: Description """ - def __init__(self, config, tracks_list, train_class, exports_path, log_level): + def __init__(self, config, tracks_list, train_class, exports_path, logger): self.config = config self.tracks_list = tracks_list self.train_class = train_class self.exports_path = exports_path - self.log_level = log_level + self.logger = logger self.dataset_dir = "" self.class_dir = "" self.df_tracks = pd.DataFrame() self.df_feats = pd.DataFrame() self.y = [] - self.logger = "" - - self.setting_logger() - def setting_logger(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - 
name="train_model_{}".format(self.train_class), - train_class=self.train_class, - mode="a", - level=self.log_level).setup_logger() def create_df_tracks(self): """ @@ -219,7 +208,7 @@ def create_df_tracks(self): list_path_tracks=tracks_existing_path_list, config=self.config, exports_path=self.exports_path, - log_level=self.log_level, + logger=self.logger, ).create_low_level_df() self.y = self.df_tracks[self.train_class].values diff --git a/acousticbrainz/models/sklearn/transformation/load_low_level.py b/acousticbrainz/models/sklearn/transformation/load_low_level.py index 3a0ad3ab0..51f319db8 100644 --- a/acousticbrainz/models/sklearn/transformation/load_low_level.py +++ b/acousticbrainz/models/sklearn/transformation/load_low_level.py @@ -2,7 +2,6 @@ import json import pandas as pd from ..transformation.utils_preprocessing import flatten_dict_full -from ..helper_functions.logging_tool import LoggerSetup class FeaturesDf: @@ -12,29 +11,18 @@ class FeaturesDf: df_tracks (Pandas DataFrame): The tracks DataFrame that contains the track name, track low-level path, label, etc. """ - def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path, log_level): + def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path, logger): self.df_tracks = df_tracks self.train_class = train_class self.list_path_tracks = list_path_tracks self.config = config self.exports_path = exports_path - self.log_level = log_level + self.logger = logger self.list_feats_tracks = [] self.counter_items_transformed = 0 self.df_feats_tracks = pd.DataFrame() self.df_feats_label = pd.DataFrame() - self.logger = "" - - self.setting_logger() - - def setting_logger(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="train_model_{}".format(self.train_class), - train_class=self.train_class, - mode="a", - level=self.log_level).setup_logger() def create_low_level_df(self): """ diff --git a/acousticbrainz/models/sklearn/transformation/transform.py b/acousticbrainz/models/sklearn/transformation/transform.py index 33b3a3e41..2396bcd97 100644 --- a/acousticbrainz/models/sklearn/transformation/transform.py +++ b/acousticbrainz/models/sklearn/transformation/transform.py @@ -11,7 +11,6 @@ from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, QuantileTransformer from sklearn.pipeline import FeatureUnion from sklearn.pipeline import Pipeline -from ..helper_functions.logging_tool import LoggerSetup # avoid the module's method call deprecation @@ -22,13 +21,13 @@ class Transform: - def __init__(self, config, df_feats, process, train_class, exports_path, log_level): + def __init__(self, config, df_feats, process, train_class, exports_path, logger): self.config = config self.df_feats = df_feats self.process = process self.train_class = train_class self.exports_path = exports_path - self.log_level = log_level + self.logger = logger self.list_features = [] self.feats_cat_list = [] @@ -37,16 +36,7 @@ def __init__(self, config, df_feats, process, train_class, exports_path, log_lev self.df_num = pd.DataFrame() self.feats_prepared = [] - self.logger = "" - self.setting_logger() - - def setting_logger(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="train_model_{}".format(self.train_class), - train_class=self.train_class, - mode="a", - level=self.log_level).setup_logger() + def post_processing(self): print(colored("PROCESS: {}".format(self.process), "cyan")) diff --git 
a/acousticbrainz/models/sklearn/transformation/transform_predictions.py b/acousticbrainz/models/sklearn/transformation/transform_predictions.py index c24a567aa..c14402fca 100644 --- a/acousticbrainz/models/sklearn/transformation/transform_predictions.py +++ b/acousticbrainz/models/sklearn/transformation/transform_predictions.py @@ -8,7 +8,6 @@ from sklearn.base import BaseEstimator, TransformerMixin from ..transformation.utils_preprocessing import list_descr_handler from ..transformation.utils_preprocessing import feats_selector_list -from ..helper_functions.logging_tool import LoggerSetup # avoid the module's method call deprecation try: @@ -18,30 +17,19 @@ class TransformPredictions: - def __init__(self, config, df_feats, process, train_class, exports_path, log_level): + def __init__(self, config, df_feats, process, train_class, exports_path, logger): self.config = config self.df_feats = df_feats self.process = process self.train_class = train_class self.exports_path = exports_path - self.log_level = log_level - - self.logger = "" + self.logger = logger self.list_features = [] self.feats_cat_list = [] self.feats_num_list = [] self.feats_prepared = [] - self.setting_logger() - - def setting_logger(self): - self.logger = LoggerSetup(config=self.config, - exports_path=self.exports_path, - name="predict_{}".format(self.train_class), - train_class=self.train_class, - mode="a", - level=self.log_level).setup_logger() def post_processing(self): print(colored("PROCESS: {}".format(self.process), "cyan")) From baaf96a03177a156fedbe46f0afaef07027a3c01 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Tue, 6 Jul 2021 20:15:23 +0530 Subject: [PATCH 54/64] Move sklearn argparse scripts to click --- .../sklearn/model/classification_project.py | 63 --------------- .../models/sklearn/model/predict.py | 38 +-------- sklearn_manage.py | 77 +++++++++++++++++++ 3 files changed, 78 insertions(+), 100 deletions(-) create mode 100644 sklearn_manage.py diff --git a/acousticbrainz/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py index 8e4c583b4..6ac5bb887 100644 --- a/acousticbrainz/models/sklearn/model/classification_project.py +++ b/acousticbrainz/models/sklearn/model/classification_project.py @@ -1,5 +1,4 @@ import os -import argparse from ..helper_functions.utils import load_yaml import time from ..classification.train_class import train_class @@ -62,65 +61,3 @@ def create_classification_project(ground_truth_file, dataset_dir, project_file=N print("Loading GroundTruth yaml file:", ground_truth_file) train_class(project_template, ground_truth_file, c_values, gamma_values, preprocessing_values, logging) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Generates a project configuration file given a filelist, a groundtruth file, ' - 'and the directories to store the datasets and the results files. ' - 'The script has a parameter to specify the project template to use. ' - 'If it is not specified, it will try to guess the appropriated one from the ' - 'essentia version found on the descriptor files.') - - parser.add_argument("-g", "--groundtruth", - dest="ground_truth_file", - help="Path of the dataset's groundtruth file/s.", - required=True) - - parser.add_argument("-d", "--datasetsdir", - dest="dataset_dir", - help="Path of the main datasets dir containing .json file/s.", - required=True) - - parser.add_argument("-f", "--file", - dest="project_file", - help="Name of the project configuration file (.yaml) will be stored. 
If not specified " - "it takes automatically the name .") - - parser.add_argument("-p", "--path", - dest="exports_path", - help="Path where the project results will be stored. If empty, the results will be saved in " - "the main app directory.") - - parser.add_argument("-s", "--seed", - default=None, - help="Seed is used to generate the random shuffled dataset applied later to folding.", - type=int) - - parser.add_argument("-j", "--jobs", - default=-1, - help="Parallel jobs. Set to -1 to use all the available cores", - type=int) - - parser.add_argument("-v", "--verbose", - default=1, - help="Controls the verbosity: the higher, the more messages.", - type=int) - - parser.add_argument("-l", "--logging", - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - help="The logging level that will be printed DEBUG, INFO, WARNING, " - "ERROR, CRITICAL", - type=str) - - args = parser.parse_args() - - create_classification_project(ground_truth_file=args.ground_truth_file, - dataset_dir=args.dataset_dir, - project_file=args.project_file, - exports_path=args.exports_path, - seed=args.seed, - jobs=args.jobs, - verbose=args.verbose, - logging=args.logging) diff --git a/acousticbrainz/models/sklearn/model/predict.py b/acousticbrainz/models/sklearn/model/predict.py index 46c750587..0e9d568fe 100644 --- a/acousticbrainz/models/sklearn/model/predict.py +++ b/acousticbrainz/models/sklearn/model/predict.py @@ -107,7 +107,7 @@ def preprocessing(self): return predict_list -def prediction(exports_path, project_file, mbid, log_level="logging.INFO"): +def prediction(exports_path, project_file, mbid, log_level="INFO"): # if empty, path is declared as the app's main directory try: project_data = load_yaml(exports_path, "{}.yaml".format(project_file)) @@ -130,39 +130,3 @@ def prediction(exports_path, project_file, mbid, log_level="logging.INFO"): log_level=log_level ) prediction_track.preprocessing() - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='Prediction of a track.') - - parser.add_argument("-p", "--path", - dest="exports_path", - help="Path where the project file (.yaml) is stored.", - required=True) - - parser.add_argument("-f", "--file", - dest="project_file", - help="Name of the project configuration file (.yaml) that is to be loaded. The .yaml at the" - "end of the file is not necessary. 
Just put the name of the file.", - required=True) - - parser.add_argument("-t", "--track", - dest="mbid", - help="MBID of the the low-level data from the AcousticBrainz API.", - required=True) - - parser.add_argument("-l", "--logging", - dest="log_level", - default="logging.INFO", - help="The logging level that will be printed logging.DEBUG, logging.INFO, logging.WARNING, " - "logging.ERROR, logging.CRITICAL).", - type=str) - - args = parser.parse_args() - - prediction(exports_path=args.exports_path, - project_file=args.project_file, - mbid=args.mbid, - log_level=args.log_level) diff --git a/sklearn_manage.py b/sklearn_manage.py new file mode 100644 index 000000000..9593d2f13 --- /dev/null +++ b/sklearn_manage.py @@ -0,0 +1,77 @@ +import click + +from acousticbrainz.models.sklearn.model.classification_project import create_classification_project +from acousticbrainz.models.sklearn.model.predict import prediction + +cli = click.Group() + +@cli.command(name="classification_project") +@click.option("--ground-truth-file", "-g", + help="Path of the dataset's groundtruth file/s.", required=True) +@click.option("--low-level-dir", "-d", required=True, + help="Path of the main datasets dir containing .json file/s.") +@click.option("--project-file", "-f", + help="Name of the project configuration file (.yaml) will be stored. If " + "not specified it takes automatically the name .") +@click.option("--export-path", "-o", + help="Path where the project results will be stored. If empty, the results " + "will be saved in the main app directory.") +@click.option("--seed", "-s", type=int, default=None, + help="Seed is used to generate the random shuffled dataset applied " + "later to folding.") +@click.option("--jobs", "-j", default=-1, type=int, + help="Parallel jobs. Set to -1 to use all the available cores") +@click.option("--verbose", "-v", default=1, type=int, + help="Controls the verbosity: the higher, the more messages.") +@click.option("--logging", "-l", default="INFO", + type=click.Choice( + ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + case_sensitive=False + ), help="The logging level that will be printed") +def classification_project(ground_truth_file, low_level_dir, project_file, export_path, + seed, jobs, verbose, logging): + """ Generates a project configuration file given a filelist, a groundtruth file, + and the directories to store the datasets and the results files. The script has + a parameter to specify the project template to use. If it is not specified, it + will try to guess the appropriated one from the essentia version found on the + descriptor files. + """ + create_classification_project( + ground_truth_file=ground_truth_file, + dataset_dir=low_level_dir, + project_file=project_file, + exports_path=export_path, + seed=seed, + jobs=jobs, + verbose=verbose, + logging=logging + ) + + +@cli.command(name="predict") +@click.option("--project-file", "-f", required=True, + help="Name of the project configuration file (.yaml) that is to be loaded. " + "The .yaml at the end of the file is not necessary. Just put the name " + "of the file.") +@click.option("--export-path", "-o", + help="Path where the project results will be stored. 
If empty, the results " + "will be saved in the main app directory.") +@click.option("--track", "-t", required=True, + help="MBID of the the low-level data from the AcousticBrainz API.") +@click.option("--logging", "-l", default="INFO", + type=click.Choice( + ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + case_sensitive=False + ), help="The logging level that will be printed") +def predict(project_file, export_path, track, logging): + """ Prediction of a track. """ + prediction( + exports_path=export_path, + project_file=project_file, + mbid=track, + log_level=logging + ) + + +if __name__ == '__main__': + cli() From 581b713f9eebb4fe6d5bd46992188bc2cec12ae7 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Tue, 6 Jul 2021 22:03:15 +0530 Subject: [PATCH 55/64] Configure hierarchical acousticbrainz.models logger --- acousticbrainz/models/__init__.py | 10 +++- .../classification/classification_task.py | 19 +++--- .../classification_task_manager.py | 27 ++++----- .../sklearn/classification/classifier_grid.py | 30 +++++----- .../sklearn/classification/evaluation.py | 52 +++++++---------- .../sklearn/classification/train_class.py | 7 +-- .../sklearn/helper_functions/logging_tool.py | 25 +++----- .../models/sklearn/model/predict.py | 3 +- .../transformation/load_ground_truth.py | 58 ++++++++++--------- .../sklearn/transformation/load_low_level.py | 31 +++++----- .../sklearn/transformation/transform.py | 47 ++++++++------- .../transformation/transform_predictions.py | 6 +- 12 files changed, 156 insertions(+), 159 deletions(-) diff --git a/acousticbrainz/models/__init__.py b/acousticbrainz/models/__init__.py index 40a96afc6..22518ae43 100644 --- a/acousticbrainz/models/__init__.py +++ b/acousticbrainz/models/__init__.py @@ -1 +1,9 @@ -# -*- coding: utf-8 -*- +import logging + +ACOUSTICBRAINZ_SKLEARN_LOGGER = "acousticbrainz.models" +_logger = logging.getLogger(ACOUSTICBRAINZ_SKLEARN_LOGGER) +_handler = logging.StreamHandler() +_formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s') +_handler.setFormatter(_formatter) +_logger.addHandler(_handler) +_logger.setLevel(logging.INFO) diff --git a/acousticbrainz/models/sklearn/classification/classification_task.py b/acousticbrainz/models/sklearn/classification/classification_task.py index 455230a92..fa42db0b1 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task.py +++ b/acousticbrainz/models/sklearn/classification/classification_task.py @@ -1,9 +1,13 @@ +import logging import os import json from ..classification.classifier_grid import TrainGridClassifier from ..classification.evaluation import evaluation +logger = logging.getLogger(__name__) + + class ClassificationTask: """ This class is the core of the model classification. It loads the relevant classifier to @@ -11,7 +15,7 @@ class ClassificationTask: to the configuration file declared class to train the model and then it uses that model for evaluation. """ - def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks, logger): + def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks): """ Args: config: The configuration data that contain the settings from the configuration @@ -27,7 +31,6 @@ def __init__(self, config, classifier, train_class, training_processes, X, y, ex y: The labels (NumPy array) of the target class exports_path: Path to where the classification project's results will be stored to. 
tracks: The tracks (numpy.ndarray) that are exported from the Groundtruth file. - log_level: The logging level (0-4). """ self.config = config self.classifier = classifier @@ -38,13 +41,12 @@ def __init__(self, config, classifier, train_class, training_processes, X, y, ex self.training_processes = training_processes self.exports_path = exports_path self.tracks = tracks - self.logger = logger def run(self): # grid search train if self.config["train_kind"] == "grid": - self.logger.info("Train Classifier: Classifier with GridSearchCV") + logger.info("Train Classifier: Classifier with GridSearchCV") grid_svm_train = TrainGridClassifier(config=self.config, classifier=self.classifier, class_name=self.train_class, @@ -57,15 +59,15 @@ def run(self): grid_svm_train.train_grid_search_clf() grid_svm_train.export_best_classifier() else: - self.logger.error("Use a valid classifier in the configuration file.") - self.logger.info("Training the classifier is completed successfully.") + logger.error("Use a valid classifier in the configuration file.") + logger.info("Training the classifier is completed successfully.") # load best model to check its parameters - self.logger.debug("Loading the Best Model..") + logger.debug("Loading the Best Model..") best_model_name = "best_model_{}.json".format(self.train_class) with open(os.path.join(self.exports_path, best_model_name)) as best_model_file: best_model = json.load(best_model_file) - self.logger.debug("BEST MODEL: {}".format(best_model)) + logger.debug("BEST MODEL: {}".format(best_model)) # evaluation evaluation(config=self.config, @@ -75,5 +77,4 @@ def run(self): tracks=self.tracks, process=best_model["preprocessing"], exports_path=self.exports_path, - logger=self.logger ) diff --git a/acousticbrainz/models/sklearn/classification/classification_task_manager.py b/acousticbrainz/models/sklearn/classification/classification_task_manager.py index d5c49ee98..473527368 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task_manager.py +++ b/acousticbrainz/models/sklearn/classification/classification_task_manager.py @@ -1,3 +1,4 @@ +import logging import os from time import time from termcolor import colored @@ -7,6 +8,8 @@ from ..classification.classification_task import ClassificationTask +logger = logging.getLogger(__name__) + validClassifiers = ["svm", "NN"] validEvaluations = ["nfoldcrossvalidation"] @@ -20,7 +23,7 @@ class ClassificationTaskManager: with their corresponding preprocessing steps and parameters declaration for the classifier, and executes the classification task for each step. """ - def __init__(self, config, train_class, X, y, tracks, exports_path, logger): + def __init__(self, config, train_class, X, y, tracks, exports_path): """ Args: config: The configuration file name. @@ -34,7 +37,6 @@ def __init__(self, config, train_class, X, y, tracks, exports_path, logger): self.y = y self.tracks = tracks self.exports_path = exports_path - self.logger = logger self.results_path = "" self.logs_path = "" @@ -72,25 +74,25 @@ def config_file_analysis(self): """ Check the keys of the configuration template file if they are set up correctly. 
""" - self.logger.info("---- CHECK FOR INAPPROPRIATE CONFIG FILE FORMAT ----") + logger.info("---- CHECK FOR INAPPROPRIATE CONFIG FILE FORMAT ----") if "processing" not in self.config: - self.logger.error("No preprocessing defined in config.") + logger.error("No preprocessing defined in config.") if "evaluations" not in self.config: - self.logger.error("No evaluations defined in config.") - self.logger.error("Setting default evaluation to 10-fold cross-validation") + logger.error("No evaluations defined in config.") + logger.error("Setting default evaluation to 10-fold cross-validation") self.config["evaluations"] = {"nfoldcrossvalidation": [{"nfold": [10]}]} for classifier in self.config['classifiers'].keys(): if classifier not in validClassifiers: - self.logger.error("Not a valid classifier: {}".format(classifier)) + logger.error("Not a valid classifier: {}".format(classifier)) raise ValueError("The classifier name must be valid.") for evaluation in self.config['evaluations'].keys(): if evaluation not in validEvaluations: - self.logger.error("Not a valid evaluation: {}".format(evaluation)) + logger.error("Not a valid evaluation: {}".format(evaluation)) raise ValueError("The evaluation must be valid.") - self.logger.info("No errors in config file format found.") + logger.info("No errors in config file format found.") def apply_processing(self): """ @@ -98,7 +100,7 @@ def apply_processing(self): """ start_time = time() training_processes = TrainingProcesses(self.config).training_processes() - self.logger.info("Classifiers detected: {}".format(self.config["classifiers"].keys())) + logger.info("Classifiers detected: {}".format(self.config["classifiers"].keys())) for classifier in self.config["classifiers"].keys(): print("Before Classification task: ", classifier) task = ClassificationTask(config=self.config, @@ -109,18 +111,17 @@ def apply_processing(self): y=self.y, exports_path=self.exports_path, tracks=self.tracks, - logger=self.logger ) try: task.run() except Exception as e: - self.logger.error('Running task failed: {}'.format(e)) + logger.error('Running task failed: {}'.format(e)) print(colored('Running task failed: {}'.format(e), "red")) end_time = time() print() print(colored("Last evaluation took place at: {}".format(datetime.now()), "magenta")) - self.logger.info("Last evaluation took place at: {}".format(datetime.now())) + logger.info("Last evaluation took place at: {}".format(datetime.now())) # test duration time_duration = end_time - start_time diff --git a/acousticbrainz/models/sklearn/classification/classifier_grid.py b/acousticbrainz/models/sklearn/classification/classifier_grid.py index 5c36ded43..553e0189a 100644 --- a/acousticbrainz/models/sklearn/classification/classifier_grid.py +++ b/acousticbrainz/models/sklearn/classification/classifier_grid.py @@ -1,3 +1,4 @@ +import logging import os import json from termcolor import colored @@ -9,8 +10,11 @@ from ..transformation.transform import Transform +logger = logging.getLogger(__name__) + + class TrainGridClassifier: - def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, logger): + def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path): self.config = config self.classifier = classifier self.class_name = class_name @@ -19,7 +23,6 @@ def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_p self.tr_processes = tr_processes self.exports_path = exports_path - self.logger = logger self.best_models_list = [] # self.train_grid_search_clf() @@ -28,7 
+31,7 @@ def train_grid_search_clf(self): process_counter = 1 for tr_process in self.tr_processes: print(colored("Train process {} - {}".format(process_counter, tr_process), "green")) - self.logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process)) + logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process)) # initiate SVM classifier object if self.classifier == "svm": grid_clf = SVC(gamma="auto", probability=True) @@ -42,16 +45,14 @@ def train_grid_search_clf(self): df_feats=self.X, process=tr_process["preprocess"], train_class=self.class_name, - exports_path=self.exports_path, - logger=self.logger).post_processing() + exports_path=self.exports_path).post_processing() # train the grid classifier and return the trained model gsvc = train_grid(tr_process=tr_process, grid_clf=grid_clf, features_prepared=features_prepared, y=self.y, - config=self.config, - logger=self.logger) + config=self.config) # save best results for each train process # paths declaration for saving the grid training results @@ -65,8 +66,7 @@ def train_grid_search_clf(self): class_name=self.class_name, tr_process=tr_process, results_path=results_path, - best_process_model_path=best_process_model_path, - logger=self.logger) + best_process_model_path=best_process_model_path) # return a list that includes the best models exported from each processing self.best_models_list.append(results_dict) @@ -81,21 +81,21 @@ def train_grid_search_clf(self): def export_best_classifier(self): # Gather the best scores from the exported grid clf models scores = [x["score"] for x in self.best_models_list] - self.logger.info("This is the max score of all the training processes: {}".format(max(scores))) + logger.info("This is the max score of all the training processes: {}".format(max(scores))) for model in self.best_models_list: if model["score"] == max(scores): - self.logger.info("Best {} model parameters:".format(self.class_name)) + logger.info("Best {} model parameters:".format(self.class_name)) # log2 --> convert values to initial parameters' values # model["params"]["C"] = math.log2(model["params"]["C"]) # model["params"]["gamma"] = math.log2(model["params"]["gamma"]) - self.logger.info("{}".format(model)) + logger.info("{}".format(model)) best_model_name = "best_model_{}.json".format(self.class_name) with open(os.path.join(self.exports_path, best_model_name), "w") as best_model: json.dump(model, best_model, indent=4) - self.logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) + logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) -def train_grid(tr_process, grid_clf, features_prepared, y, config, logger): +def train_grid(tr_process, grid_clf, features_prepared, y, config): # define the length of parameters parameters_grid = {'kernel': tr_process["kernel"], 'C': tr_process["C"], @@ -136,7 +136,7 @@ def train_grid(tr_process, grid_clf, features_prepared, y, config, logger): return gsvc -def save_grid_results(gsvc, class_name, tr_process, results_path, best_process_model_path, logger): +def save_grid_results(gsvc, class_name, tr_process, results_path, best_process_model_path): results_best_dict_name = "result_{}_{}_best_{}.json" \ .format(class_name, tr_process["preprocess"], gsvc.best_score_) diff --git a/acousticbrainz/models/sklearn/classification/evaluation.py b/acousticbrainz/models/sklearn/classification/evaluation.py index a26f5bf88..a9a2820dd 100644 --- 
a/acousticbrainz/models/sklearn/classification/evaluation.py +++ b/acousticbrainz/models/sklearn/classification/evaluation.py @@ -1,3 +1,4 @@ +import logging import os import json import numpy as np @@ -16,7 +17,10 @@ from ..classification.matrix_creation import matrix_creation, simplified_matrix_export -def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, logger): +logger = logging.getLogger(__name__) + + +def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): print(colored("------ EVALUATION and FOLDING ------", "yellow")) logger.info("---- Folded evaluation of the model in the dataset ----") @@ -58,8 +62,7 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, df_feats=X, process=process, train_class=class_name, - exports_path=exports_path, - logger=logger).post_processing() + exports_path=exports_path).post_processing() logger.debug("Features prepared shape: {}".format(features_prepared.shape)) # Starting Training, Predictions for each fold @@ -69,14 +72,12 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, feats_prepared=features_prepared, y=y, tracks=tracks, - class_name=class_name, - logger=logger) + class_name=class_name) # concatenate the folded predictions DFs df_predictions = create_dataset_predictions(list_df_predictions=predictions_df_list, class_name=class_name, - dataset_path=dataset_path, - logger=logger) + dataset_path=dataset_path) logger.debug("PRINT THE WHOLE GESTURES DF:\n{}".format(df_predictions)) @@ -99,19 +100,16 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, config=config, class_name=class_name, exports_path=exports_path, - images_path=images_path, - logger=logger) + images_path=images_path) # Folded Tracks Dictionary --> export also the Folded instances dictionary folded_instances_dict = export_folded_instances(tracks_fold_indexing_dict=tracks_fold_indexing_dict, class_name=class_name, - dataset_path=dataset_path, - logger=logger) + dataset_path=dataset_path) concat_save_model_instances_matrix_json(instances_dict=folded_instances_dict, cm_dict=folded_matrix_dict, exports_path=exports_path, - logger=logger, export_name="folded_dataset_instances_cm.json") simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", @@ -129,7 +127,6 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, predictions=df_predictions["predictions"], class_name=class_name, exports_path=exports_path, - logger=logger ) # ---------- TRAIN TO THE WHOLE DATASET WITH THE BEST CLASSIFIER ---------- @@ -162,7 +159,6 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, concat_save_model_instances_matrix_json(instances_dict=None, cm_dict=whole_matrix_dict, exports_path=exports_path, - logger=logger, export_name="whole_dataset_instances_cm.json") # Evaluation to the whole Dataset @@ -172,11 +168,10 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, predictions=predictions_all, class_name=class_name, exports_path=exports_path, - logger=logger ) -def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, logger, export_name): +def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, export_name): """ Save the best model's folded instances and confusion matrix dictionary merged into one dictionary @@ -184,7 +179,6 @@ def concat_save_model_instances_matrix_json(instances_dict, 
cm_dict, exports_pat instances_dict: cm_dict: exports_path: - logger: export_name: Returns: @@ -206,7 +200,7 @@ def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_pat logger.info("Whole folded instaces and matrix dictionary stored successfully.") -def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logger): +def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name): """ Args: @@ -216,7 +210,6 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logge y: the true values tracks: class_name: - logger: Returns: tracks_fold_indexing_dict: @@ -252,8 +245,7 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logge X_test=X_test, test_index=test_index, tracks_list=tracks_list, - y_test=y_test, - logger=logger) + y_test=y_test) # Append the folded dataset to a list that will contain all the folded datasets predictions_df_list.append(df_pred_general) # Append each accuracy of the folded model to a list that contains all the accuracies resulted from each fold @@ -263,7 +255,7 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logge return predictions_df_list, accuracy_model, tracks_fold_indexing_dict -def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test, logger): +def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test): """ Creates a pandas DataFrame from each fold with the predictions in order later to extract the shuffled dataset with the tracks, the percentage @@ -277,7 +269,6 @@ def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_ test_index: tracks_list: y_test: - logger: Returns: A pandas DataFrame with the predictions at each fold. 
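For context on the hunks above: predictions_fold() fits the classifier once per cross-validation split, create_fold_predictions() turns each split's test predictions into a per-fold DataFrame, and those frames are later concatenated into one folded dataset. A minimal, self-contained sketch of that collect-per-fold pattern follows; the toy data and the simplified column names are illustrative, not the project's exact code.

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.model_selection import KFold
    from sklearn.svm import SVC

    # Toy stand-ins for the prepared features and labels (X, y).
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    clf = SVC(gamma="auto", probability=True)  # same estimator the grid search trains
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=0)

    fold_frames, accuracy_model = [], []
    for fold, (train_index, test_index) in enumerate(inner_cv.split(X)):
        clf.fit(X[train_index], y[train_index])
        predictions = clf.predict(X[test_index])
        accuracy_model.append((predictions == y[test_index]).mean())
        # one DataFrame per fold, concatenated afterwards
        fold_frames.append(pd.DataFrame({
            "track_index": test_index,
            "fold": fold,
            "true": y[test_index],
            "predictions": predictions,
        }))

    df_predictions = pd.concat(fold_frames, ignore_index=True)
    print(df_predictions.head())
    print("mean fold accuracy:", np.mean(accuracy_model))

Keeping the per-fold frames separate until the final concatenation is what lets the evaluation code report both the accuracy of every fold and a single shuffled dataset of predictions.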
@@ -303,7 +294,7 @@ def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_ return df_pred_general -def export_accuracies(accuracy_model, config, class_name, exports_path, images_path, logger): +def export_accuracies(accuracy_model, config, class_name, exports_path, images_path): """ Args: @@ -312,7 +303,6 @@ def export_accuracies(accuracy_model, config, class_name, exports_path, images_p class_name: exports_path: images_path: - logger: Returns: @@ -331,17 +321,15 @@ def export_accuracies(accuracy_model, config, class_name, exports_path, images_p # Visualize accuracy for each iteration in a distribution plot create_accuracies_dist_plot(accuracies_list=accuracy_model, - images_path=images_path, - logger=logger) + images_path=images_path) -def create_dataset_predictions(list_df_predictions, class_name, dataset_path, logger): +def create_dataset_predictions(list_df_predictions, class_name, dataset_path): """ Args: list_df_predictions: class_name: dataset_path: - logger: Returns: @@ -358,7 +346,7 @@ def create_dataset_predictions(list_df_predictions, class_name, dataset_path, lo return df_concat_predictions -def create_accuracies_dist_plot(accuracies_list, images_path, logger): +def create_accuracies_dist_plot(accuracies_list, images_path): logger.info("Visualize accuracy for each iteration.") list_folds = [] counter_folds = 0 @@ -375,7 +363,7 @@ def create_accuracies_dist_plot(accuracies_list, images_path, logger): logger.info("Plot saved successfully.") -def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path, logger): +def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path): logger.info("Writing Folded Tracks Dictionary locally to check where each track is folded..") logger.debug("length of keys: {}".format(len(tracks_fold_indexing_dict.keys()))) fold_dict = {"fold": tracks_fold_indexing_dict} @@ -397,7 +385,7 @@ def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path, return fold_dict -def export_evaluation_results(config, set_name, y_true_values, predictions, class_name, exports_path, logger): +def export_evaluation_results(config, set_name, y_true_values, predictions, class_name, exports_path): logger.info("---- Evaluation to the {} dataset ----".format(set_name)) # Confusion Matrix logger.info("Exporting Confusion Matrix applied to the {} dataset..".format(set_name)) diff --git a/acousticbrainz/models/sklearn/classification/train_class.py b/acousticbrainz/models/sklearn/classification/train_class.py index 8e92a0f1f..2be0d6c04 100644 --- a/acousticbrainz/models/sklearn/classification/train_class.py +++ b/acousticbrainz/models/sklearn/classification/train_class.py @@ -25,8 +25,7 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l logger = setup_logger( exports_path=exports_path, - name="train_model_{}".format(class_name), - mode="w", + file_name="train_model_{}".format(class_name), level=log_level ) @@ -52,7 +51,6 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l tracks_list=tracks_listed_shuffled, train_class=class_name, exports_path=exports_path, - logger=logger ).create_df_tracks() logger.debug("Types of exported files from GT:") logger.debug("Type of features: {}".format(type(features))) @@ -64,8 +62,7 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l X=features, y=labels, tracks=tracks, - exports_path=exports_path, - logger=logger) + exports_path=exports_path) classification_time = 
model_manage.apply_processing() print(colored("Classification ended successfully in {} minutes.".format(classification_time), "green")) logger.info("Classification ended successfully in {} minutes.".format(classification_time)) diff --git a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py index 0c764de0e..33b46b3df 100644 --- a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py +++ b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py @@ -8,45 +8,36 @@ import logging import os +from acousticbrainz.models import ACOUSTICBRAINZ_SKLEARN_LOGGER from acousticbrainz.models.sklearn.helper_functions.utils import create_directory -def setup_logger(exports_path, name, mode, level=logging.INFO): +def setup_logger(exports_path, file_name, mode="w", level=logging.INFO): """ Function to set up as many loggers as you want. It exports the logging results to a file in the relevant path that is determined by the configuration file. Args: exports_path: The path (str) the logging exports will be exported. - name: The name (str) of the logger. + file_name: The name (str) of the logger. level: The level (int) of the logging. Defaults to logging.INFO. mode: The mode (str) translated in write, append. Valid values ("w", "a") Returns: The logger object. """ + logger = logging.getLogger(ACOUSTICBRAINZ_SKLEARN_LOGGER) logs_path = create_directory(exports_path, "logs") - # Create a custom logger - logger = logging.getLogger(name) - - # Create handlers - c_handler = logging.StreamHandler() - f_handler = logging.FileHandler(os.path.join(logs_path, "{}.log".format(name)), mode=mode) - # Create formatters and add it to handlers - c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s') + f_handler = logging.FileHandler(os.path.join(logs_path, "{}.log".format(file_name)), mode=mode) f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - c_handler.setFormatter(c_format) f_handler.setFormatter(f_format) - # if handlers are already present and if so, clear them before adding new handlers. 
This is pretty convenient - # when debugging and the code includes the logger initialization - if logger.hasHandlers(): - logger.handlers.clear() + # remove existing file handlers if any + logger.handlers = [handler for handler in logger.handlers if not isinstance(handler, logging.FileHandler)] - # Add handlers to the logger - logger.addHandler(c_handler) + # Add current file handler to the logger logger.addHandler(f_handler) if level is None: diff --git a/acousticbrainz/models/sklearn/model/predict.py b/acousticbrainz/models/sklearn/model/predict.py index 0e9d568fe..a23473f64 100644 --- a/acousticbrainz/models/sklearn/model/predict.py +++ b/acousticbrainz/models/sklearn/model/predict.py @@ -43,8 +43,7 @@ def load_best_model(self): def preprocessing(self): self.logger = setup_logger( exports_path=self.exports_path, - name="predict_{}".format(self.class_name), - mode="w", + file_name="predict_{}".format(self.class_name), level=self.log_level ) diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index 78922830f..76de2f595 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -1,3 +1,4 @@ +import logging import os import yaml import pandas as pd @@ -8,6 +9,9 @@ from ..transformation.load_low_level import FeaturesDf +logger = logging.getLogger(__name__) + + class GroundTruthLoad: """ The Ground Truth data which contains the tracks and the corresponding @@ -27,7 +31,7 @@ def __init__(self, config, gt_filename, exports_path, log_level): self.exports_path = exports_path self.log_level = log_level - self.logger = "" + logger = "" self.class_dir = "" self.ground_truth_data = {} self.labeled_tracks = {} @@ -131,12 +135,11 @@ class DatasetExporter: """ TODO: Description """ - def __init__(self, config, tracks_list, train_class, exports_path, logger): + def __init__(self, config, tracks_list, train_class, exports_path): self.config = config self.tracks_list = tracks_list self.train_class = train_class self.exports_path = exports_path - self.logger = logger self.dataset_dir = "" self.class_dir = "" @@ -152,7 +155,7 @@ def create_df_tracks(self): TODO: Description """ - self.logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----") + logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----") self.dataset_dir = self.config.get("dataset_dir") print('DATASET-DIR', self.dataset_dir) dirpath = os.path.join(os.getcwd(), self.dataset_dir) @@ -160,47 +163,47 @@ def create_df_tracks(self): for (dirpath, dirnames, filenames) in os.walk(dirpath): low_level_list += [os.path.join(dirpath, file) for file in filenames if file.endswith(".json")] if len(low_level_list) != 0: - self.logger.info("Low-level features for the tracks found.") + logger.info("Low-level features for the tracks found.") # processing the names of the tracks that are inside both the GT file and the low-level json files # list with the tracks that are included in the low-level json files tracks_existing_list = [e for e in self.tracks_list for i in low_level_list if e[0] in i] # list with the low-level json tracks' paths that are included in tracks list tracks_existing_path_list = [i for e in self.tracks_list for i in low_level_list if e[0] in i] - self.logger.debug("tracks existed found: {}".format(len(tracks_existing_list))) - self.logger.debug("tracks_path existed found: {}".format(len(tracks_existing_path_list))) - 
self.logger.debug("{}".format(tracks_existing_list[:4])) - self.logger.debug("{}".format(tracks_existing_path_list[:4])) - self.logger.debug("The founded tracks tracks listed successfully.") - self.logger.debug("Generate random number within a given range of listed tracks:") + logger.debug("tracks existed found: {}".format(len(tracks_existing_list))) + logger.debug("tracks_path existed found: {}".format(len(tracks_existing_path_list))) + logger.debug("{}".format(tracks_existing_list[:4])) + logger.debug("{}".format(tracks_existing_path_list[:4])) + logger.debug("The founded tracks tracks listed successfully.") + logger.debug("Generate random number within a given range of listed tracks:") # Random number between 0 and length of listed tracks random_num = random.randrange(len(tracks_existing_list)) - self.logger.debug("Check if the tracks are the same in the same random index in both lists") - self.logger.debug("{}".format(tracks_existing_list[random_num])) - self.logger.debug("{}".format(tracks_existing_path_list[random_num])) + logger.debug("Check if the tracks are the same in the same random index in both lists") + logger.debug("{}".format(tracks_existing_list[random_num])) + logger.debug("{}".format(tracks_existing_path_list[random_num])) self.tracks_list = tracks_existing_list # create the dataframe with tracks that are bothe in low-level files and the GT file self.df_tracks = pd.DataFrame(data=self.tracks_list, columns=["track", self.train_class]) - self.logger.debug("Shape of tracks DF created before cleaning: {}".format(self.df_tracks.shape)) - self.logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:") - self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) + logger.debug("Shape of tracks DF created before cleaning: {}".format(self.df_tracks.shape)) + logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:") + logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) - self.logger.debug("Drop rows with NULL values if they exist..") + logger.debug("Drop rows with NULL values if they exist..") if self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape[0] != 0: self.df_tracks.dropna(inplace=True) - self.logger.debug("Check if there are NULL values after the cleaning process:") - self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) - self.logger.debug("Re-index the tracks DF..") + logger.debug("Check if there are NULL values after the cleaning process:") + logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) + logger.debug("Re-index the tracks DF..") self.df_tracks = self.df_tracks.reset_index(drop=True) else: - self.logger.info("There are no NULL values found.") + logger.info("There are no NULL values found.") # export shuffled tracks to CSV format tracks_path = create_directory(self.exports_path, "tracks_csv_format") self.df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(self.train_class))) - self.logger.debug("DF INFO:") - self.logger.debug("{}".format(self.df_tracks.info())) - self.logger.debug("COLUMNS CONTAIN OBJECTS: {}".format( + logger.debug("DF INFO:") + logger.debug("{}".format(self.df_tracks.info())) + logger.debug("COLUMNS CONTAIN OBJECTS: {}".format( self.df_tracks.select_dtypes(include=['object']).columns)) self.df_feats = FeaturesDf(df_tracks=self.df_tracks, @@ -208,12 +211,11 @@ def create_df_tracks(self): 
list_path_tracks=tracks_existing_path_list, config=self.config, exports_path=self.exports_path, - logger=self.logger, ).create_low_level_df() self.y = self.df_tracks[self.train_class].values - self.logger.info("Features, Labels, and Tracks are exported successfully..") + logger.info("Features, Labels, and Tracks are exported successfully..") return self.df_feats, self.y, self.df_tracks["track"].values else: - self.logger.error("No low-level data found.") + logger.error("No low-level data found.") return None, None, None diff --git a/acousticbrainz/models/sklearn/transformation/load_low_level.py b/acousticbrainz/models/sklearn/transformation/load_low_level.py index 51f319db8..92ac4ca66 100644 --- a/acousticbrainz/models/sklearn/transformation/load_low_level.py +++ b/acousticbrainz/models/sklearn/transformation/load_low_level.py @@ -1,9 +1,13 @@ +import logging import os import json import pandas as pd from ..transformation.utils_preprocessing import flatten_dict_full +logger = logging.getLogger(__name__) + + class FeaturesDf: """ Features DataFrame object by the JSON low-level data. @@ -11,13 +15,12 @@ class FeaturesDf: df_tracks (Pandas DataFrame): The tracks DataFrame that contains the track name, track low-level path, label, etc. """ - def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path, logger): + def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path): self.df_tracks = df_tracks self.train_class = train_class self.list_path_tracks = list_path_tracks self.config = config self.exports_path = exports_path - self.logger = logger self.list_feats_tracks = [] self.counter_items_transformed = 0 self.df_feats_tracks = pd.DataFrame() @@ -31,7 +34,7 @@ def create_low_level_df(self): Returns: The low-level features (pandas DataFrame) from all the tracks in the collection. """ - self.logger.info("---- CREATE LOW LEVEL DATAFRAME ----") + logger.info("---- CREATE LOW LEVEL DATAFRAME ----") # clear the list if it not empty self.list_feats_tracks.clear() for track_low_level_path in self.list_path_tracks: @@ -40,7 +43,7 @@ def create_low_level_df(self): data_feats_item = json.load(f, strict=False) except Exception as e: print("Exception occurred in loading file:", e) - self.logger.warning("Exception occurred in loading file: {}".format(e)) + logger.warning("Exception occurred in loading file: {}".format(e)) # remove unnecessary features data try: if 'beats_position' in data_feats_item['rhythm']: @@ -58,20 +61,20 @@ def create_low_level_df(self): # The dictionary's keys list is transformed to type self.df_feats_tracks = pd.DataFrame(self.list_feats_tracks, columns=list(self.list_feats_tracks[0].keys())) - self.logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format( + logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format( self.df_feats_tracks.select_dtypes(include=['object']).columns)) - self.logger.info("Exporting low-level data (DataFrame)..") + logger.info("Exporting low-level data (DataFrame)..") return self.df_feats_tracks def check_processing_info(self): """ Prints some information about the low-level data to DataFrame transformation step and its middle processes. 
""" - self.logger.info('Items parsed and transformed: {}'.format(self.counter_items_transformed)) + logger.info('Items parsed and transformed: {}'.format(self.counter_items_transformed)) # The type of the dictionary's keys list is: - self.logger.info('Type of the list of features keys: {}'.format(type(self.list_feats_tracks[0].keys()))) + logger.info('Type of the list of features keys: {}'.format(type(self.list_feats_tracks[0].keys()))) # The dictionary's keys list is transformed to type - self.logger.info('Confirm the type of list transformation of features keys: {}' + logger.info('Confirm the type of list transformation of features keys: {}' .format(type(list(self.list_feats_tracks[0].keys())))) def export_tracks_feats_df(self): @@ -80,12 +83,12 @@ def export_tracks_feats_df(self): The tracks (pandas DataFrame) with all the ground truth data and the corresponding low-level data flattened. """ - self.logger.info("Concatenating the tracks/labels data DataFrame with the features DataFrame.") - self.logger.info("TRACKS SHAPE: {}".format(self.df_tracks.shape)) - self.logger.info("LOW LEVEL: {}".format(self.df_feats_tracks.shape)) + logger.info("Concatenating the tracks/labels data DataFrame with the features DataFrame.") + logger.info("TRACKS SHAPE: {}".format(self.df_tracks.shape)) + logger.info("LOW LEVEL: {}".format(self.df_feats_tracks.shape)) self.df_feats_label = pd.concat([self.df_tracks, self.df_feats_tracks], axis=1) - self.logger.info("FULL: {}".format(self.df_feats_label.shape)) - self.logger.info("COLUMNS CONTAIN OBJECTS: {}" + logger.info("FULL: {}".format(self.df_feats_label.shape)) + logger.info("COLUMNS CONTAIN OBJECTS: {}" .format(self.df_feats_label.select_dtypes(include=['object']).columns)) return self.df_feats_label diff --git a/acousticbrainz/models/sklearn/transformation/transform.py b/acousticbrainz/models/sklearn/transformation/transform.py index 2396bcd97..d8e06524e 100644 --- a/acousticbrainz/models/sklearn/transformation/transform.py +++ b/acousticbrainz/models/sklearn/transformation/transform.py @@ -1,3 +1,5 @@ +import logging + import pandas as pd from termcolor import colored import collections @@ -19,15 +21,16 @@ except AttributeError: collectionsAbc = collections +logger = logging.getLogger(__name__) + class Transform: - def __init__(self, config, df_feats, process, train_class, exports_path, logger): + def __init__(self, config, df_feats, process, train_class, exports_path): self.config = config self.df_feats = df_feats self.process = process self.train_class = train_class self.exports_path = exports_path - self.logger = logger self.list_features = [] self.feats_cat_list = [] @@ -40,8 +43,8 @@ def __init__(self, config, df_feats, process, train_class, exports_path, logger) def post_processing(self): print(colored("PROCESS: {}".format(self.process), "cyan")) - self.logger.debug("PROCESS: {}".format(self.process)) - self.logger.debug("Process: {}".format(self.config["processing"][self.process])) + logger.debug("PROCESS: {}".format(self.process)) + logger.debug("Process: {}".format(self.config["processing"][self.process])) # list_preprocesses = [] self.list_features = list(self.df_feats.columns) @@ -50,34 +53,34 @@ def post_processing(self): # clean list print(colored("Cleaning..", "yellow")) - self.logger.info("Cleaning..") + logger.info("Cleaning..") cleaning_conf_list = list_descr_handler(self.config["excludedDescriptors"]) feats_clean_list = feats_selector_list(self.df_feats.columns, cleaning_conf_list) self.list_features = [x for x in self.df_feats.columns 
if x not in feats_clean_list] - self.logger.debug("List after cleaning some feats: {}".format(len(self.list_features))) + logger.debug("List after cleaning some feats: {}".format(len(self.list_features))) # remove list print(colored("Removing unnecessary features..", "yellow")) - self.logger.info("Removing unnecessary features..") + logger.info("Removing unnecessary features..") if self.config["processing"][self.process][0]["transfo"] == "remove": remove_list = list_descr_handler(self.config["processing"][self.process][0]["params"]["descriptorNames"]) feats_remove_list = feats_selector_list(self.df_feats.columns, remove_list) self.list_features = [x for x in self.list_features if x not in feats_remove_list] - self.logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features))) + logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features))) # enumerate list print(colored("Split numerical / categorical features..", "yellow")) if self.config["processing"][self.process][1]["transfo"] == "enumerate": enumerate_list = list_descr_handler(self.config["processing"][self.process][1]["params"]["descriptorNames"]) self.feats_cat_list = feats_selector_list(self.list_features, enumerate_list) - self.logger.debug("Enumerating feats: {}".format(self.feats_cat_list)) + logger.debug("Enumerating feats: {}".format(self.feats_cat_list)) self.feats_num_list = [x for x in self.list_features if x not in self.feats_cat_list] - self.logger.debug("List Num feats: {}".format(len(self.feats_num_list))) - self.logger.debug("List Cat feats: {}".format(len(self.feats_cat_list), "blue")) + logger.debug("List Num feats: {}".format(len(self.feats_num_list))) + logger.debug("List Cat feats: {}".format(len(self.feats_cat_list), "blue")) # BASIC if self.process == "basic": - self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)) @@ -102,7 +105,7 @@ def post_processing(self): if self.process == "lowlevel" or self.process == "mfcc": sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"]) self.feats_num_list = feats_selector_list(self.feats_num_list, sel_list) - self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)) @@ -128,7 +131,7 @@ def post_processing(self): sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"]) feats_rem_list = feats_selector_list(self.df_feats, sel_list) self.feats_num_list = [x for x in self.feats_num_list if x not in feats_rem_list] - self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)) @@ -151,7 +154,7 @@ def post_processing(self): # NORMALIZED if self.process == "normalized": - self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)), ('minmax_scaler', MinMaxScaler()), @@ -178,9 +181,9 @@ def post_processing(self): feats_num_gauss_list = 
feats_selector_list(self.feats_num_list, gauss_list) feats_num_no_gauss_list = [x for x in self.feats_num_list if x not in feats_num_gauss_list] - self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) - self.logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list))) - self.logger.debug("List post-Num-No-Gauss feats: {}".format(len(feats_num_no_gauss_list))) + logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list))) + logger.debug("List post-Num-No-Gauss feats: {}".format(len(feats_num_no_gauss_list))) num_norm_pipeline = Pipeline([ ("selector_num", DataFrameSelector(self.feats_num_list)), @@ -198,7 +201,7 @@ def post_processing(self): ]) self.feats_prepared = full_normalize_pipeline.fit_transform(self.df_feats) - self.logger.debug("Feats prepared normalized shape: {}".format(self.feats_prepared.shape)) + logger.debug("Feats prepared normalized shape: {}".format(self.feats_prepared.shape)) # save pipeline joblib.dump(full_normalize_pipeline, os.path.join(models_path, "full_normalize_pipeline_{}.pkl".format(self.process))) @@ -211,9 +214,9 @@ def post_processing(self): print(select_no_rename_list) new_feats_columns = select_rename_list + select_no_rename_list self.df_feats.columns = new_feats_columns - self.logger.debug("Normalized Features DF:") - self.logger.debug("\n{}".format(self.df_feats)) - self.logger.debug("Shape: {}".format(self.df_feats.shape)) + logger.debug("Normalized Features DF:") + logger.debug("\n{}".format(self.df_feats)) + logger.debug("Shape: {}".format(self.df_feats.shape)) feats_no_gauss_list = [x for x in new_feats_columns if x not in feats_num_gauss_list] diff --git a/acousticbrainz/models/sklearn/transformation/transform_predictions.py b/acousticbrainz/models/sklearn/transformation/transform_predictions.py index c14402fca..3f4108218 100644 --- a/acousticbrainz/models/sklearn/transformation/transform_predictions.py +++ b/acousticbrainz/models/sklearn/transformation/transform_predictions.py @@ -1,3 +1,5 @@ +import logging + import pandas as pd from termcolor import colored import collections @@ -15,9 +17,11 @@ except AttributeError: collectionsAbc = collections +logger = logging.getLogger(__name__) + class TransformPredictions: - def __init__(self, config, df_feats, process, train_class, exports_path, logger): + def __init__(self, config, df_feats, process, train_class, exports_path): self.config = config self.df_feats = df_feats self.process = process From e62152955249015764dcc3528073960c2b5ba306 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Tue, 6 Jul 2021 22:20:19 +0530 Subject: [PATCH 56/64] Revert "Configure hierarchical acousticbrainz.models logger" This reverts commit 581b713f9eebb4fe6d5bd46992188bc2cec12ae7. 
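The commit being reverted had replaced the per-object self.logger attributes with module-level loggers that rely on Python's hierarchical logging: a single handler is attached to the "acousticbrainz.models" package logger, and each module obtains a child logger via logging.getLogger(__name__) whose records propagate up to that handler. A minimal sketch of that pattern is below; the package-level names mirror the reverted commit, while the dotted child name stands in for what __name__ would be inside one of the modules.

    import logging

    # Package-level setup, as in the reverted acousticbrainz/models/__init__.py
    ACOUSTICBRAINZ_SKLEARN_LOGGER = "acousticbrainz.models"
    _logger = logging.getLogger(ACOUSTICBRAINZ_SKLEARN_LOGGER)
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s'))
    _logger.addHandler(_handler)
    _logger.setLevel(logging.INFO)

    # Inside a submodule this would simply be logging.getLogger(__name__).
    logger = logging.getLogger("acousticbrainz.models.sklearn.classification.evaluation")
    logger.info("propagates up to the package handler; no logger object is passed around")

The revert below restores the previous design, threading an explicit logger argument through each class and function instead.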
--- acousticbrainz/models/__init__.py | 10 +--- .../classification/classification_task.py | 19 +++--- .../classification_task_manager.py | 27 +++++---- .../sklearn/classification/classifier_grid.py | 30 +++++----- .../sklearn/classification/evaluation.py | 52 ++++++++++------- .../sklearn/classification/train_class.py | 7 ++- .../sklearn/helper_functions/logging_tool.py | 25 +++++--- .../models/sklearn/model/predict.py | 3 +- .../transformation/load_ground_truth.py | 58 +++++++++---------- .../sklearn/transformation/load_low_level.py | 31 +++++----- .../sklearn/transformation/transform.py | 47 +++++++-------- .../transformation/transform_predictions.py | 6 +- 12 files changed, 159 insertions(+), 156 deletions(-) diff --git a/acousticbrainz/models/__init__.py b/acousticbrainz/models/__init__.py index 22518ae43..40a96afc6 100644 --- a/acousticbrainz/models/__init__.py +++ b/acousticbrainz/models/__init__.py @@ -1,9 +1 @@ -import logging - -ACOUSTICBRAINZ_SKLEARN_LOGGER = "acousticbrainz.models" -_logger = logging.getLogger(ACOUSTICBRAINZ_SKLEARN_LOGGER) -_handler = logging.StreamHandler() -_formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s') -_handler.setFormatter(_formatter) -_logger.addHandler(_handler) -_logger.setLevel(logging.INFO) +# -*- coding: utf-8 -*- diff --git a/acousticbrainz/models/sklearn/classification/classification_task.py b/acousticbrainz/models/sklearn/classification/classification_task.py index fa42db0b1..455230a92 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task.py +++ b/acousticbrainz/models/sklearn/classification/classification_task.py @@ -1,13 +1,9 @@ -import logging import os import json from ..classification.classifier_grid import TrainGridClassifier from ..classification.evaluation import evaluation -logger = logging.getLogger(__name__) - - class ClassificationTask: """ This class is the core of the model classification. It loads the relevant classifier to @@ -15,7 +11,7 @@ class ClassificationTask: to the configuration file declared class to train the model and then it uses that model for evaluation. """ - def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks): + def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks, logger): """ Args: config: The configuration data that contain the settings from the configuration @@ -31,6 +27,7 @@ def __init__(self, config, classifier, train_class, training_processes, X, y, ex y: The labels (NumPy array) of the target class exports_path: Path to where the classification project's results will be stored to. tracks: The tracks (numpy.ndarray) that are exported from the Groundtruth file. + log_level: The logging level (0-4). 
""" self.config = config self.classifier = classifier @@ -41,12 +38,13 @@ def __init__(self, config, classifier, train_class, training_processes, X, y, ex self.training_processes = training_processes self.exports_path = exports_path self.tracks = tracks + self.logger = logger def run(self): # grid search train if self.config["train_kind"] == "grid": - logger.info("Train Classifier: Classifier with GridSearchCV") + self.logger.info("Train Classifier: Classifier with GridSearchCV") grid_svm_train = TrainGridClassifier(config=self.config, classifier=self.classifier, class_name=self.train_class, @@ -59,15 +57,15 @@ def run(self): grid_svm_train.train_grid_search_clf() grid_svm_train.export_best_classifier() else: - logger.error("Use a valid classifier in the configuration file.") - logger.info("Training the classifier is completed successfully.") + self.logger.error("Use a valid classifier in the configuration file.") + self.logger.info("Training the classifier is completed successfully.") # load best model to check its parameters - logger.debug("Loading the Best Model..") + self.logger.debug("Loading the Best Model..") best_model_name = "best_model_{}.json".format(self.train_class) with open(os.path.join(self.exports_path, best_model_name)) as best_model_file: best_model = json.load(best_model_file) - logger.debug("BEST MODEL: {}".format(best_model)) + self.logger.debug("BEST MODEL: {}".format(best_model)) # evaluation evaluation(config=self.config, @@ -77,4 +75,5 @@ def run(self): tracks=self.tracks, process=best_model["preprocessing"], exports_path=self.exports_path, + logger=self.logger ) diff --git a/acousticbrainz/models/sklearn/classification/classification_task_manager.py b/acousticbrainz/models/sklearn/classification/classification_task_manager.py index 473527368..d5c49ee98 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task_manager.py +++ b/acousticbrainz/models/sklearn/classification/classification_task_manager.py @@ -1,4 +1,3 @@ -import logging import os from time import time from termcolor import colored @@ -8,8 +7,6 @@ from ..classification.classification_task import ClassificationTask -logger = logging.getLogger(__name__) - validClassifiers = ["svm", "NN"] validEvaluations = ["nfoldcrossvalidation"] @@ -23,7 +20,7 @@ class ClassificationTaskManager: with their corresponding preprocessing steps and parameters declaration for the classifier, and executes the classification task for each step. """ - def __init__(self, config, train_class, X, y, tracks, exports_path): + def __init__(self, config, train_class, X, y, tracks, exports_path, logger): """ Args: config: The configuration file name. @@ -37,6 +34,7 @@ def __init__(self, config, train_class, X, y, tracks, exports_path): self.y = y self.tracks = tracks self.exports_path = exports_path + self.logger = logger self.results_path = "" self.logs_path = "" @@ -74,25 +72,25 @@ def config_file_analysis(self): """ Check the keys of the configuration template file if they are set up correctly. 
""" - logger.info("---- CHECK FOR INAPPROPRIATE CONFIG FILE FORMAT ----") + self.logger.info("---- CHECK FOR INAPPROPRIATE CONFIG FILE FORMAT ----") if "processing" not in self.config: - logger.error("No preprocessing defined in config.") + self.logger.error("No preprocessing defined in config.") if "evaluations" not in self.config: - logger.error("No evaluations defined in config.") - logger.error("Setting default evaluation to 10-fold cross-validation") + self.logger.error("No evaluations defined in config.") + self.logger.error("Setting default evaluation to 10-fold cross-validation") self.config["evaluations"] = {"nfoldcrossvalidation": [{"nfold": [10]}]} for classifier in self.config['classifiers'].keys(): if classifier not in validClassifiers: - logger.error("Not a valid classifier: {}".format(classifier)) + self.logger.error("Not a valid classifier: {}".format(classifier)) raise ValueError("The classifier name must be valid.") for evaluation in self.config['evaluations'].keys(): if evaluation not in validEvaluations: - logger.error("Not a valid evaluation: {}".format(evaluation)) + self.logger.error("Not a valid evaluation: {}".format(evaluation)) raise ValueError("The evaluation must be valid.") - logger.info("No errors in config file format found.") + self.logger.info("No errors in config file format found.") def apply_processing(self): """ @@ -100,7 +98,7 @@ def apply_processing(self): """ start_time = time() training_processes = TrainingProcesses(self.config).training_processes() - logger.info("Classifiers detected: {}".format(self.config["classifiers"].keys())) + self.logger.info("Classifiers detected: {}".format(self.config["classifiers"].keys())) for classifier in self.config["classifiers"].keys(): print("Before Classification task: ", classifier) task = ClassificationTask(config=self.config, @@ -111,17 +109,18 @@ def apply_processing(self): y=self.y, exports_path=self.exports_path, tracks=self.tracks, + logger=self.logger ) try: task.run() except Exception as e: - logger.error('Running task failed: {}'.format(e)) + self.logger.error('Running task failed: {}'.format(e)) print(colored('Running task failed: {}'.format(e), "red")) end_time = time() print() print(colored("Last evaluation took place at: {}".format(datetime.now()), "magenta")) - logger.info("Last evaluation took place at: {}".format(datetime.now())) + self.logger.info("Last evaluation took place at: {}".format(datetime.now())) # test duration time_duration = end_time - start_time diff --git a/acousticbrainz/models/sklearn/classification/classifier_grid.py b/acousticbrainz/models/sklearn/classification/classifier_grid.py index 553e0189a..5c36ded43 100644 --- a/acousticbrainz/models/sklearn/classification/classifier_grid.py +++ b/acousticbrainz/models/sklearn/classification/classifier_grid.py @@ -1,4 +1,3 @@ -import logging import os import json from termcolor import colored @@ -10,11 +9,8 @@ from ..transformation.transform import Transform -logger = logging.getLogger(__name__) - - class TrainGridClassifier: - def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path): + def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, logger): self.config = config self.classifier = classifier self.class_name = class_name @@ -23,6 +19,7 @@ def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_p self.tr_processes = tr_processes self.exports_path = exports_path + self.logger = logger self.best_models_list = [] # self.train_grid_search_clf() @@ -31,7 
+28,7 @@ def train_grid_search_clf(self): process_counter = 1 for tr_process in self.tr_processes: print(colored("Train process {} - {}".format(process_counter, tr_process), "green")) - logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process)) + self.logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process)) # initiate SVM classifier object if self.classifier == "svm": grid_clf = SVC(gamma="auto", probability=True) @@ -45,14 +42,16 @@ def train_grid_search_clf(self): df_feats=self.X, process=tr_process["preprocess"], train_class=self.class_name, - exports_path=self.exports_path).post_processing() + exports_path=self.exports_path, + logger=self.logger).post_processing() # train the grid classifier and return the trained model gsvc = train_grid(tr_process=tr_process, grid_clf=grid_clf, features_prepared=features_prepared, y=self.y, - config=self.config) + config=self.config, + logger=self.logger) # save best results for each train process # paths declaration for saving the grid training results @@ -66,7 +65,8 @@ def train_grid_search_clf(self): class_name=self.class_name, tr_process=tr_process, results_path=results_path, - best_process_model_path=best_process_model_path) + best_process_model_path=best_process_model_path, + logger=self.logger) # return a list that includes the best models exported from each processing self.best_models_list.append(results_dict) @@ -81,21 +81,21 @@ def train_grid_search_clf(self): def export_best_classifier(self): # Gather the best scores from the exported grid clf models scores = [x["score"] for x in self.best_models_list] - logger.info("This is the max score of all the training processes: {}".format(max(scores))) + self.logger.info("This is the max score of all the training processes: {}".format(max(scores))) for model in self.best_models_list: if model["score"] == max(scores): - logger.info("Best {} model parameters:".format(self.class_name)) + self.logger.info("Best {} model parameters:".format(self.class_name)) # log2 --> convert values to initial parameters' values # model["params"]["C"] = math.log2(model["params"]["C"]) # model["params"]["gamma"] = math.log2(model["params"]["gamma"]) - logger.info("{}".format(model)) + self.logger.info("{}".format(model)) best_model_name = "best_model_{}.json".format(self.class_name) with open(os.path.join(self.exports_path, best_model_name), "w") as best_model: json.dump(model, best_model, indent=4) - logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) + self.logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name)) -def train_grid(tr_process, grid_clf, features_prepared, y, config): +def train_grid(tr_process, grid_clf, features_prepared, y, config, logger): # define the length of parameters parameters_grid = {'kernel': tr_process["kernel"], 'C': tr_process["C"], @@ -136,7 +136,7 @@ def train_grid(tr_process, grid_clf, features_prepared, y, config): return gsvc -def save_grid_results(gsvc, class_name, tr_process, results_path, best_process_model_path): +def save_grid_results(gsvc, class_name, tr_process, results_path, best_process_model_path, logger): results_best_dict_name = "result_{}_{}_best_{}.json" \ .format(class_name, tr_process["preprocess"], gsvc.best_score_) diff --git a/acousticbrainz/models/sklearn/classification/evaluation.py b/acousticbrainz/models/sklearn/classification/evaluation.py index a9a2820dd..a26f5bf88 100644 --- a/acousticbrainz/models/sklearn/classification/evaluation.py +++ 
b/acousticbrainz/models/sklearn/classification/evaluation.py @@ -1,4 +1,3 @@ -import logging import os import json import numpy as np @@ -17,10 +16,7 @@ from ..classification.matrix_creation import matrix_creation, simplified_matrix_export -logger = logging.getLogger(__name__) - - -def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): +def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, logger): print(colored("------ EVALUATION and FOLDING ------", "yellow")) logger.info("---- Folded evaluation of the model in the dataset ----") @@ -62,7 +58,8 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): df_feats=X, process=process, train_class=class_name, - exports_path=exports_path).post_processing() + exports_path=exports_path, + logger=logger).post_processing() logger.debug("Features prepared shape: {}".format(features_prepared.shape)) # Starting Training, Predictions for each fold @@ -72,12 +69,14 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): feats_prepared=features_prepared, y=y, tracks=tracks, - class_name=class_name) + class_name=class_name, + logger=logger) # concatenate the folded predictions DFs df_predictions = create_dataset_predictions(list_df_predictions=predictions_df_list, class_name=class_name, - dataset_path=dataset_path) + dataset_path=dataset_path, + logger=logger) logger.debug("PRINT THE WHOLE GESTURES DF:\n{}".format(df_predictions)) @@ -100,16 +99,19 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): config=config, class_name=class_name, exports_path=exports_path, - images_path=images_path) + images_path=images_path, + logger=logger) # Folded Tracks Dictionary --> export also the Folded instances dictionary folded_instances_dict = export_folded_instances(tracks_fold_indexing_dict=tracks_fold_indexing_dict, class_name=class_name, - dataset_path=dataset_path) + dataset_path=dataset_path, + logger=logger) concat_save_model_instances_matrix_json(instances_dict=folded_instances_dict, cm_dict=folded_matrix_dict, exports_path=exports_path, + logger=logger, export_name="folded_dataset_instances_cm.json") simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", @@ -127,6 +129,7 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): predictions=df_predictions["predictions"], class_name=class_name, exports_path=exports_path, + logger=logger ) # ---------- TRAIN TO THE WHOLE DATASET WITH THE BEST CLASSIFIER ---------- @@ -159,6 +162,7 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): concat_save_model_instances_matrix_json(instances_dict=None, cm_dict=whole_matrix_dict, exports_path=exports_path, + logger=logger, export_name="whole_dataset_instances_cm.json") # Evaluation to the whole Dataset @@ -168,10 +172,11 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path): predictions=predictions_all, class_name=class_name, exports_path=exports_path, + logger=logger ) -def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, export_name): +def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, logger, export_name): """ Save the best model's folded instances and confusion matrix dictionary merged into one dictionary @@ -179,6 +184,7 @@ def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_pat instances_dict: cm_dict: exports_path: + 
logger: export_name: Returns: @@ -200,7 +206,7 @@ def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_pat logger.info("Whole folded instances and matrix dictionary stored successfully.") -def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name): +def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logger): """ Args: @@ -210,6 +216,7 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name): y: the true values tracks: class_name: + logger: Returns: tracks_fold_indexing_dict: @@ -245,7 +252,8 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name): X_test=X_test, test_index=test_index, tracks_list=tracks_list, - y_test=y_test) + y_test=y_test, + logger=logger) # Append the folded dataset to a list that will contain all the folded datasets predictions_df_list.append(df_pred_general) # Append each accuracy of the folded model to a list that contains all the accuracies resulted from each fold @@ -255,7 +263,7 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name): return predictions_df_list, accuracy_model, tracks_fold_indexing_dict -def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test): +def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test, logger): """ Creates a pandas DataFrame from each fold with the predictions in order later to extract the shuffled dataset with the tracks, the percentage @@ -269,6 +277,7 @@ def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_ test_index: tracks_list: y_test: + logger: Returns: A pandas DataFrame with the predictions at each fold. @@ -294,7 +303,7 @@ def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_ return df_pred_general -def export_accuracies(accuracy_model, config, class_name, exports_path, images_path): +def export_accuracies(accuracy_model, config, class_name, exports_path, images_path, logger): """ Args: @@ -303,6 +312,7 @@ def export_accuracies(accuracy_model, config, class_name, exports_path, images_p class_name: exports_path: images_path: + logger: Returns: @@ -321,15 +331,17 @@ def export_accuracies(accuracy_model, config, class_name, exports_path, images_p # Visualize accuracy for each iteration in a distribution plot create_accuracies_dist_plot(accuracies_list=accuracy_model, - images_path=images_path) + images_path=images_path, + logger=logger) -def create_dataset_predictions(list_df_predictions, class_name, dataset_path): +def create_dataset_predictions(list_df_predictions, class_name, dataset_path, logger): """ Args: list_df_predictions: class_name: dataset_path: + logger: Returns: @@ -346,7 +358,7 @@ def create_dataset_predictions(list_df_predictions, class_name, dataset_path): return df_concat_predictions -def create_accuracies_dist_plot(accuracies_list, images_path): +def create_accuracies_dist_plot(accuracies_list, images_path, logger): logger.info("Visualize accuracy for each iteration.") list_folds = [] counter_folds = 0 @@ -363,7 +375,7 @@ def create_accuracies_dist_plot(accuracies_list, images_path): logger.info("Plot saved successfully.") -def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path): +def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path, logger): logger.info("Writing Folded Tracks Dictionary locally to check where each track is folded..") logger.debug("length of keys: {}".format(len(tracks_fold_indexing_dict.keys())))
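For orientation, the per-fold loop that predictions_fold implements can be reduced to the following sketch; the data, the splitter, and the SVC settings are hypothetical stand-ins, not values taken from this patch:

    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    feats_prepared = np.random.rand(20, 4)            # stands in for the prepared features
    y = np.array(["dance", "not_dance"] * 10)         # stands in for the true labels
    clf = SVC(gamma="auto", probability=True)
    inner_cv = KFold(n_splits=5)

    accuracy_model = []                               # one accuracy value per fold
    for train_index, test_index in inner_cv.split(feats_prepared):
        clf.fit(feats_prepared[train_index], y[train_index])
        predictions = clf.predict(feats_prepared[test_index])
        accuracy_model.append(accuracy_score(y[test_index], predictions) * 100)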
fold_dict = {"fold": tracks_fold_indexing_dict} @@ -385,7 +397,7 @@ def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path) return fold_dict -def export_evaluation_results(config, set_name, y_true_values, predictions, class_name, exports_path): +def export_evaluation_results(config, set_name, y_true_values, predictions, class_name, exports_path, logger): logger.info("---- Evaluation to the {} dataset ----".format(set_name)) # Confusion Matrix logger.info("Exporting Confusion Matrix applied to the {} dataset..".format(set_name)) diff --git a/acousticbrainz/models/sklearn/classification/train_class.py b/acousticbrainz/models/sklearn/classification/train_class.py index 2be0d6c04..8e92a0f1f 100644 --- a/acousticbrainz/models/sklearn/classification/train_class.py +++ b/acousticbrainz/models/sklearn/classification/train_class.py @@ -25,7 +25,8 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l logger = setup_logger( exports_path=exports_path, - file_name="train_model_{}".format(class_name), + name="train_model_{}".format(class_name), + mode="w", level=log_level ) @@ -51,6 +52,7 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l tracks_list=tracks_listed_shuffled, train_class=class_name, exports_path=exports_path, + logger=logger ).create_df_tracks() logger.debug("Types of exported files from GT:") logger.debug("Type of features: {}".format(type(features))) @@ -62,7 +64,8 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l X=features, y=labels, tracks=tracks, - exports_path=exports_path) + exports_path=exports_path, + logger=logger) classification_time = model_manage.apply_processing() print(colored("Classification ended successfully in {} minutes.".format(classification_time), "green")) logger.info("Classification ended successfully in {} minutes.".format(classification_time)) diff --git a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py index 33b46b3df..0c764de0e 100644 --- a/acousticbrainz/models/sklearn/helper_functions/logging_tool.py +++ b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py @@ -8,36 +8,45 @@ import logging import os -from acousticbrainz.models import ACOUSTICBRAINZ_SKLEARN_LOGGER from acousticbrainz.models.sklearn.helper_functions.utils import create_directory -def setup_logger(exports_path, file_name, mode="w", level=logging.INFO): +def setup_logger(exports_path, name, mode, level=logging.INFO): """ Function to set up as many loggers as you want. It exports the logging results to a file in the relevant path that is determined by the configuration file. Args: exports_path: The path (str) the logging exports will be exported. - file_name: The name (str) of the logger. + name: The name (str) of the logger. level: The level (int) of the logging. Defaults to logging.INFO. mode: The mode (str) translated in write, append. Valid values ("w", "a") Returns: The logger object. 
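A minimal usage sketch of the reworked setup_logger, assuming a hypothetical exports path and logger name:

    import logging
    from acousticbrainz.models.sklearn.helper_functions.logging_tool import setup_logger

    logger = setup_logger(exports_path="exports", name="train_model_danceability",
                          mode="w", level=logging.DEBUG)
    # messages now go both to the console and to exports/logs/train_model_danceability.log
    logger.info("Training started")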
""" - logger = logging.getLogger(ACOUSTICBRAINZ_SKLEARN_LOGGER) logs_path = create_directory(exports_path, "logs") + # Create a custom logger + logger = logging.getLogger(name) + + # Create handlers + c_handler = logging.StreamHandler() + f_handler = logging.FileHandler(os.path.join(logs_path, "{}.log".format(name)), mode=mode) + # Create formatters and add it to handlers - f_handler = logging.FileHandler(os.path.join(logs_path, "{}.log".format(file_name)), mode=mode) + c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s') f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + c_handler.setFormatter(c_format) f_handler.setFormatter(f_format) - # remove existing file handlers if any - logger.handlers = [handler for handler in logger.handlers if not isinstance(handler, logging.FileHandler)] + # if handlers are already present and if so, clear them before adding new handlers. This is pretty convenient + # when debugging and the code includes the logger initialization + if logger.hasHandlers(): + logger.handlers.clear() - # Add current file handler to the logger + # Add handlers to the logger + logger.addHandler(c_handler) logger.addHandler(f_handler) if level is None: diff --git a/acousticbrainz/models/sklearn/model/predict.py b/acousticbrainz/models/sklearn/model/predict.py index a23473f64..0e9d568fe 100644 --- a/acousticbrainz/models/sklearn/model/predict.py +++ b/acousticbrainz/models/sklearn/model/predict.py @@ -43,7 +43,8 @@ def load_best_model(self): def preprocessing(self): self.logger = setup_logger( exports_path=self.exports_path, - file_name="predict_{}".format(self.class_name), + name="predict_{}".format(self.class_name), + mode="w", level=self.log_level ) diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index 76de2f595..78922830f 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -1,4 +1,3 @@ -import logging import os import yaml import pandas as pd @@ -9,9 +8,6 @@ from ..transformation.load_low_level import FeaturesDf -logger = logging.getLogger(__name__) - - class GroundTruthLoad: """ The Ground Truth data which contains the tracks and the corresponding @@ -31,7 +27,7 @@ def __init__(self, config, gt_filename, exports_path, log_level): self.exports_path = exports_path self.log_level = log_level - logger = "" + self.logger = "" self.class_dir = "" self.ground_truth_data = {} self.labeled_tracks = {} @@ -135,11 +131,12 @@ class DatasetExporter: """ TODO: Description """ - def __init__(self, config, tracks_list, train_class, exports_path): + def __init__(self, config, tracks_list, train_class, exports_path, logger): self.config = config self.tracks_list = tracks_list self.train_class = train_class self.exports_path = exports_path + self.logger = logger self.dataset_dir = "" self.class_dir = "" @@ -155,7 +152,7 @@ def create_df_tracks(self): TODO: Description """ - logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----") + self.logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----") self.dataset_dir = self.config.get("dataset_dir") print('DATASET-DIR', self.dataset_dir) dirpath = os.path.join(os.getcwd(), self.dataset_dir) @@ -163,47 +160,47 @@ def create_df_tracks(self): for (dirpath, dirnames, filenames) in os.walk(dirpath): low_level_list += [os.path.join(dirpath, file) for file in filenames if file.endswith(".json")] if 
len(low_level_list) != 0: - logger.info("Low-level features for the tracks found.") + self.logger.info("Low-level features for the tracks found.") # processing the names of the tracks that are inside both the GT file and the low-level json files # list with the tracks that are included in the low-level json files tracks_existing_list = [e for e in self.tracks_list for i in low_level_list if e[0] in i] # list with the low-level json tracks' paths that are included in tracks list tracks_existing_path_list = [i for e in self.tracks_list for i in low_level_list if e[0] in i] - logger.debug("tracks existed found: {}".format(len(tracks_existing_list))) - logger.debug("tracks_path existed found: {}".format(len(tracks_existing_path_list))) - logger.debug("{}".format(tracks_existing_list[:4])) - logger.debug("{}".format(tracks_existing_path_list[:4])) - logger.debug("The founded tracks tracks listed successfully.") - logger.debug("Generate random number within a given range of listed tracks:") + self.logger.debug("tracks existed found: {}".format(len(tracks_existing_list))) + self.logger.debug("tracks_path existed found: {}".format(len(tracks_existing_path_list))) + self.logger.debug("{}".format(tracks_existing_list[:4])) + self.logger.debug("{}".format(tracks_existing_path_list[:4])) + self.logger.debug("The found tracks are listed successfully.") + self.logger.debug("Generate random number within a given range of listed tracks:") # Random number between 0 and length of listed tracks random_num = random.randrange(len(tracks_existing_list)) - logger.debug("Check if the tracks are the same in the same random index in both lists") - logger.debug("{}".format(tracks_existing_list[random_num])) - logger.debug("{}".format(tracks_existing_path_list[random_num])) + self.logger.debug("Check if the tracks are the same in the same random index in both lists") + self.logger.debug("{}".format(tracks_existing_list[random_num])) + self.logger.debug("{}".format(tracks_existing_path_list[random_num])) self.tracks_list = tracks_existing_list # create the dataframe with tracks that are both in low-level files and the GT file self.df_tracks = pd.DataFrame(data=self.tracks_list, columns=["track", self.train_class]) - logger.debug("Shape of tracks DF created before cleaning: {}".format(self.df_tracks.shape)) - logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:") - logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) + self.logger.debug("Shape of tracks DF created before cleaning: {}".format(self.df_tracks.shape)) + self.logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:") + self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) - logger.debug("Drop rows with NULL values if they exist..") + self.logger.debug("Drop rows with NULL values if they exist..") if self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape[0] != 0: self.df_tracks.dropna(inplace=True) - logger.debug("Check if there are NULL values after the cleaning process:") - logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) - logger.debug("Re-index the tracks DF..") + self.logger.debug("Check if there are NULL values after the cleaning process:") + self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) + self.logger.debug("Re-index the tracks DF..") self.df_tracks = self.df_tracks.reset_index(drop=True) else: - logger.info("There are no
NULL values found.") + self.logger.info("There are no NULL values found.") # export shuffled tracks to CSV format tracks_path = create_directory(self.exports_path, "tracks_csv_format") self.df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(self.train_class))) - logger.debug("DF INFO:") - logger.debug("{}".format(self.df_tracks.info())) - logger.debug("COLUMNS CONTAIN OBJECTS: {}".format( + self.logger.debug("DF INFO:") + self.logger.debug("{}".format(self.df_tracks.info())) + self.logger.debug("COLUMNS CONTAIN OBJECTS: {}".format( self.df_tracks.select_dtypes(include=['object']).columns)) self.df_feats = FeaturesDf(df_tracks=self.df_tracks, @@ -211,11 +208,12 @@ def create_df_tracks(self): list_path_tracks=tracks_existing_path_list, config=self.config, exports_path=self.exports_path, + logger=self.logger, ).create_low_level_df() self.y = self.df_tracks[self.train_class].values - logger.info("Features, Labels, and Tracks are exported successfully..") + self.logger.info("Features, Labels, and Tracks are exported successfully..") return self.df_feats, self.y, self.df_tracks["track"].values else: - logger.error("No low-level data found.") + self.logger.error("No low-level data found.") return None, None, None diff --git a/acousticbrainz/models/sklearn/transformation/load_low_level.py b/acousticbrainz/models/sklearn/transformation/load_low_level.py index 92ac4ca66..51f319db8 100644 --- a/acousticbrainz/models/sklearn/transformation/load_low_level.py +++ b/acousticbrainz/models/sklearn/transformation/load_low_level.py @@ -1,13 +1,9 @@ -import logging import os import json import pandas as pd from ..transformation.utils_preprocessing import flatten_dict_full -logger = logging.getLogger(__name__) - - class FeaturesDf: """ Features DataFrame object by the JSON low-level data. @@ -15,12 +11,13 @@ class FeaturesDf: df_tracks (Pandas DataFrame): The tracks DataFrame that contains the track name, track low-level path, label, etc. """ - def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path): + def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path, logger): self.df_tracks = df_tracks self.train_class = train_class self.list_path_tracks = list_path_tracks self.config = config self.exports_path = exports_path + self.logger = logger self.list_feats_tracks = [] self.counter_items_transformed = 0 self.df_feats_tracks = pd.DataFrame() @@ -34,7 +31,7 @@ def create_low_level_df(self): Returns: The low-level features (pandas DataFrame) from all the tracks in the collection. 
""" - logger.info("---- CREATE LOW LEVEL DATAFRAME ----") + self.logger.info("---- CREATE LOW LEVEL DATAFRAME ----") # clear the list if it not empty self.list_feats_tracks.clear() for track_low_level_path in self.list_path_tracks: @@ -43,7 +40,7 @@ def create_low_level_df(self): data_feats_item = json.load(f, strict=False) except Exception as e: print("Exception occurred in loading file:", e) - logger.warning("Exception occurred in loading file: {}".format(e)) + self.logger.warning("Exception occurred in loading file: {}".format(e)) # remove unnecessary features data try: if 'beats_position' in data_feats_item['rhythm']: @@ -61,20 +58,20 @@ def create_low_level_df(self): # The dictionary's keys list is transformed to type self.df_feats_tracks = pd.DataFrame(self.list_feats_tracks, columns=list(self.list_feats_tracks[0].keys())) - logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format( + self.logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format( self.df_feats_tracks.select_dtypes(include=['object']).columns)) - logger.info("Exporting low-level data (DataFrame)..") + self.logger.info("Exporting low-level data (DataFrame)..") return self.df_feats_tracks def check_processing_info(self): """ Prints some information about the low-level data to DataFrame transformation step and its middle processes. """ - logger.info('Items parsed and transformed: {}'.format(self.counter_items_transformed)) + self.logger.info('Items parsed and transformed: {}'.format(self.counter_items_transformed)) # The type of the dictionary's keys list is: - logger.info('Type of the list of features keys: {}'.format(type(self.list_feats_tracks[0].keys()))) + self.logger.info('Type of the list of features keys: {}'.format(type(self.list_feats_tracks[0].keys()))) # The dictionary's keys list is transformed to type - logger.info('Confirm the type of list transformation of features keys: {}' + self.logger.info('Confirm the type of list transformation of features keys: {}' .format(type(list(self.list_feats_tracks[0].keys())))) def export_tracks_feats_df(self): @@ -83,12 +80,12 @@ def export_tracks_feats_df(self): The tracks (pandas DataFrame) with all the ground truth data and the corresponding low-level data flattened. 
""" - logger.info("Concatenating the tracks/labels data DataFrame with the features DataFrame.") - logger.info("TRACKS SHAPE: {}".format(self.df_tracks.shape)) - logger.info("LOW LEVEL: {}".format(self.df_feats_tracks.shape)) + self.logger.info("Concatenating the tracks/labels data DataFrame with the features DataFrame.") + self.logger.info("TRACKS SHAPE: {}".format(self.df_tracks.shape)) + self.logger.info("LOW LEVEL: {}".format(self.df_feats_tracks.shape)) self.df_feats_label = pd.concat([self.df_tracks, self.df_feats_tracks], axis=1) - logger.info("FULL: {}".format(self.df_feats_label.shape)) - logger.info("COLUMNS CONTAIN OBJECTS: {}" + self.logger.info("FULL: {}".format(self.df_feats_label.shape)) + self.logger.info("COLUMNS CONTAIN OBJECTS: {}" .format(self.df_feats_label.select_dtypes(include=['object']).columns)) return self.df_feats_label diff --git a/acousticbrainz/models/sklearn/transformation/transform.py b/acousticbrainz/models/sklearn/transformation/transform.py index d8e06524e..2396bcd97 100644 --- a/acousticbrainz/models/sklearn/transformation/transform.py +++ b/acousticbrainz/models/sklearn/transformation/transform.py @@ -1,5 +1,3 @@ -import logging - import pandas as pd from termcolor import colored import collections @@ -21,16 +19,15 @@ except AttributeError: collectionsAbc = collections -logger = logging.getLogger(__name__) - class Transform: - def __init__(self, config, df_feats, process, train_class, exports_path): + def __init__(self, config, df_feats, process, train_class, exports_path, logger): self.config = config self.df_feats = df_feats self.process = process self.train_class = train_class self.exports_path = exports_path + self.logger = logger self.list_features = [] self.feats_cat_list = [] @@ -43,8 +40,8 @@ def __init__(self, config, df_feats, process, train_class, exports_path): def post_processing(self): print(colored("PROCESS: {}".format(self.process), "cyan")) - logger.debug("PROCESS: {}".format(self.process)) - logger.debug("Process: {}".format(self.config["processing"][self.process])) + self.logger.debug("PROCESS: {}".format(self.process)) + self.logger.debug("Process: {}".format(self.config["processing"][self.process])) # list_preprocesses = [] self.list_features = list(self.df_feats.columns) @@ -53,34 +50,34 @@ def post_processing(self): # clean list print(colored("Cleaning..", "yellow")) - logger.info("Cleaning..") + self.logger.info("Cleaning..") cleaning_conf_list = list_descr_handler(self.config["excludedDescriptors"]) feats_clean_list = feats_selector_list(self.df_feats.columns, cleaning_conf_list) self.list_features = [x for x in self.df_feats.columns if x not in feats_clean_list] - logger.debug("List after cleaning some feats: {}".format(len(self.list_features))) + self.logger.debug("List after cleaning some feats: {}".format(len(self.list_features))) # remove list print(colored("Removing unnecessary features..", "yellow")) - logger.info("Removing unnecessary features..") + self.logger.info("Removing unnecessary features..") if self.config["processing"][self.process][0]["transfo"] == "remove": remove_list = list_descr_handler(self.config["processing"][self.process][0]["params"]["descriptorNames"]) feats_remove_list = feats_selector_list(self.df_feats.columns, remove_list) self.list_features = [x for x in self.list_features if x not in feats_remove_list] - logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features))) + self.logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features))) # 
enumerate list print(colored("Split numerical / categorical features..", "yellow")) if self.config["processing"][self.process][1]["transfo"] == "enumerate": enumerate_list = list_descr_handler(self.config["processing"][self.process][1]["params"]["descriptorNames"]) self.feats_cat_list = feats_selector_list(self.list_features, enumerate_list) - logger.debug("Enumerating feats: {}".format(self.feats_cat_list)) + self.logger.debug("Enumerating feats: {}".format(self.feats_cat_list)) self.feats_num_list = [x for x in self.list_features if x not in self.feats_cat_list] - logger.debug("List Num feats: {}".format(len(self.feats_num_list))) - logger.debug("List Cat feats: {}".format(len(self.feats_cat_list), "blue")) + self.logger.debug("List Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List Cat feats: {}".format(len(self.feats_cat_list), "blue")) # BASIC if self.process == "basic": - logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)) @@ -105,7 +102,7 @@ def post_processing(self): if self.process == "lowlevel" or self.process == "mfcc": sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"]) self.feats_num_list = feats_selector_list(self.feats_num_list, sel_list) - logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)) @@ -131,7 +128,7 @@ def post_processing(self): sel_list = list_descr_handler(self.config["processing"][self.process][2]["params"]["descriptorNames"]) feats_rem_list = feats_selector_list(self.df_feats, sel_list) self.feats_num_list = [x for x in self.feats_num_list if x not in feats_rem_list] - logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)) @@ -154,7 +151,7 @@ def post_processing(self): # NORMALIZED if self.process == "normalized": - logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) num_pipeline = Pipeline([ ('selector', DataFrameSelector(self.feats_num_list)), ('minmax_scaler', MinMaxScaler()), @@ -181,9 +178,9 @@ def post_processing(self): feats_num_gauss_list = feats_selector_list(self.feats_num_list, gauss_list) feats_num_no_gauss_list = [x for x in self.feats_num_list if x not in feats_num_gauss_list] - logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) - logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list))) - logger.debug("List post-Num-No-Gauss feats: {}".format(len(feats_num_no_gauss_list))) + self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list))) + self.logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list))) + self.logger.debug("List post-Num-No-Gauss feats: {}".format(len(feats_num_no_gauss_list))) num_norm_pipeline = Pipeline([ ("selector_num", DataFrameSelector(self.feats_num_list)), @@ -201,7 +198,7 @@ def post_processing(self): ]) self.feats_prepared = full_normalize_pipeline.fit_transform(self.df_feats) - logger.debug("Feats prepared normalized shape: 
{}".format(self.feats_prepared.shape)) + self.logger.debug("Feats prepared normalized shape: {}".format(self.feats_prepared.shape)) # save pipeline joblib.dump(full_normalize_pipeline, os.path.join(models_path, "full_normalize_pipeline_{}.pkl".format(self.process))) @@ -214,9 +211,9 @@ def post_processing(self): print(select_no_rename_list) new_feats_columns = select_rename_list + select_no_rename_list self.df_feats.columns = new_feats_columns - logger.debug("Normalized Features DF:") - logger.debug("\n{}".format(self.df_feats)) - logger.debug("Shape: {}".format(self.df_feats.shape)) + self.logger.debug("Normalized Features DF:") + self.logger.debug("\n{}".format(self.df_feats)) + self.logger.debug("Shape: {}".format(self.df_feats.shape)) feats_no_gauss_list = [x for x in new_feats_columns if x not in feats_num_gauss_list] diff --git a/acousticbrainz/models/sklearn/transformation/transform_predictions.py b/acousticbrainz/models/sklearn/transformation/transform_predictions.py index 3f4108218..c14402fca 100644 --- a/acousticbrainz/models/sklearn/transformation/transform_predictions.py +++ b/acousticbrainz/models/sklearn/transformation/transform_predictions.py @@ -1,5 +1,3 @@ -import logging - import pandas as pd from termcolor import colored import collections @@ -17,11 +15,9 @@ except AttributeError: collectionsAbc = collections -logger = logging.getLogger(__name__) - class TransformPredictions: - def __init__(self, config, df_feats, process, train_class, exports_path): + def __init__(self, config, df_feats, process, train_class, exports_path, logger): self.config = config self.df_feats = df_feats self.process = process From fbebf59e2b5533ff7e9889ed7e2f4aa86e7a5a26 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Tue, 6 Jul 2021 22:53:13 +0530 Subject: [PATCH 57/64] Fix missing logger instance error --- .../models/sklearn/classification/classification_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acousticbrainz/models/sklearn/classification/classification_task.py b/acousticbrainz/models/sklearn/classification/classification_task.py index 455230a92..773281bf0 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task.py +++ b/acousticbrainz/models/sklearn/classification/classification_task.py @@ -52,7 +52,7 @@ def run(self): y=self.y, tr_processes=self.training_processes, exports_path=self.exports_path, - logger=logger + logger=self.logger ) grid_svm_train.train_grid_search_clf() grid_svm_train.export_best_classifier() From 57313b2eb4a57b712cc7dede89dd975cd849a045 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 7 Jul 2021 15:07:55 +0530 Subject: [PATCH 58/64] Change some one use classes to functions to simplify code --- .../classification_task_manager.py | 4 +- .../sklearn/classification/train_class.py | 8 +- .../models/sklearn/helper_functions/utils.py | 113 +++++-------- .../transformation/load_ground_truth.py | 152 ++++++++---------- .../transformation/transform_predictions.py | 13 -- 5 files changed, 109 insertions(+), 181 deletions(-) diff --git a/acousticbrainz/models/sklearn/classification/classification_task_manager.py b/acousticbrainz/models/sklearn/classification/classification_task_manager.py index d5c49ee98..0248c621b 100644 --- a/acousticbrainz/models/sklearn/classification/classification_task_manager.py +++ b/acousticbrainz/models/sklearn/classification/classification_task_manager.py @@ -3,7 +3,7 @@ from termcolor import colored from datetime import datetime -from ..helper_functions.utils import create_directory, 
TrainingProcesses +from ..helper_functions.utils import create_directory, extract_training_processes from ..classification.classification_task import ClassificationTask @@ -97,7 +97,7 @@ def apply_processing(self): Evaluation steps extraction and classification task execution for each step. """ start_time = time() - training_processes = TrainingProcesses(self.config).training_processes() + training_processes = extract_training_processes(self.config) self.logger.info("Classifiers detected: {}".format(self.config["classifiers"].keys())) for classifier in self.config["classifiers"].keys(): print("Before Classification task: ", classifier) diff --git a/acousticbrainz/models/sklearn/classification/train_class.py b/acousticbrainz/models/sklearn/classification/train_class.py index 8e92a0f1f..0979029fa 100644 --- a/acousticbrainz/models/sklearn/classification/train_class.py +++ b/acousticbrainz/models/sklearn/classification/train_class.py @@ -3,9 +3,8 @@ import yaml from ..helper_functions.logging_tool import setup_logger -from ..transformation.load_ground_truth import GroundTruthLoad +from ..transformation.load_ground_truth import GroundTruthLoad, create_df_tracks from ..classification.classification_task_manager import ClassificationTaskManager -from ..transformation.load_ground_truth import DatasetExporter def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, log_level): @@ -48,12 +47,11 @@ def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, l print("First N sample of shuffled tracks: \n{}".format(tracks_listed_shuffled[:4])) # create the exports with the features DF, labels, and tracks together - features, labels, tracks = DatasetExporter(config=config, + features, labels, tracks = create_df_tracks(config=config, tracks_list=tracks_listed_shuffled, train_class=class_name, exports_path=exports_path, - logger=logger - ).create_df_tracks() + logger=logger) logger.debug("Types of exported files from GT:") logger.debug("Type of features: {}".format(type(features))) logger.debug("Type of labels: {}".format(type(labels))) diff --git a/acousticbrainz/models/sklearn/helper_functions/utils.py b/acousticbrainz/models/sklearn/helper_functions/utils.py index ebd82dc9c..af08a7f86 100644 --- a/acousticbrainz/models/sklearn/helper_functions/utils.py +++ b/acousticbrainz/models/sklearn/helper_functions/utils.py @@ -22,35 +22,6 @@ def load_yaml(path_to_file, file): return None -class DfChecker: - """ - - """ - def __init__(self, df_check): - """ - Args: - df_check: - """ - self.df_check = df_check - - def check_df_info(self): - """ - Prints information about the Pandas DataFrame that is generated from the relevant process. 
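The call-site change implied by this refactor, sketched side by side (the commented line is the pre-PATCH-58 form):

    # before: a single-use class wrapping one method
    # training_processes = TrainingProcesses(self.config).training_processes()

    # after: a plain module-level function with the same output
    training_processes = extract_training_processes(self.config)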
- """ - print("Features DataFrame head:") - print(self.df_check.head()) - print() - print("Information:") - print(self.df_check.info()) - print() - print("Shape:", self.df_check.shape) - print("Number of columns:", len(list(self.df_check.columns))) - - if "category" in self.df_check.columns: - print("Track categories distribution:") - print(self.df_check["category"].value_counts()) - - def create_directory(exports_path, directory): # find dynamically the current script directory full_path = os.path.join(exports_path, directory) @@ -77,53 +48,43 @@ def change_weights_val(i): return i -class TrainingProcesses: - """ - Extracts the pre-processing steps that are specified in "List of classifiers +def extract_training_processes(config): + """ Extracts the pre-processing steps that are specified in "List of classifiers to be trained" section of the configuration template. These are the amount of the prep-processing steps with the relevant training that will be executed. - """ - def __init__(self, config): - """ - Args: - config: The configuration data. - """ - self.config = config - - def training_processes(self): - """ - Returns: - A list of the processes that have been identified with the corresponding parameter grid. - """ - evaluations = self.config["evaluations"]["nfoldcrossvalidation"] - print("Evaluations countered: {}".format(len(evaluations))) - evaluation_counter = 0 - trainings_counted = 0 - processes = [] - for evaluation in evaluations: - for nfold_number in evaluation["nfold"]: - classifiers = self.config["classifiers"]["svm"] - for classifier in classifiers: - for pre_processing in classifier["preprocessing"]: - for clf_type in classifier["type"]: - if clf_type == "C-SVC": - process_dict = { - "evaluation": evaluation_counter, - "classifier": clf_type, - "preprocess": pre_processing, - "kernel": [i.lower() for i in classifier["kernel"]], # lowercase the values - "C": [2 ** x for x in classifier["C"]], # 2 ** c - "gamma": [2 ** x for x in classifier["gamma"]], # 2 ** gamma - "balance_classes": [change_weights_val(i) for i in classifier["balance_classes"]], - "n_fold": nfold_number - } - # append the pre-processing steps list - processes.append(process_dict) - # increase counter by 1 - trainings_counted += 1 - # increase evaluation counter by 1 - evaluation_counter += 1 - print("Trainings to be applied: {}".format(trainings_counted)) - - return processes + Returns: + A list of the processes that have been identified with the corresponding parameter grid. 
+ """ + evaluations = config["evaluations"]["nfoldcrossvalidation"] + print("Evaluations countered: {}".format(len(evaluations))) + evaluation_counter = 0 + trainings_counted = 0 + processes = [] + for evaluation in evaluations: + for nfold_number in evaluation["nfold"]: + classifiers = config["classifiers"]["svm"] + for classifier in classifiers: + for pre_processing in classifier["preprocessing"]: + for clf_type in classifier["type"]: + if clf_type == "C-SVC": + process_dict = { + "evaluation": evaluation_counter, + "classifier": clf_type, + "preprocess": pre_processing, + "kernel": [i.lower() for i in classifier["kernel"]], # lowercase the values + "C": [2 ** x for x in classifier["C"]], # 2 ** c + "gamma": [2 ** x for x in classifier["gamma"]], # 2 ** gamma + "balance_classes": [change_weights_val(i) for i in classifier["balance_classes"]], + "n_fold": nfold_number + } + # append the pre-processing steps list + processes.append(process_dict) + # increase counter by 1 + trainings_counted += 1 + # increase evaluation counter by 1 + evaluation_counter += 1 + + print("Trainings to be applied: {}".format(trainings_counted)) + + return processes diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index 78922830f..c4963759b 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -127,93 +127,75 @@ def count_json_low_level_files(self): print("counted json files: {}".format(counter)) -class DatasetExporter: +def create_df_tracks(config, tracks_list, train_class, exports_path, logger): """ TODO: Description - """ - def __init__(self, config, tracks_list, train_class, exports_path, logger): - self.config = config - self.tracks_list = tracks_list - self.train_class = train_class - self.exports_path = exports_path - self.logger = logger - - self.dataset_dir = "" - self.class_dir = "" - self.df_tracks = pd.DataFrame() - self.df_feats = pd.DataFrame() - self.y = [] - - - def create_df_tracks(self): - """ + Returns: TODO: Description - Returns: - TODO: Description - """ + """ - self.logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----") - self.dataset_dir = self.config.get("dataset_dir") - print('DATASET-DIR', self.dataset_dir) - dirpath = os.path.join(os.getcwd(), self.dataset_dir) - low_level_list = list() - for (dirpath, dirnames, filenames) in os.walk(dirpath): - low_level_list += [os.path.join(dirpath, file) for file in filenames if file.endswith(".json")] - if len(low_level_list) != 0: - self.logger.info("Low-level features for the tracks found.") - # processing the names of the tracks that are inside both the GT file and the low-level json files - # list with the tracks that are included in the low-level json files - tracks_existing_list = [e for e in self.tracks_list for i in low_level_list if e[0] in i] - # list with the low-level json tracks' paths that are included in tracks list - tracks_existing_path_list = [i for e in self.tracks_list for i in low_level_list if e[0] in i] - self.logger.debug("tracks existed found: {}".format(len(tracks_existing_list))) - self.logger.debug("tracks_path existed found: {}".format(len(tracks_existing_path_list))) - self.logger.debug("{}".format(tracks_existing_list[:4])) - self.logger.debug("{}".format(tracks_existing_path_list[:4])) - self.logger.debug("The founded tracks tracks listed successfully.") - self.logger.debug("Generate random number within a given 
range of listed tracks:") - # Random number between 0 and length of listed tracks - random_num = random.randrange(len(tracks_existing_list)) - self.logger.debug("Check if the tracks are the same in the same random index in both lists") - self.logger.debug("{}".format(tracks_existing_list[random_num])) - self.logger.debug("{}".format(tracks_existing_path_list[random_num])) - - self.tracks_list = tracks_existing_list - # create the dataframe with tracks that are bothe in low-level files and the GT file - self.df_tracks = pd.DataFrame(data=self.tracks_list, columns=["track", self.train_class]) - self.logger.debug("Shape of tracks DF created before cleaning: {}".format(self.df_tracks.shape)) - self.logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:") - self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) - - self.logger.debug("Drop rows with NULL values if they exist..") - if self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape[0] != 0: - self.df_tracks.dropna(inplace=True) - self.logger.debug("Check if there are NULL values after the cleaning process:") - self.logger.debug("{}".format(self.df_tracks[self.df_tracks.isnull().any(axis=1)].shape)) - self.logger.debug("Re-index the tracks DF..") - self.df_tracks = self.df_tracks.reset_index(drop=True) - else: - self.logger.info("There are no NULL values found.") - - # export shuffled tracks to CSV format - tracks_path = create_directory(self.exports_path, "tracks_csv_format") - self.df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(self.train_class))) - self.logger.debug("DF INFO:") - self.logger.debug("{}".format(self.df_tracks.info())) - self.logger.debug("COLUMNS CONTAIN OBJECTS: {}".format( - self.df_tracks.select_dtypes(include=['object']).columns)) - - self.df_feats = FeaturesDf(df_tracks=self.df_tracks, - train_class=self.train_class, - list_path_tracks=tracks_existing_path_list, - config=self.config, - exports_path=self.exports_path, - logger=self.logger, - ).create_low_level_df() - - self.y = self.df_tracks[self.train_class].values - self.logger.info("Features, Labels, and Tracks are exported successfully..") - return self.df_feats, self.y, self.df_tracks["track"].values - else: - self.logger.error("No low-level data found.") - return None, None, None + logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----") + dataset_dir = config.get("dataset_dir") + print('DATASET-DIR', dataset_dir) + dirpath = os.path.join(os.getcwd(), dataset_dir) + low_level_list = list() + for (dirpath, dirnames, filenames) in os.walk(dirpath): + low_level_list += [os.path.join(dirpath, file) for file in filenames if file.endswith(".json")] + if len(low_level_list) != 0: + logger.info("Low-level features for the tracks found.") + # processing the names of the tracks that are inside both the GT file and the low-level json files + # list with the tracks that are included in the low-level json files + tracks_existing_list = [e for e in tracks_list for i in low_level_list if e[0] in i] + # list with the low-level json tracks' paths that are included in tracks list + tracks_existing_path_list = [i for e in tracks_list for i in low_level_list if e[0] in i] + logger.debug("tracks existed found: {}".format(len(tracks_existing_list))) + logger.debug("tracks_path existed found: {}".format(len(tracks_existing_path_list))) + logger.debug("{}".format(tracks_existing_list[:4])) + logger.debug("{}".format(tracks_existing_path_list[:4])) + logger.debug("The found tracks are listed successfully.") + logger.debug("Generate random number within a
given range of listed tracks:") + # Random number between 0 and length of listed tracks + random_num = random.randrange(len(tracks_existing_list)) + logger.debug("Check if the tracks are the same in the same random index in both lists") + logger.debug("{}".format(tracks_existing_list[random_num])) + logger.debug("{}".format(tracks_existing_path_list[random_num])) + + tracks_list = tracks_existing_list + # create the dataframe with tracks that are both in low-level files and the GT file + df_tracks = pd.DataFrame(data=tracks_list, columns=["track", train_class]) + logger.debug("Shape of tracks DF created before cleaning: {}".format(df_tracks.shape)) + logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:") + logger.debug("{}".format(df_tracks[df_tracks.isnull().any(axis=1)].shape)) + + logger.debug("Drop rows with NULL values if they exist..") + if df_tracks[df_tracks.isnull().any(axis=1)].shape[0] != 0: + df_tracks.dropna(inplace=True) + logger.debug("Check if there are NULL values after the cleaning process:") + logger.debug("{}".format(df_tracks[df_tracks.isnull().any(axis=1)].shape)) + logger.debug("Re-index the tracks DF..") + df_tracks = df_tracks.reset_index(drop=True) else: + logger.info("There are no NULL values found.") + + # export shuffled tracks to CSV format + tracks_path = create_directory(exports_path, "tracks_csv_format") + df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(train_class))) + logger.debug("DF INFO:") + logger.debug("{}".format(df_tracks.info())) + logger.debug("COLUMNS CONTAIN OBJECTS: {}".format( + df_tracks.select_dtypes(include=['object']).columns)) + + df_feats = FeaturesDf(df_tracks=df_tracks, + train_class=train_class, + list_path_tracks=tracks_existing_path_list, + config=config, + exports_path=exports_path, + logger=logger + ).create_low_level_df() + + y = df_tracks[train_class].values + logger.info("Features, Labels, and Tracks are exported successfully..") + return df_feats, y, df_tracks["track"].values + else: + logger.error("No low-level data found.") + return None, None, None diff --git a/acousticbrainz/models/sklearn/transformation/transform_predictions.py b/acousticbrainz/models/sklearn/transformation/transform_predictions.py index c14402fca..81072ef5f 100644 --- a/acousticbrainz/models/sklearn/transformation/transform_predictions.py +++ b/acousticbrainz/models/sklearn/transformation/transform_predictions.py @@ -5,7 +5,6 @@ import os import six -from sklearn.base import BaseEstimator, TransformerMixin from ..transformation.utils_preprocessing import list_descr_handler from ..transformation.utils_preprocessing import feats_selector_list @@ -152,15 +151,3 @@ def post_processing(self): self.feats_prepared = full_gauss_pipeline.transform(self.df_feats) return self.feats_prepared - - -# Create a class to select numerical or categorical columns -class DataFrameSelector(BaseEstimator, TransformerMixin): - def __init__(self, attribute_names): - self.attribute_names = attribute_names - - def fit(self, X, y=None): - return self - - def transform(self, X): - return X[self.attribute_names].values From c8b73d23115eebb3a5b4cefe55647cf20b2748cb Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 7 Jul 2021 17:34:00 +0530 Subject: [PATCH 59/64] Simplify GroundTruthLoad class Eliminate unused methods and move the useful ones out to standalone functions.
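The diff below replaces the class with two plain functions; a sketch of the resulting call sequence, with a hypothetical file name and seed:

    ground_truth_data = load_local_ground_truth("groundtruth_danceability.yaml")
    class_name = ground_truth_data["className"]
    tracks_listed_shuffled = export_gt_tracks(ground_truth_data, seed=42)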
--- .../sklearn/classification/train_class.py | 9 +- .../transformation/load_ground_truth.py | 145 ++++-------------- 2 files changed, 36 insertions(+), 118 deletions(-) diff --git a/acousticbrainz/models/sklearn/classification/train_class.py b/acousticbrainz/models/sklearn/classification/train_class.py index 0979029fa..15296f221 100644 --- a/acousticbrainz/models/sklearn/classification/train_class.py +++ b/acousticbrainz/models/sklearn/classification/train_class.py @@ -3,19 +3,20 @@ import yaml from ..helper_functions.logging_tool import setup_logger -from ..transformation.load_ground_truth import GroundTruthLoad, create_df_tracks +from ..transformation.load_ground_truth import load_local_ground_truth, export_gt_tracks, create_df_tracks from ..classification.classification_task_manager import ClassificationTaskManager def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, log_level): exports_path = config["exports_path"] - gt_data = GroundTruthLoad(config, gt_file, exports_path, log_level) + ground_truth_data = load_local_ground_truth(gt_file) # tracks shuffled and exported - tracks_listed_shuffled = gt_data.export_gt_tracks() + tracks_listed_shuffled = export_gt_tracks(ground_truth_data, config.get("seed")) # class to train - class_name = gt_data.export_train_class() + class_name = ground_truth_data["className"] config["class_name"] = class_name + print("EXPORT CLASS NAME: {}".format(class_name)) config = update_parameters(config=config, c_values=c_values, diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index c4963759b..486cfdb9f 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -1,130 +1,47 @@ import os import yaml import pandas as pd -from pprint import pprint from termcolor import colored import random from ..helper_functions.utils import create_directory from ..transformation.load_low_level import FeaturesDf -class GroundTruthLoad: - """ - The Ground Truth data which contains the tracks and the corresponding - labels they belong to. The path to the related tracks' low-level data - (features in JSON format) can be extracted from this file too. - """ - def __init__(self, config, gt_filename, exports_path, log_level): - """ - Args: - config: - gt_filename: - exports_path: - log_level: - """ - self.config = config - self.gt_filename = gt_filename - self.exports_path = exports_path - self.log_level = log_level - - self.logger = "" - self.class_dir = "" - self.ground_truth_data = {} - self.labeled_tracks = {} - self.train_class = "" - self.dataset_dir = "" - self.tracks = [] - - self.load_local_ground_truth() - - def load_local_ground_truth(self): - """ - Loads the the ground truth file. The dataset directory is specified through - the parsing arguments of the create_classification_project method. - """ - self.dataset_dir = self.config.get("dataset_dir") - with open(self.gt_filename, "r") as stream: - try: - self.ground_truth_data = yaml.safe_load(stream) - print("Ground truth file loaded.") - except yaml.YAMLError as exc: - print("Error in loading the ground truth file.") - print(exc) - - def export_train_class(self): - """ - Returns: - The target class to be modeled. 
- """ - self.train_class = self.ground_truth_data["className"] - print("EXPORT CLASS NAME: {}".format(self.train_class)) - return self.train_class +def load_local_ground_truth(gt_filename): + """ Loads the the ground truth file. - def export_gt_tracks(self): - """ - It takes a dictionary of the tracks from the groundtruth and it transforms it - to a list of tuples (track, label). Then it shuffles the list based on the seed - specified in the configuration file, and returns that shuffled list. - Returns: - A list of tuples with the tracks and their corresponding labels. - """ - self.labeled_tracks = self.ground_truth_data["groundTruth"] - tracks_list = [] - for track, label in self.labeled_tracks.items(): - tracks_list.append((track, label)) - print(colored("SEED is set to: {}".format(self.config.get("seed"), "cyan"))) - random.seed(a=self.config.get("seed")) - random.shuffle(tracks_list) - print("Listed tracks in GT file: {}".format(len(tracks_list))) - return tracks_list - - def check_ground_truth_data(self): - """ - Prints a dictionary of the groundtruth data in the corresponding yaml file. - It contains the target class and the tracks. - """ - pprint(self.ground_truth_data) + The Ground Truth data which contains the tracks and the corresponding + labels they belong to. The path to the related tracks' low-level data + (features in JSON format) can be extracted from this file too. + """ + with open(gt_filename, "r") as stream: + try: + ground_truth_data = yaml.safe_load(stream) + print("Ground truth file loaded.") + return ground_truth_data + except yaml.YAMLError as exc: + print("Error in loading the ground truth file.") + print(exc) - def check_ground_truth_info(self): - """ - Prints information about the groundtruth data that is loaded in a dictionary: - * The target class - * The tracks with their labels - * The tracks themselves - """ - len(self.ground_truth_data["groundTruth"].keys()) - print("Ground truth data class/target: {}".format(self.ground_truth_data["className"])) - print("Label tracks: {}".format(type(self.labeled_tracks))) - print("Ground truth data keys - tracks: {}".format(len(self.ground_truth_data["groundTruth"].keys()))) - def check_tracks_folders(self): - """ - Prints the directories that contain the low-level data. - """ - if len(self.labeled_tracks.keys()) is not 0: - folders = [] - for key in self.labeled_tracks: - key = key.split('/') - path_sub_dir = '/'.join(key[:-1]) - folders.append(path_sub_dir) - folders = set(folders) - folders = list(folders) - folders.sort() - print("Directories that contain the low-level JSON data:") - print("{}".format(folders)) +def export_gt_tracks(ground_truth_data, seed): + """ + It takes a dictionary of the tracks from the groundtruth and it transforms it + to a list of tuples (track, label). Then it shuffles the list based on the seed + specified in the configuration file, and returns that shuffled list. - def count_json_low_level_files(self): - """ - Prints the JSON low-level data that is contained inside the dataset directory (the dataset - directory is declared in configuration file). - """ - counter = 0 - for root, dirs, files in os.walk(os.path.join(os.getcwd(), self.dataset_dir)): - for file in files: - if file.endswith(".json"): - # print(os.path.join(root, file)) - counter += 1 - print("counted json files: {}".format(counter)) + Returns: + A list of tuples with the tracks and their corresponding labels. 
+ """ + labeled_tracks = ground_truth_data["groundTruth"] + tracks_list = [] + for track, label in labeled_tracks.items(): + tracks_list.append((track, label)) + print(colored("SEED is set to: {}".format(seed, "cyan"))) + random.seed(a=seed) + random.shuffle(tracks_list) + print("Listed tracks in GT file: {}".format(len(tracks_list))) + return tracks_list def create_df_tracks(config, tracks_list, train_class, exports_path, logger): From 0643e38a469cd9ed17cbee846b6ea1e0c459a19e Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 7 Jul 2021 19:45:21 +0530 Subject: [PATCH 60/64] Simplify FeaturesDf by removing redundant methods and useful one out to function --- .../transformation/load_ground_truth.py | 10 +- .../sklearn/transformation/load_low_level.py | 121 ++++++------------ 2 files changed, 40 insertions(+), 91 deletions(-) diff --git a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py index 486cfdb9f..79b798b81 100644 --- a/acousticbrainz/models/sklearn/transformation/load_ground_truth.py +++ b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py @@ -4,7 +4,7 @@ from termcolor import colored import random from ..helper_functions.utils import create_directory -from ..transformation.load_low_level import FeaturesDf +from ..transformation.load_low_level import create_low_level_features_df def load_local_ground_truth(gt_filename): @@ -102,13 +102,7 @@ def create_df_tracks(config, tracks_list, train_class, exports_path, logger): logger.debug("COLUMNS CONTAIN OBJECTS: {}".format( df_tracks.select_dtypes(include=['object']).columns)) - df_feats = FeaturesDf(df_tracks=df_tracks, - train_class=train_class, - list_path_tracks=tracks_existing_path_list, - config=config, - exports_path=exports_path, - logger=logger - ).create_low_level_df() + df_feats = create_low_level_features_df(tracks_existing_path_list, logger) y = df_tracks[train_class].values logger.info("Features, Labels, and Tracks are exported successfully..") diff --git a/acousticbrainz/models/sklearn/transformation/load_low_level.py b/acousticbrainz/models/sklearn/transformation/load_low_level.py index 51f319db8..6fd418aed 100644 --- a/acousticbrainz/models/sklearn/transformation/load_low_level.py +++ b/acousticbrainz/models/sklearn/transformation/load_low_level.py @@ -1,91 +1,46 @@ -import os import json import pandas as pd from ..transformation.utils_preprocessing import flatten_dict_full -class FeaturesDf: +def create_low_level_features_df(list_path_tracks, logger): """ - Features DataFrame object by the JSON low-level data. - Attributes: - df_tracks (Pandas DataFrame): The tracks DataFrame that contains the track name, track low-level path, - label, etc. + Creates the low-level DataFrame. Cleans also the low-level data from the unnecessary features before creating + the DF. + Returns: + The low-level features (pandas DataFrame) from all the tracks in the collection. 
""" - def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path, logger): - self.df_tracks = df_tracks - self.train_class = train_class - self.list_path_tracks = list_path_tracks - self.config = config - self.exports_path = exports_path - self.logger = logger - self.list_feats_tracks = [] - self.counter_items_transformed = 0 - self.df_feats_tracks = pd.DataFrame() - self.df_feats_label = pd.DataFrame() + logger.info("---- CREATE LOW LEVEL DATAFRAME ----") + + list_feats_tracks = [] + counter_items_transformed = 0 + + for track_low_level_path in list_path_tracks: + try: + f = open(track_low_level_path) + data_feats_item = json.load(f, strict=False) + except Exception as e: + print("Exception occurred in loading file:", e) + logger.warning("Exception occurred in loading file: {}".format(e)) + # remove unnecessary features data + try: + if 'beats_position' in data_feats_item['rhythm']: + del data_feats_item['rhythm']['beats_position'] + except Exception as e: + print("There is no 'rhythm' key in the low level data. Exception:", e) + + # data dictionary transformed to a fully flattened dictionary + data_feats_item = flatten_dict_full(data_feats_item) + + # append to a full tracks features pandas df + list_feats_tracks.append(dict(data_feats_item)) + + counter_items_transformed += 1 + + # The dictionary's keys list is transformed to type + df_feats_tracks = pd.DataFrame(list_feats_tracks, columns=list(list_feats_tracks[0].keys())) + logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format( + df_feats_tracks.select_dtypes(include=['object']).columns)) + logger.info("Exporting low-level data (DataFrame)..") + return df_feats_tracks - - def create_low_level_df(self): - """ - Creates the low-level DataFrame. Cleans also the low-level data from the unnecessary features before creating - the DF. - Returns: - The low-level features (pandas DataFrame) from all the tracks in the collection. - """ - self.logger.info("---- CREATE LOW LEVEL DATAFRAME ----") - # clear the list if it not empty - self.list_feats_tracks.clear() - for track_low_level_path in self.list_path_tracks: - try: - f = open(track_low_level_path) - data_feats_item = json.load(f, strict=False) - except Exception as e: - print("Exception occurred in loading file:", e) - self.logger.warning("Exception occurred in loading file: {}".format(e)) - # remove unnecessary features data - try: - if 'beats_position' in data_feats_item['rhythm']: - del data_feats_item['rhythm']['beats_position'] - except Exception as e: - print("There is no 'rhythm' key in the low level data. Exception:", e) - - # data dictionary transformed to a fully flattened dictionary - data_feats_item = flatten_dict_full(data_feats_item) - - # append to a full tracks features pandas df - self.list_feats_tracks.append(dict(data_feats_item)) - - self.counter_items_transformed += 1 - - # The dictionary's keys list is transformed to type - self.df_feats_tracks = pd.DataFrame(self.list_feats_tracks, columns=list(self.list_feats_tracks[0].keys())) - self.logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format( - self.df_feats_tracks.select_dtypes(include=['object']).columns)) - self.logger.info("Exporting low-level data (DataFrame)..") - return self.df_feats_tracks - - def check_processing_info(self): - """ - Prints some information about the low-level data to DataFrame transformation step and its middle processes. 
- """ - self.logger.info('Items parsed and transformed: {}'.format(self.counter_items_transformed)) - # The type of the dictionary's keys list is: - self.logger.info('Type of the list of features keys: {}'.format(type(self.list_feats_tracks[0].keys()))) - # The dictionary's keys list is transformed to type - self.logger.info('Confirm the type of list transformation of features keys: {}' - .format(type(list(self.list_feats_tracks[0].keys())))) - - def export_tracks_feats_df(self): - """ - Returns: - The tracks (pandas DataFrame) with all the ground truth data and the - corresponding low-level data flattened. - """ - self.logger.info("Concatenating the tracks/labels data DataFrame with the features DataFrame.") - self.logger.info("TRACKS SHAPE: {}".format(self.df_tracks.shape)) - self.logger.info("LOW LEVEL: {}".format(self.df_feats_tracks.shape)) - - self.df_feats_label = pd.concat([self.df_tracks, self.df_feats_tracks], axis=1) - self.logger.info("FULL: {}".format(self.df_feats_label.shape)) - self.logger.info("COLUMNS CONTAIN OBJECTS: {}" - .format(self.df_feats_label.select_dtypes(include=['object']).columns)) - return self.df_feats_label From 2d7d50cf118b8b3a8769c7c3d37cf500f6545637 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 7 Jul 2021 22:07:07 +0530 Subject: [PATCH 61/64] Fix file and error handling in features_df --- .../models/sklearn/transformation/load_low_level.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/acousticbrainz/models/sklearn/transformation/load_low_level.py b/acousticbrainz/models/sklearn/transformation/load_low_level.py index 6fd418aed..d1698c81f 100644 --- a/acousticbrainz/models/sklearn/transformation/load_low_level.py +++ b/acousticbrainz/models/sklearn/transformation/load_low_level.py @@ -17,17 +17,16 @@ def create_low_level_features_df(list_path_tracks, logger): for track_low_level_path in list_path_tracks: try: - f = open(track_low_level_path) - data_feats_item = json.load(f, strict=False) - except Exception as e: - print("Exception occurred in loading file:", e) - logger.warning("Exception occurred in loading file: {}".format(e)) + with open(track_low_level_path) as f: + data_feats_item = json.load(f, strict=False) + except Exception: + logger.error("Exception occurred in loading file:", exc_info=True) # remove unnecessary features data try: if 'beats_position' in data_feats_item['rhythm']: del data_feats_item['rhythm']['beats_position'] - except Exception as e: - print("There is no 'rhythm' key in the low level data. 
Exception:", e) + except KeyError: + logger.error("There is no 'rhythm' key in the low level data.", exc_info=True) # data dictionary transformed to a fully flattened dictionary data_feats_item = flatten_dict_full(data_feats_item) From d3fd8a7297d1cdb56c20ffc3e6b20d9e7e43d1b4 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 7 Jul 2021 23:38:00 +0530 Subject: [PATCH 62/64] Simplify ConfusionMatrixCreation --- .../classification/confusion_matrix_export.py | 247 +----------------- .../sklearn/classification/matrix_creation.py | 7 +- 2 files changed, 11 insertions(+), 243 deletions(-) diff --git a/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py b/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py index 3822e4403..aa382c2ae 100644 --- a/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py +++ b/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py @@ -1,245 +1,14 @@ # encoding: utf-8 from collections import defaultdict import json -from math import sqrt +def load_as_confusion_matrix(filename): + with open(filename) as f: + data = json.load(f) -class ConfusionMatrixCreation: + # convert to a defaultdict the data we just loaded + matrix = defaultdict(lambda: defaultdict(list)) + for k, v in data['matrix'].items(): + matrix[k] = defaultdict(list, v) - def __init__(self): - self.matrix = defaultdict(lambda: defaultdict(list)) - self.folds = dict() - - def load(self, filename): - with open(filename) as f: - data = json.load(f) - # print(data) - - # convert to a defaultdict the data we just loaded - self.matrix = defaultdict(lambda: defaultdict(list)) - for k, v in data['matrix'].items(): - self.matrix[k] = defaultdict(list, v) - # print(self.matrix[k]) - - if "fold" in data: - self.folds = data['fold'] - - def save(self, filename): - # convert to "normal" dicts before saving - data = { - 'matrix': dict((k, dict(v)) for k, v in self.matrix.items()), - 'fold': self.folds - } - # with open(filename, 'w') as f: - # yaml.dump(data, f) - - with open(filename, 'w') as f: - json.dump(data, f) - - def add(self, expected, predicted, name=''): - self.matrix[expected][predicted] += [name] - - def addNfold(self, expected, predicted, name, nfold): - self.matrix[expected][predicted] += [name] - self.folds[name] = nfold - - def matrixNfold(self, nfold): - nfoldDict = defaultdict(lambda: defaultdict(list)) - for c in self.matrix: - for d in self.matrix[c]: - for e in self.matrix[c][d]: - if self.folds[e] == nfold: - nfoldDict[c][d].append(e) - return nfoldDict - - def stdNfold(self, normalizedAccuracies=False): - """Return standard deviation of the accuracies across folds.""" - - if normalizedAccuracies: - accuracies = self.normalizedAccuraciesNFold() - else: - accuracies = self.accuraciesNFold() - - # TODO the following lines compute standard deviation. 
In - # the future we can use stdev method from the statistics - # package, shipped by default since Python 3.4 - acc_mean = sum(accuracies) / len(accuracies) - - return sqrt(sum([(x - acc_mean) * (x - acc_mean) - for x in accuracies]) / len(accuracies)) - - def classes(self): - allClasses = set() - - for c in self.matrix: - allClasses.add(c) - for d in self.matrix[c]: - allClasses.add(d) - - return allClasses - - def total(self): - """Return the total number of classification instances.""" - result = 0 - for c in self.matrix: - for d in self.matrix[c]: - result += len(self.matrix[c][d]) - return result - - def totalNfold(self, fold): - """Return the total number of classification instances for a given fold.""" - matrix = self.matrixNfold(fold) - result = 0 - for c in matrix: - for d in matrix[c]: - result += len(matrix[c][d]) - return result - - def correct(self): - """Return the number of correctly classified instances.""" - result = 0 - for c in self.matrix: - result += len(self.matrix[c][c]) - return result - - def correctNfold(self, fold): - """Return the number of correctly classified instances for a given fold.""" - matrix = self.matrixNfold(fold) - result = 0 - for c in matrix: - result += len(matrix[c][c]) - return result - - def toDict(self): - """Format nicely the confusion matrix as normal dict, replace list of - instances by number of them.""" - allClasses = self.classes() - - # build resulting dict - result = {} - for c in allClasses: - result[c] = {} - for d in allClasses: - result[c][d] = len(self.matrix[c][d]) - - return result - - def results(self): - good = self.correct() - total = self.total() - return 'Correctly classified: %d out of %d (%d%%)' % (good, total, 100*good//total) - - def accuraciesNFold(self): - '''Return accuracies per fold.''' - folds = set(self.folds.values()) - - if not bool(folds): - raise('This matrix does not contain information about folds') - - return [self.correctNfold(f) * 100. / self.totalNfold(f) - for f in folds] - - def normalizedAccuraciesNFold(self): - '''Returns the normalized accuracy.''' - folds = set(self.folds.values()) - - if not bool(folds): - raise('This matrix does not contain information about folds') - - foldAccuracies = [] - - for f in folds: - classAccuracies = [] - matrix = self.matrixNfold(f) - - for c in matrix: - classElements = 0 - for e in matrix[c]: - classElements += len(matrix[c][e]) - - classAccuracies.append(len(matrix[c][c]) * 100. / classElements) - - foldAccuracies.append(sum(classAccuracies) / len(classAccuracies)) - - return foldAccuracies - - def accuracy(self): - accuracies = self.accuraciesNFold() - return sum(accuracies) / len(accuracies) - - def normalizedAccuracy(self): - accuracies = self.normalizedAccuraciesNFold() - return sum(accuracies) / len(accuracies) - - def toHtml(self, standAlone = True, embedStyleSheet = True): - html = '
<table>...'  # [HTML-building strings largely lost in extraction; the method rendered a confusion-matrix table with a "Predicted (%)" header row and per-class cells]
' - html += '' - - html += '' - - labels = self.classes() - diff --git a/models/sklearn/classification/evaluation.py b/models/sklearn/classification/evaluation.py index 5cf5c0d6a..082baef99 100644 --- a/models/sklearn/classification/evaluation.py +++ b/models/sklearn/classification/evaluation.py @@ -5,165 +5,220 @@ import seaborn as sns import matplotlib.pyplot as plt from termcolor import colored -from pprint import pprint import yaml from sklearn.model_selection import KFold from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix, classification_report import joblib -import requests -from utils import load_yaml, FindCreateDirectory, TrainingProcesses -from transformation.transform import Transform -from transformation.utils_preprocessing import flatten_dict_full -from classification.report_files_export import export_report -from logging_tool import LoggerSetup +from ..helper_functions.utils import FindCreateDirectory +from ..transformation.transform import Transform +from ..classification.report_files_export import export_report +from ..helper_functions.logging_tool import LoggerSetup -def fold_evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, log_level): - # logger setup + +def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, log_level): + print(colored("------ EVALUATION and FOLDING ------", "yellow")) logger = LoggerSetup(config=config, exports_path=exports_path, - name="evaluation_{}".format(class_name), + name="train_model_{}".format(class_name), train_class=class_name, - mode="w", + mode="a", level=log_level).setup_logger() - logger.info("---- EVALUATION of the model in the Folded dataset as well as in the whole dataset ----") - print("---- EVALUATION of the model in the Folded dataset as well as in the whole dataset ----") - print(colored("Evaluation and Folding..", "yellow")) + logger.info("---- Folded evaluation of the model in the dataset ----") logger.info("number of folds set to config: {}".format(n_fold)) logger.debug("Sample of shuffled tracks tracks:") logger.debug("{}".format(tracks[:5])) logger.debug("Tracks list length: {}".format(len(tracks))) - exports_dir = "{}_{}".format(config.get("exports_directory"), class_name) + # load project directory and the corresponding save paths + exports_dir = config.get("exports_directory") + + dataset_path = FindCreateDirectory(exports_path, + os.path.join(exports_dir, "dataset")).inspect_directory() + models_path = FindCreateDirectory(exports_path, + os.path.join(exports_dir, "models")).inspect_directory() + images_path = FindCreateDirectory(exports_path, + os.path.join(exports_dir, "images")).inspect_directory() # load best model load_model_params_path = os.path.join(exports_path, exports_dir, "best_model_{}.json".format(class_name)) with open(load_model_params_path) as model_params_file: model_params_data = json.load(model_params_file) - + logger.info("Best model preprocessing step: {}".format(process)) - models_path = FindCreateDirectory(exports_path, - os.path.join(exports_dir, "models")).inspect_directory() + # load the saved classifier clf = joblib.load(os.path.join(models_path, "model_grid_{}.pkl".format(process))) logger.info("Best model loaded.") - # inner with K-Fold cross-validation declaration + # inner K-Fold cross-validation declaration random_seed = None shuffle = config["k_fold_shuffle"] if shuffle is True: random_seed = config["seed"] elif shuffle is False: random_seed = None - print("Fitting the data to the classifier with K-Fold 
cross-validation..") logger.info("Fitting the data to the classifier with K-Fold cross-validation..") inner_cv = KFold(n_splits=n_fold, shuffle=shuffle, - random_state=random_seed - ) - logger.debug("Type of X: {}".format(type(X))) - logger.debug("Type of y: {}".format(type(y))) - # tracks_fold_indexing = [] - tracks_fold_indexing_dict = {} - print(tracks[0]) - print(tracks[4]) + random_state=random_seed) - # transformation of the data + # transformation of the data to proper features based on the preprocess step features_prepared = Transform(config=config, df_feats=X, process=process, train_class=class_name, exports_path=exports_path, log_level=log_level).post_processing() - logger.debug("features prepared shape: {}".format(features_prepared.shape)) + logger.debug("Features prepared shape: {}".format(features_prepared.shape)) + + # Starting Training, Predictions for each fold + logger.info("Starting fold-evaluation..") + predictions_df_list, accuracy_model, tracks_fold_indexing_dict = predictions_fold(clf=clf, + inner_cv=inner_cv, + feats_prepared=features_prepared, + y=y, + tracks=tracks, + class_name=class_name, + logger=logger) + + # concatenate the folded predictions DFs + df_predictions = create_dataset_predictions(list_df_predictions=predictions_df_list, + class_name=class_name, + dataset_path=dataset_path, + logger=logger) + + # ACCURACIES in each fold + export_accuracies(accuracy_model=accuracy_model, + config=config, + class_name=class_name, + exports_path=exports_path, + images_path=images_path, + logger=logger) + # Folded Tracks Dictionary + export_folded_instances(tracks_fold_indexing_dict=tracks_fold_indexing_dict, + class_name=class_name, + dataset_path=dataset_path, + logger=logger) + + # Evaluation to the folded Dataset + export_evaluation_results(config=config, + set_name="Folded", + y_true_values=df_predictions[class_name], + predictions=df_predictions["predictions"], + class_name=class_name, + exports_path=exports_path, + logger=logger + ) + + # Train to the whole dataset + logger.info("Train the classifier with the whole dataset..") + clf.fit(features_prepared, y) + # prediction for the whole dataset + predictions_all = clf.predict(features_prepared) + # save the model that is trained to the whole dataset + best_model_path = os.path.join(exports_path, exports_dir, "best_clf_model.pkl") + joblib.dump(clf, best_model_path) + logger.info("Best model saved.") + # Evaluation to the whole Dataset + export_evaluation_results(config=config, + set_name="Whole", + y_true_values=y, + predictions=predictions_all, + class_name=class_name, + exports_path=exports_path, + logger=logger + ) + + +def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logger): + tracks_fold_indexing_dict = {} accuracy_model = [] predictions_df_list = [] fold_number = 0 - for train_index, test_index in inner_cv.split(features_prepared): - print("Fold: {}".format(fold_number)) - logger.info("FOLD: {}".format(fold_number)) - # print("TRAIN INDEX: ", train_index) - print("first test index element: {} - last test index element: {}".format(test_index[0], test_index[-1])) + for train_index, test_index in inner_cv.split(feats_prepared): + logger.info("FOLD {} - Analyzing, Fitting, Predicting".format(fold_number)) logger.debug("first test index element: {} - last test index element: {}".format(test_index[0], test_index[-1])) logger.debug("TEST INDEX: {}".format(test_index)) logger.debug("Length of the test index array: {}".format(len(test_index))) + # tracks indexing list for each fold 
tracks_count = 0 tracks_list = [] for index in test_index: - # print(tracks[index]) tracks_fold_indexing_dict[tracks[index]] = fold_number tracks_list.append(tracks[index]) tracks_count += 1 - print(colored("Tracks indexed to the specific fold: {}".format(tracks_count), "cyan")) - X_train, X_test = features_prepared[train_index], features_prepared[test_index] + logger.debug("Tracks indexed to the specific fold: {}".format(tracks_count)) + X_train, X_test = feats_prepared[train_index], feats_prepared[test_index] y_train, y_test = y[train_index], y[test_index] # Train the model - print("Fitting for fold {}".format(fold_number)) clf.fit(X_train, y_train) - logger.info("Classifier classes: {}".format(clf.classes_)) - # predictions - print("Predicting for the specific fold..") - logger.info("Predicting for the specific fold..") - logger.info("Predictions outputs") - pred = clf.predict(X_test) - logger.debug("predictions type after applying classifier's predict {}".format(type(pred))) - logger.debug("predictions shape: {}".format(pred.shape)) - df_pred = pd.DataFrame(data=pred, index=test_index, columns=["predictions"]) - logger.debug("Transforming to dataframe") - logger.debug("\n{}".format(df_pred.head())) - # predictions probabilities - logger.info("Predictions Probabilities outputs") - pred_prob = clf.predict_proba(X_test) - df_pred_prob = pd.DataFrame(data=pred_prob, index=test_index, columns=clf.classes_) - logger.debug("Transforming to dataframe") - logger.debug("\n{}".format(df_pred_prob.head())) - print("Tracks dataframe set..") - logger.info("Tracks dataframe set..") - # tracks df - df_tracks = pd.DataFrame(data=tracks_list, index=test_index, columns=["track"]) - logger.debug("\n{}".format(df_tracks.head())) - # y_test series - print("True values set..") - logger.info("True values set..") - logger.debug("Transforming to dataframe") - y_test_series = pd.DataFrame(data=y_test, index=test_index, columns=[class_name]) - logger.debug("\n{}".format(y_test_series.head())) - # concatenate dfs - logger.info("Concatenating DF..") - df_pred_general = pd.concat([df_tracks, df_pred_prob, df_pred, y_test_series], axis=1, ignore_index=False) - logger.debug("\n{}".format(df_pred_general.head())) - # predictions_all_df.append(df_pred_general, ignore_index=True) + logger.debug("Classifier classes: {}".format(clf.classes_)) + # create a df for this fold with the predictions + df_pred_general = fold_predictions(clf=clf, + class_name=class_name, + X_test=X_test, + test_index=test_index, + tracks_list=tracks_list, + y_test=y_test, + logger=logger) + # Append the folded dataset to a list that will contain all the folded datasets: predictions_df_list.append(df_pred_general) - # Append to accuracy_model the accuracy of the model + # Append each accuracy of the folded model to a list that contains all the accuracies resulted from each fold accuracy_model.append(accuracy_score(y_test, clf.predict(X_test), normalize=True) * 100) fold_number += 1 - print() - print() - # concatenate predictions dfs - print(colored("Make Predictions DataFrame for all the folded instances together..", "cyan")) - logger.info("Make Predictions DataFrame for all the folded instances together..") - df_predictions = pd.concat(predictions_df_list) - logger.debug("\n{}".format(df_predictions.head())) - logger.debug("Info:") - logger.debug("\n{}".format(df_predictions.info())) - # save predictions df - logger.info("Saving the unified predictions DataFrame locally.") - dataset_path = FindCreateDirectory(exports_path, - 
os.path.join(exports_dir, "dataset")).inspect_directory() - df_predictions.to_csv(os.path.join(dataset_path, "predictions_{}.csv".format(class_name))) + return predictions_df_list, accuracy_model, tracks_fold_indexing_dict + + +def fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test, logger): + """ + Creates a pandas DataFrame from each fold with the predictions in + order later to extract the shuffled dataset with the tracks, the percentage + of the prediction probability for each class, the prediction, and the true + value. - # ACCURACIES - print(colored("Accuracies in each fold: {}".format(accuracy_model), "cyan")) - print(colored("Mean of accuracies: {}".format(np.mean(accuracy_model)), "cyan")) - print(colored("Standard Deviation of accuracies: {}".format(np.std(accuracy_model)), "cyan")) + Args: + clf: + class_name: + X_test: + test_index: + tracks_list: + y_test: + logger: + + Returns: + A pandas DataFrame with the predictions at each fold. + """ + # predictions for the features test + pred = clf.predict(X_test) + # predictions numpy array transformation to pandas DF + df_pred = pd.DataFrame(data=pred, index=test_index, columns=["predictions"]) + # predictions' probabilities + pred_prob = clf.predict_proba(X_test) + # predictions' probabilities numpy array transformation to pandas DF + df_pred_prob = pd.DataFrame(data=pred_prob, index=test_index, columns=clf.classes_) + # tracks list transformation to pandas DF + df_tracks = pd.DataFrame(data=tracks_list, index=test_index, columns=["track"]) + logger.debug("\n{}".format(df_tracks.head())) + # y_test pandas Series transformation to pandas DF + y_test_series = pd.DataFrame(data=y_test, index=test_index, columns=[class_name]) + # concatenate the 4 DFs above to 1 for saving the resulted dataset + # (tracks, predictions' probabilities, predictions, true) + logger.debug("Concatenating DF..") + df_pred_general = pd.concat([df_tracks, df_pred_prob, df_pred, y_test_series], axis=1, ignore_index=False) + + return df_pred_general + + +def export_accuracies(accuracy_model, config, class_name, exports_path, images_path, logger): logger.info("Accuracies in each fold: {}".format(accuracy_model)) logger.info("Mean of accuracies: {}".format(np.mean(accuracy_model))) logger.info("Standard Deviation of accuracies: {}".format(np.std(accuracy_model))) - accuracies_export = "Accuracies in each fold: {} \nMean of accuracies: {} \nStandard Deviation of accuracies: {}"\ + accuracies_export = "Accuracies in each fold: {} \nMean of accuracies: {} \nStandard Deviation of accuracies: {}" \ .format(accuracy_model, np.mean(accuracy_model), np.std(accuracy_model)) export_report(config=config, name="Accuracies results", @@ -172,108 +227,77 @@ def fold_evaluation(config, n_fold, X, y, class_name, tracks, process, exports_p train_class=class_name, exports_path=exports_path) - # Visualize accuracy for each iteration - logger.info("Visualize accuracy for each iteration..") + # Visualize accuracy for each iteration in a distribution plot + create_accuracies_dist_plot(accuracies_list=accuracy_model, + images_path=images_path, + logger=logger) + + +def create_dataset_predictions(list_df_predictions, class_name, dataset_path, logger): + logger.info("Make Predictions DataFrame for all the folded instances together.") + df_concat_predictions = pd.concat(list_df_predictions) + logger.debug("\n{}".format(df_concat_predictions.head())) + logger.debug("Info:") + logger.debug("\n{}".format(df_concat_predictions.info())) + # save predictions df + 
logger.info("Saving the unified predictions DataFrame locally.") + df_concat_predictions.to_csv(os.path.join(dataset_path, "predictions_{}.csv".format(class_name))) + + return df_concat_predictions + + +def create_accuracies_dist_plot(accuracies_list, images_path, logger): + logger.info("Visualize accuracy for each iteration.") list_folds = [] counter_folds = 0 - for accuracy in accuracy_model: + for accuracy in accuracies_list: list_folds.append("Fold{}".format(counter_folds)) counter_folds += 1 - print("Exporting accuracies distribution to plot file..") - logger.info("Exporting accuracies distribution to plot file..") - scores = pd.DataFrame(accuracy_model, columns=['Scores']) + logger.debug("Exporting accuracies distribution to plot file..") + scores = pd.DataFrame(accuracies_list, columns=['Scores']) sns.set(style="white", rc={"lines.linewidth": 3}) sns.barplot(x=list_folds, y="Scores", data=scores) - images_path = FindCreateDirectory(exports_path, - os.path.join(exports_dir, "images")).inspect_directory() plt.savefig(os.path.join(images_path, "accuracies_distribution.png")) sns.set() plt.close() logger.info("Plot saved successfully.") - # Folded Tracks Dictionary - print("Writing Folded Tracks Dictionary locally to check where each track is folded..") + +def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path, logger): logger.info("Writing Folded Tracks Dictionary locally to check where each track is folded..") logger.debug("length of keys: {}".format(len(tracks_fold_indexing_dict.keys()))) - folded_dataset_path = os.path.join(dataset_path, "{}.yaml".format(class_name)) + folded_dataset_path = os.path.join(dataset_path, "{}.yaml".format(class_name)) with open(folded_dataset_path, 'w') as file: folded_dataset = yaml.dump(tracks_fold_indexing_dict, file) logger.info("Folded dataset written successfully to disk.") - # EVALUATION REPORTS - print(colored("Evaluation Reports", "cyan")) - logger.info("---- EVALUATION REPORTS ----") - - # Folded Dataset - print(colored("Evaluation to the folded dataset..", "cyan")) - logger.info("Evaluation to the folded dataset..") +def export_evaluation_results(config, set_name, y_true_values, predictions, class_name, exports_path, logger): + logger.info("---- Evaluation to the {} dataset ----".format(set_name)) # Confusion Matrix - print("Exporting Confusion Matrix applied to the folded dataset..") - logger.info("Confusion Matrix applied to the folded dataset..") - cm = confusion_matrix(y_true=df_predictions[class_name], y_pred=df_predictions["predictions"]) + logger.info("Exporting Confusion Matrix applied to the {} dataset..".format(set_name)) + cm = confusion_matrix(y_true=y_true_values, y_pred=predictions) logger.info("\n{}".format(cm)) - # Confusion Matrix Normalized - print("Exporting Normalized Confusion Matrix applied to the folded dataset..") - logger.info("Normalized Confusion Matrix applied to the folded dataset..") + logger.info("Exporting Normalized Confusion Matrix applied to the {} dataset..".format(set_name)) cm_normalized = (cm / cm.astype(np.float).sum(axis=1) * 100) logger.info("\n{}".format(cm_normalized)) cm_all = "Actual instances\n{}\n\nNormalized\n{}".format(cm, cm_normalized) + # export the confusion matrix report for the folded dataset export_report(config=config, - name="Folded Data Confusion Matrix", + name="{} Data Confusion Matrix".format(set_name), report=cm_all, - filename="confusion_matrix_fold", + filename="confusion_matrix_{}".format(set_name), train_class=class_name, exports_path=exports_path) - # 
Classification Report - print("Exporting Classification Report applied to the folded dataset..") - logger.info("Classification Report applied to the folded dataset..") - cr = classification_report(y_true=df_predictions[class_name], y_pred=df_predictions["predictions"]) + logger.info("Exporting Classification Report applied to the {} dataset..".format(set_name)) + cr = classification_report(y_true=y_true_values, y_pred=predictions) + # export the Classification report for the whole dataset export_report(config=config, - name="Folded Data Classification Report", + name="{} Data Classification Report".format(set_name), report=cr, - filename="classification_report_fold", + filename="classification_report_{}".format(set_name), train_class=class_name, exports_path=exports_path) - - logger.info("The folded dataset has been evaluated successfully..") - print(colored("The folded dataset has been evaluated successfully..", "green")) - - # # save the model - # models_path = FindCreateDirectory(os.path.join(exports_path, "models")).inspect_directory() - # model_save_path = os.path.join(models_path, "model.pkl") - # joblib.dump(clf, model_save_path) - # - # train with all the data of the dataset - print(colored("Evaluation to the whole dataset..", "cyan")) - logger.info("Evaluation to the whole dataset..") - clf.fit(features_prepared, y) - predictions_proba_all = clf.predict_proba(features_prepared) - predictions_all = clf.predict(features_prepared) - logger.info("Confusion Matrix applied to the whole dataset..") - cm_full = confusion_matrix(y_true=y, y_pred=predictions_all) - logger.info("\n{}".format(cm_full)) - logger.info("Normalized Confusion Matrix applied to the whole dataset..") - cm_full_normalized = (cm_full / cm_full.astype(np.float).sum(axis=1) * 100) - logger.info("\n{}".format(cm_full_normalized)) - cm_full_all = "Actual instances\n{}\n\nNormalized\n{}".format(cm_full, cm_full_normalized) - export_report(config=config, - name="All Data Confusion Matrix", - report=cm_full_all, - filename="confusion_matrix_all_dataset", - train_class=class_name, - exports_path=exports_path) - logger.info("Classification Report applied to the whole dataset..") - cr_full = classification_report(y_true=y, y_pred=predictions_all) - export_report(config=config, - name="All Data Classification Report", - report=cr_full, - filename="classification_report_all_dataset", - train_class=class_name, - exports_path=exports_path) - - logger.info("The whole dataset has been evaluated successfully..") - print(colored("The whole dataset has been evaluated successfully..", "green")) - + logger.info("The {} dataset has been evaluated successfully.".format(set_name)) diff --git a/models/sklearn/classification/report_files_export.py b/models/sklearn/classification/report_files_export.py index d946dbc47..52c4fc51b 100644 --- a/models/sklearn/classification/report_files_export.py +++ b/models/sklearn/classification/report_files_export.py @@ -1,26 +1,22 @@ import os from datetime import datetime from termcolor import colored -from utils import load_yaml, FindCreateDirectory, TrainingProcesses + +from ..helper_functions.utils import FindCreateDirectory def export_report(config, name, report, filename, train_class, exports_path): - exports_dir = "{}_{}".format(config.get("exports_directory"), train_class) + exports_dir = config.get("exports_directory") reports_path = FindCreateDirectory(exports_path, os.path.join(exports_dir, "reports")).inspect_directory() - # take current date and convert to string + # take current datetime now = 
datetime.now() - datetime_str = now.strftime("%Y-%m-%d") - datetime_str_verbose = now.strftime("%Y-%m-%d, %H:%M:%S") + datetime_str_verbose = now.isoformat() print("Creating report file..") - with open(os.path.join(reports_path, "{}.txt".format(filename)), 'w+') as file: - file.write("{}".format(name)) - file.write('\n') - file.write('\n') - file.write(str(report)) - file.write('\n') - file.write('\n') - file.write('\n') - file.write("Date of execution: {}".format(datetime_str_verbose)) - file.close() - print(colored('{} file for class {} is created successfully.'.format(name, train_class), "cyan")) - + with open(os.path.join(reports_path, "{}.txt".format(filename)), 'w+') as fp: + fp.write("Date of execution: {}".format(datetime_str_verbose)) + fp.write("\n\n") + fp.write(str(name)) + fp.write("\n\n") + fp.write(str(report)) + fp.close() + print(colored("{} file for class {} is created successfully.".format(name, train_class), "cyan")) diff --git a/models/sklearn/classification/train_class.py b/models/sklearn/classification/train_class.py index 04d95b029..8e3f07d70 100644 --- a/models/sklearn/classification/train_class.py +++ b/models/sklearn/classification/train_class.py @@ -1,13 +1,14 @@ import os from termcolor import colored -from transformation.load_groung_truth import GroundTruthLoad -from classification.classification_task_manager import ClassificationTaskManager -from transformation.load_groung_truth import DatasetExporter import yaml -from logging_tool import LoggerSetup +from ..transformation.load_ground_truth import GroundTruthLoad +from ..classification.classification_task_manager import ClassificationTaskManager +from ..transformation.load_ground_truth import DatasetExporter +from ..helper_functions.logging_tool import LoggerSetup -def train_class(config, gt_file, log_level): + +def train_class(config, gt_file, exports_directory, log_level): exports_path = config["exports_path"] gt_data = GroundTruthLoad(config, gt_file, exports_path, log_level) # tracks shuffled and exported @@ -17,9 +18,16 @@ def train_class(config, gt_file, log_level): class_name = gt_data.export_train_class() config["class_name"] = class_name + # project directory where the models and outputs will be saved + if exports_directory is None: + prefix_exports_dir = "exports" + config["exports_directory"] = "{}_{}".format(prefix_exports_dir, class_name) + else: + config["exports_directory"] = exports_directory + logger = LoggerSetup(config=config, exports_path=exports_path, - name="train_class_{}".format(class_name), + name="train_model_{}".format(class_name), train_class=class_name, mode="w", level=log_level).setup_logger() @@ -28,8 +36,14 @@ def train_class(config, gt_file, log_level): logger.debug("Type of exported GT data exported: {}".format(type(tracks_listed_shuffled))) - # save project file - project_file_name_save = "{}_{}.yaml".format(config["project_file"], class_name) + # name the project file + if config["project_file"] is None: + prefix_project_file = "project" + project_file_name_save = "{}_{}.yaml".format(prefix_project_file, class_name) + else: + project_file_name_save = "{}.yaml".format(config["project_file"]) + logger.info("Project yaml file name: {}".format(project_file_name_save)) + # save the project file project_file_save_path = os.path.join(exports_path, project_file_name_save) with open(os.path.join(project_file_save_path), "w") as template_file: template_data_write = yaml.dump(config, template_file) @@ -48,14 +62,6 @@ def train_class(config, gt_file, log_level): logger.debug("Type of 
labels: {}".format(type(labels))) logger.debug("Type of Tracks: {}".format(type(tracks))) - print(colored("Small previews:", "cyan")) - print(colored("FEATURES", "magenta")) - print(features.head(3)) - print(colored("LABELS", "magenta")) - print(labels[:10]) - print(colored("TRACKS:", "magenta")) - print(tracks[:10]) - model_manage = ClassificationTaskManager(config=config, train_class=class_name, X=features, @@ -64,5 +70,5 @@ def train_class(config, gt_file, log_level): exports_path=exports_path, log_level=log_level) classification_time = model_manage.apply_processing() - print(colored("Classification ended in {} minutes.".format(classification_time), "green")) - logger.info("Classification ended in {} minutes.".format(classification_time)) + print(colored("Classification ended successfully in {} minutes.".format(classification_time), "green")) + logger.info("Classification ended successfully in {} minutes.".format(classification_time)) diff --git a/models/sklearn/create_classification_project.py b/models/sklearn/create_classification_project.py deleted file mode 100644 index cb9ab4e04..000000000 --- a/models/sklearn/create_classification_project.py +++ /dev/null @@ -1,129 +0,0 @@ -import os -import argparse -from pprint import pprint -from utils import load_yaml -import yaml -import time -from transformation.load_groung_truth import ListGroundTruthFiles -from classification.train_class import train_class - - -def create_classification_project(ground_truth_directory, class_dir, project_file, exports_directory, logging, seed, jobs, verbose, exports_path): - """ - - :param ground_truth_directory: - :param class_dir: - :param project_file: - :param exports_directory: - :param logging: - :param seed: - :param jobs: - :param verbose: - :param exports_path: - :return: - """ - try: - project_template = load_yaml("configuration_template.yaml") - except Exception as e: - print('Unable to open project configuration template:', e) - raise - - # print("BEFORE:") - # print("Type of congig template:", type(project_template)) - print("-------------------------------------------------------") - print() - if seed is None: - seed = time.time() - - print("Seed argument: {}".format(seed)) - - project_template["ground_truth_directory"] = ground_truth_directory - project_template["class_dir"] = class_dir - project_template["project_file"] = project_file - project_template["exports_directory"] = exports_directory - project_template["logging_level"] = logging - project_template["seed"] = seed - project_template["parallel_jobs"] = jobs - project_template["verbose"] = verbose - - # if empty, path is declared as the app's main directory - if exports_path is None: - exports_path = os.getcwd() - - print("Exports path: {}".format(exports_path)) - project_template["exports_path"] = exports_path - - print() - print() - print("-------------------------------------------------------") - # print("AFTER:") - # pprint(project_template) - - gt_files_list = ListGroundTruthFiles(project_template).list_gt_filenames() - print(gt_files_list) - print("LOAD GROUND TRUTH") - for gt_file in gt_files_list: - train_class(project_template, gt_file, logging) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Generates a project configuration file given a filelist, a groundtruth file, ' - 'and the directories to store the datasets and the results files. ' - 'The script has a parameter to specify the project template to use. 
' - 'If it is not specified, it will try to guess the appropriated one from the ' - 'essentia version found on the descriptor files.') - - parser.add_argument('-g', '--groundtruth', - dest="ground_truth_directory", - default="datasets", - help='Name of the directory containing the datasets.') - - parser.add_argument('-c', '--classdir', - dest="class_dir", - help='Name of the directory containing the class or classes to train.', - required=True) - - parser.add_argument('-f', '--file', - dest="project_file", - default="project", - help='Name prefix of the project configuration file (.yaml) will be stored.') - - parser.add_argument('-e', '--exportsdir', - dest="exports_directory", - default="exports", - help='Path the exports of the project will be stored.') - - parser.add_argument('-l', '--logging', - default=1, - help='Path where the result files will be stored.', - type=int) - - parser.add_argument('-s', '--seed', - default=None, - help='Seed used to generate the shuffled dataset applied later to folding.', - type=int) - - parser.add_argument('-j', '--jobs', - default=-1, - help='Parallel jobs. Set to -1 to use all the available cores', - type=int) - parser.add_argument('-v', '--verbose', - default=1, - help="Controls the verbosity: the higher, the more messages.", - type=int) - parser.add_argument('-p', '--path', - dest='exports_path', - help='Path where the project results will be stored. If empty, the results will be saved in ' - 'app directory') - - # parser.add_argument('-t', '--template', - # default=None, - # help='classification project template file to use. ' - # 'If not specified, the script will try to detect it from the descriptors metadata.') - - args = parser.parse_args() - - create_classification_project(args.ground_truth_directory, args.class_dir, args.project_file, - args.exports_directory, logging=args.logging, seed=args.seed, jobs=args.jobs, - verbose=args.verbose, exports_path=args.exports_path) diff --git a/models/sklearn/gaia_best_models/jmp_results_danceability.param b/models/sklearn/gaia_best_models/jmp_results_danceability.param deleted file mode 100644 index c2b7fdb37..000000000 --- a/models/sklearn/gaia_best_models/jmp_results_danceability.param +++ /dev/null @@ -1,11 +0,0 @@ -evaluation: - nfold: 5 - type: nfoldcrossvalidation -model: - C: 5 - balanceClasses: false - classifier: svm - gamma: -9 - kernel: RBF - preprocessing: gaussianized - type: C-SVC diff --git a/models/sklearn/gaia_best_models/jmp_results_danceability.results.html b/models/sklearn/gaia_best_models/jmp_results_danceability.results.html deleted file mode 100644 index 372493c7c..000000000 --- a/models/sklearn/gaia_best_models/jmp_results_danceability.results.html +++ /dev/null @@ -1,7 +0,0 @@ -
-test_danceability (/data/project_danceability.yaml)
-Accuracy: 93.3333333333. Std: 3.00032966083.
-Normalized accuracy: 92.6993290685. Normalized std: 2.83367775302.
-Confusion matrix (rows: actual class, columns: predicted class, %):
-                 danceable            not_danceable        proportion
-danceable        95.14 (137 of 144)    4.86 (7 of 144)     64.00 %
-not_danceable     9.88 (8 of 81)      90.12 (73 of 81)     36.00 %
\ No newline at end of file diff --git a/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param b/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param deleted file mode 100644 index 47c151556..000000000 --- a/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.param +++ /dev/null @@ -1,11 +0,0 @@ -evaluation: - nfold: 5 - type: nfoldcrossvalidation -model: - C: 7 - balanceClasses: true - classifier: svm - gamma: -9 - kernel: RBF - preprocessing: basic - type: C-SVC diff --git a/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html b/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html deleted file mode 100644 index ddf60ba27..000000000 --- a/models/sklearn/gaia_best_models/jmp_results_tonal_atonal.results.html +++ /dev/null @@ -1,7 +0,0 @@ -
-test_tonal_atonal (/data/project_tonal_atonal.yaml)
-Accuracy: 97.9651162791. Std: 1.48308195928.
-Normalized accuracy: 97.775862069. Normalized std: 1.65925323088.
-Confusion matrix (rows: actual class, columns: predicted class, %):
-          atonal               tonal                proportion
-atonal    96.55 (140 of 145)    3.45 (5 of 145)     42.15 %
-tonal      1.01 (2 of 199)     98.99 (197 of 199)   57.85 %
\ No newline at end of file diff --git a/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param b/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param deleted file mode 100644 index 1dff19f60..000000000 --- a/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.param +++ /dev/null @@ -1,11 +0,0 @@ -evaluation: - nfold: 5 - type: nfoldcrossvalidation -model: - C: 9 - balanceClasses: false - classifier: svm - gamma: -11 - kernel: RBF - preprocessing: gaussianized - type: C-SVC diff --git a/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html b/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html deleted file mode 100644 index 5031b91e2..000000000 --- a/models/sklearn/gaia_best_models/jmp_results_voice_instrumental.results.html +++ /dev/null @@ -1,7 +0,0 @@ -
-test_voice_instrumental (/data/project_voice_instrumental.yaml)
-Accuracy: 93.2. Std: 1.72046505341.
-Normalized accuracy: 93.2. Normalized std: 1.72046505341.
-Confusion matrix (rows: actual class, columns: predicted class, %):
-               instrumental          voice                proportion
-instrumental   93.20 (466 of 500)     6.80 (34 of 500)    50.00 %
-voice           6.80 (34 of 500)     93.20 (466 of 500)   50.00 %
\ No newline at end of file diff --git a/models/sklearn/gaia_imitation_best_model.py b/models/sklearn/gaia_imitation_best_model.py deleted file mode 100644 index 8f9943f33..000000000 --- a/models/sklearn/gaia_imitation_best_model.py +++ /dev/null @@ -1,103 +0,0 @@ -from utils import load_yaml, FindCreateDirectory -from sklearn.model_selection import cross_val_score -from sklearn.model_selection import cross_validate -from sklearn.model_selection import cross_val_predict -from transformation.transform import Transform -from sklearn.model_selection import KFold -from sklearn.svm import SVC - - -def display_scores(scores): - """ - - :param scores: - :return: - """ - print("Display scores:") - print("Scores: {}".format(scores)) - print("Mean: {}".format(scores.mean())) - print("Standard Deviation: {}".format(scores.std())) - - -def evaluate_gaia_imitation_model(config, class_name, X, y): - """ - - :param config: - :param class_name: - :param X: - :param y: - :return: - """ - gaia_params = load_yaml("gaia_best_models/jmp_results_{}.param".format(class_name)) - print("Gaia best model params: {}".format(gaia_params)) - - # params data transformation - preprocessing = gaia_params["model"]["preprocessing"] - - # params SVC - C = 2 ** gaia_params["model"]["C"] - gamma = 2 ** gaia_params["model"]["gamma"] - kernel = gaia_params["model"]["kernel"].lower() - balance_classes = gaia_params["model"]["balanceClasses"] - # TODO: declare a dictionary for class weights via automated labels balancing (unresponsive dataset) - if balance_classes is True: - class_weights = "balanced" - elif balance_classes is False: - class_weights = None - else: - print("Define a correct class weight value") - class_weights = None - n_fold = gaia_params["evaluation"]["nfold"] - - # Transform dataset - # pre-processing: data cleaning/enumerating/selecting descriptors - # pre-processing: scaling - print("Exports path for the training:") - exports_dir = "{}_{}".format(config.get("exports_directory"), class_name) - exports_path = FindCreateDirectory(exports_dir).inspect_directory() - print(exports_path) - # transformation of the data - X_transformed = Transform(config=config, - df=X, - process=preprocessing, - exports_path=exports_path, - mode="train").post_processing() - - print(X_transformed.columns) - print(X_transformed.head()) - - X_array_transformed = X_transformed.values - - inner_cv = KFold(n_splits=n_fold, - shuffle=config["gaia_kfold_shuffle"], - random_state=config["gaia_kfold_random_state"] - ) - - svm = SVC( - C=C, - kernel=kernel, - gamma=gamma, - class_weight=class_weights, - probability=config.get("svc_probability") - ) - - print("Evaluate the classifier with cross_val_score:") - scores = cross_val_score(estimator=svm, - X=X_array_transformed, - y=y, - scoring="accuracy", - cv=inner_cv, - n_jobs=config.get("parallel_jobs"), - verbose=config.get("verbose") - ) - - print() - print("Score results:") - display_scores(scores) - print() - print() - - -if __name__ == '__main__': - - evaluate_gaia_imitation_model() diff --git a/models/sklearn/helper_functions/__init__.py b/models/sklearn/helper_functions/__init__.py new file mode 100644 index 000000000..40a96afc6 --- /dev/null +++ b/models/sklearn/helper_functions/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/models/sklearn/logging_tool.py b/models/sklearn/helper_functions/logging_tool.py similarity index 75% rename from models/sklearn/logging_tool.py rename to models/sklearn/helper_functions/logging_tool.py index 3b2a87de9..d11040467 100644 --- 
a/models/sklearn/logging_tool.py
+++ b/models/sklearn/helper_functions/logging_tool.py
@@ -10,7 +10,7 @@
 """
 import logging
 import os
-from utils import load_yaml, FindCreateDirectory
+from ..helper_functions.utils import FindCreateDirectory
 
 # # load yaml configuration file to a dict
 # config_data = load_yaml()
@@ -37,9 +37,12 @@ def __init__(self, config, exports_path, name, train_class, mode, level=1):
         Inits the logger object with the corresponding parameters.
 
         Args:
-            name (str): The name of the logger.
-            log_file (str): The path the logging exports will be exported.
-            level (int): The level of the logging. Defaults to 1.
+            config: The configuration data (dict).
+            exports_path: The path (str) where the logging exports will be written.
+            name: The name (str) of the logger.
+            train_class: The name (str) of the target class.
+            level: The level (int) of the logging. Defaults to 1.
+            mode: The file mode (str): "w" for write, "a" for append.
         """
         self.config = config
         self.exports_path = exports_path
@@ -56,9 +59,10 @@ def setup_logger(self):
         Function to set up as many loggers as you want. It exports the logging results to a file
         in the relevant path that is determined by the configuration file.
 
-        :return:
+        Returns:
+            The logger object.
         """
-        self.exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class)
+        self.exports_dir = self.config.get("exports_directory")
         self.logs_path = FindCreateDirectory(self.exports_path,
                                              os.path.join(self.exports_dir, "logs")).inspect_directory()
 
@@ -86,22 +90,22 @@ def setup_logger(self):
 
         if self.level is None:
             logger_object.setLevel(logging.INFO)
-        elif self.level is 0:
+        elif self.level == "logging.DEBUG":
             logger_object.setLevel(logging.DEBUG)
-        elif self.level is 1:
+        elif self.level == "logging.INFO":
             logger_object.setLevel(logging.INFO)
-        elif self.level is 2:
+        elif self.level == "logging.WARNING":
             logger_object.setLevel(logging.WARNING)
-        elif self.level is 3:
+        elif self.level == "logging.ERROR":
             logger_object.setLevel(logging.ERROR)
-        elif self.level is 4:
+        elif self.level == "logging.CRITICAL":
             logger_object.setLevel(logging.CRITICAL)
         else:
-            print('Please define correct one of the Debug Levels:\n'
-                  '0: DEBUG\n'
-                  '1: INFO\n'
-                  '2: WARNING\n'
-                  '3: ERROR\n'
-                  '4: CRITICAL')
+            print("Please define one of the valid Debug Levels:\n"
+                  "logging.DEBUG: DEBUG\n"
+                  "logging.INFO: INFO\n"
+                  "logging.WARNING: WARNING\n"
+                  "logging.ERROR: ERROR\n"
+                  "logging.CRITICAL: CRITICAL")
 
         return logger_object

diff --git a/models/sklearn/utils.py b/models/sklearn/helper_functions/utils.py
similarity index 54%
rename from models/sklearn/utils.py
rename to models/sklearn/helper_functions/utils.py
index 932eaa02d..7cca3b1ba 100644
--- a/models/sklearn/utils.py
+++ b/models/sklearn/helper_functions/utils.py
@@ -1,23 +1,24 @@
 import os
-from pprint import pprint
+import yaml
 
 
-def load_yaml(path_file):
+
+def load_yaml(path_to_file, file):
     """
-    Todo: add comments, docstring info, etc.
-    :return:
+    Args:
+        path_to_file: The path (str) to the directory that contains the yaml file.
+        file: The name (str) of the yaml file to load.
+
+    Returns:
+        The configuration data loaded from the template.
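+
+    A hypothetical call, assuming the template lives in the given project path:
+        >>> config = load_yaml("/path/to/project", "configuration_template.yaml")  # doctest: +SKIP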
""" try: - import yaml - with open(os.path.join(os.path.abspath(os.getcwd()), path_file)) as file: - config_data = yaml.load(file, Loader=yaml.FullLoader) - # print(type(config_data)) - # print(config_data) + with open(os.path.join(path_to_file, file)) as fp: + config_data = yaml.load(fp, Loader=yaml.FullLoader) if isinstance(config_data, dict): return config_data else: return None except ImportError: - print('WARNING: could not import yaml module') + print("WARNING: could not import yaml module") return None @@ -27,54 +28,39 @@ class DfChecker: """ def __init__(self, df_check): """ - - :param df_check: + Args: + df_check: """ self.df_check = df_check def check_df_info(self): """ Prints information about the Pandas DataFrame that is generated from the relevant process. - :return: """ - print('Features DataFrame head:') + print("Features DataFrame head:") print(self.df_check.head()) print() - print('Information:') + print("Information:") print(self.df_check.info()) print() - print('Shape:', self.df_check.shape) - print('Number of columns:', len(list(self.df_check.columns))) + print("Shape:", self.df_check.shape) + print("Number of columns:", len(list(self.df_check.columns))) - if 'category' in self.df_check.columns: - print('Track categories distribution:') - print(self.df_check['category'].value_counts()) + if "category" in self.df_check.columns: + print("Track categories distribution:") + print(self.df_check["category"].value_counts()) class FindCreateDirectory: - """ - - """ def __init__(self, exports_path, directory): - """ - - :param directory: - """ self.exports_path = exports_path self.directory = directory def inspect_directory(self): - """ - - :return: - """ # find dynamically the current script directory - # path_app = os.path.join(os.path.abspath(os.getcwd())) full_path = os.path.join(self.exports_path, self.directory) # create path directories if not exist --> else return the path - if not os.path.exists(full_path): - os.makedirs(full_path) - # print('Path {}:'.format(self.directory), full_path) + os.makedirs(full_path, exist_ok=True) return full_path @@ -98,7 +84,17 @@ def delete_logs(self): print("Evaluation logs deletion is turned to OFF.") -def change_weights_values(i): +def change_weights_val(i): + """ + Is is used in the TrainingProcesses class. It is used to transform each value of + the balanced classes list in the configuration file Grid parameters of the classifier: + * True --> balanced + * False --> None + Args: + i: The value inserted + Returns: + "balanced" in case the value of the list is True, else None if it is set to False. + """ if i is True: return "balanced" elif i is False: @@ -107,14 +103,22 @@ def change_weights_values(i): class TrainingProcesses: + """ + Extracts the pre-processing steps that are specified in "List of classifiers + to be trained" section of the configuration template. These are the amount + of the prep-processing steps with the relevant training that will be executed. + """ def __init__(self, config): + """ + Args: + config: The configuration data. + """ self.config = config def training_processes(self): """ - - :return: - processes: A list of the processes that have been identified with the corresponding parameter grid + Returns: + A list of the processes that have been identified with the corresponding parameter grid. 
""" evaluations = self.config["evaluations"]["nfoldcrossvalidation"] print("Evaluations countered: {}".format(len(evaluations))) @@ -128,27 +132,18 @@ def training_processes(self): for pre_processing in classifier["preprocessing"]: for clf_type in classifier["type"]: if clf_type == "C-SVC": - process_dict = dict() - process_dict["Evaluation"] = evaluation_counter - # classifier - process_dict["classifier"] = clf_type - # pre-processing - process_dict["preprocess"] = pre_processing - # kernel - kernel = classifier["kernel"] - process_dict["kernel"] = [i.lower() for i in kernel] - # C - c = classifier["C"] - process_dict["C"] = [2 ** x for x in c] # 2 ** c - # gamma - gamma = classifier["gamma"] - process_dict["gamma"] = [2 ** x for x in gamma] # 2 ** gamma - # class weights - balance_classes = classifier["balanceClasses"] - process_dict["balanceClasses"] = [change_weights_values(i) for i in balance_classes] + process_dict = { + "evaluation": evaluation_counter, + "classifier": clf_type, + "preprocess": pre_processing, + "kernel": [i.lower() for i in classifier["kernel"]], # lowercase the values + "C": [2 ** x for x in classifier["C"]], # 2 ** c + "gamma": [2 ** x for x in classifier["gamma"]], # 2 ** gamma + "balance_classes": [change_weights_val(i) for i in classifier["balance_classes"]], + "n_fold": nfold_number + } + # append the pre-processing steps list processes.append(process_dict) - # n_fold - process_dict["n_fold"] = nfold_number # increase counter by 1 trainings_counted += 1 # increase evaluation counter by 1 @@ -157,10 +152,3 @@ def training_processes(self): print("Trainings to be applied: {}".format(trainings_counted)) return processes - - -if __name__ == '__main__': - conf_data = load_yaml() - print(conf_data) - - test = FindCreateDirectory('exports').inspect_directory() diff --git a/models/sklearn/model/__init__.py b/models/sklearn/model/__init__.py new file mode 100644 index 000000000..40a96afc6 --- /dev/null +++ b/models/sklearn/model/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/models/sklearn/model/classification_project.py b/models/sklearn/model/classification_project.py new file mode 100644 index 000000000..59a0b8d9b --- /dev/null +++ b/models/sklearn/model/classification_project.py @@ -0,0 +1,138 @@ +import os +import argparse +from ..helper_functions.utils import load_yaml +import time +from ..transformation.load_ground_truth import ListGroundTruthFiles +from ..classification.train_class import train_class + + +def create_classification_project(ground_truth_directory, project_file=None, exports_directory=None, exports_path=None, + seed=None, jobs=-1, verbose=1, logging="logging.INFO"): + """ + Args: + ground_truth_directory: The path (str) to the dataset directory where the + groundtruth yaml file is located. It is required. + project_file: The name (str) of the project configuration yaml file that + will be created. Default: None. If None, the tool will create + automatically a project file name in form of "project_CLASS_NAME", + where CLASS_NAME is the target class as referred to the groundtruth data. + exports_directory: The name (str) of the directory that the results + of the classification project will be save to. Default: None. If None, + the tool will automatically create a directory with the name + "exports_CLASS_NAME", where CLASS_NAME is the target class as referred + to the groundtruth data. + exports_path: The path (str) to the exports directory. Default: None. If + None, the exports directory will be saved inside the app folder. 
+        seed: The seed (int) of the random shuffle generator. Default: None.
+            If None, the current timestamp is used.
+        jobs: The number of cores (int) to be used during the training phase.
+            Default: -1. If -1, all the available cores will be used.
+        verbose: The verbosity (int) of the printed messages where available
+            (for example, in scikit-learn's GridSearchCV). Default: 1.
+            The higher the number, the higher the verbosity.
+        logging: The level (str) of the logging prints. Default: "logging.INFO".
+            Available values: logging.DEBUG, logging.INFO, logging.WARNING,
+            logging.ERROR, logging.CRITICAL.
+    """
+    try:
+        path_template = os.path.dirname(os.path.realpath(__file__))
+        project_template = load_yaml(path_template, "configuration_template.yaml")
+    except Exception as e:
+        print('Unable to open project configuration template:', e)
+        raise
+
+    print("-------------------------------------------------------")
+    print()
+    if seed is None:
+        seed = time.time()
+
+    print("Seed argument: {}".format(seed))
+
+    project_template["ground_truth_directory"] = ground_truth_directory
+    project_template["project_file"] = project_file
+    project_template["logging_level"] = logging
+    project_template["seed"] = seed
+    project_template["parallel_jobs"] = jobs
+    project_template["verbose"] = verbose
+
+    # if empty, path is declared as the app's main directory
+    if exports_path is None:
+        exports_path = os.getcwd()
+
+    print("Exports path: {}".format(exports_path))
+    project_template["exports_path"] = exports_path
+
+    print("Exports directory: {}".format(exports_directory))
+
+    print()
+    print()
+    print("-------------------------------------------------------")
+    # print("AFTER:")
+    # pprint(project_template)
+
+    gt_files_list = ListGroundTruthFiles(project_template).list_gt_filenames()
+    print("List GroundTruth yaml files found:")
+    print(gt_files_list)
+    print("LOAD GROUND TRUTH")
+    for gt_file in gt_files_list:
+        train_class(project_template, gt_file, exports_directory, logging)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generates a project configuration file given a filelist, a groundtruth file, '
+                    'and the directories to store the datasets and the results files. '
+                    'The script has a parameter to specify the project template to use. '
+                    'If it is not specified, it will try to guess the appropriate one from the '
+                    'Essentia version found in the descriptor files.')
+
+    parser.add_argument("-g", "--groundtruth",
+                        dest="ground_truth_directory",
+                        default="datasets",
+                        help="Path of the main dataset directory containing the groundtruth file/s.",
+                        required=True)
+
+    parser.add_argument("-f", "--file",
+                        dest="project_file",
+                        help="Name of the project configuration file (.yaml) that will be stored. If not "
+                             "specified, a name of the form \"project_CLASS_NAME\" is generated automatically.")
+
+    parser.add_argument("-d", "--exportsdir",
+                        dest="exports_directory",
+                        help="Name of the exports directory that the project's results will be stored in.")
+
+    parser.add_argument("-p", "--path",
+                        dest="exports_path",
+                        help="Path where the project results will be stored. If empty, the results will be saved in "
+                             "the main app directory.")
+
+    parser.add_argument("-s", "--seed",
+                        default=None,
+                        help="Seed is used to generate the random shuffled dataset applied later to folding.",
+                        type=int)
+
+    parser.add_argument("-j", "--jobs",
+                        default=-1,
+                        help="Parallel jobs.
Set to -1 to use all the available cores", + type=int) + + parser.add_argument("-v", "--verbose", + default=1, + help="Controls the verbosity: the higher, the more messages.", + type=int) + + parser.add_argument("-l", "--logging", + default="logging.INFO", + help="The logging level that will be printed logging.DEBUG, logging.INFO, logging.WARNING, " + "logging.ERROR, logging.CRITICAL).", + type=str) + + args = parser.parse_args() + + create_classification_project(ground_truth_directory=args.ground_truth_directory, + project_file=args.project_file, + exports_directory=args.exports_directory, + exports_path=args.exports_path, + seed=args.seed, + jobs=args.jobs, + verbose=args.verbose, + logging=args.logging) diff --git a/models/sklearn/configuration_template.yaml b/models/sklearn/model/configuration_template.yaml similarity index 88% rename from models/sklearn/configuration_template.yaml rename to models/sklearn/model/configuration_template.yaml index 036efe802..adce65740 100644 --- a/models/sklearn/configuration_template.yaml +++ b/models/sklearn/model/configuration_template.yaml @@ -2,10 +2,9 @@ # the ground truth data directory ground_truth_directory: exports_path: -# classes with features locally: c, gender, genre_rosamerica, moods_claurier, moods_mirex, timbre_bright_dark +# classes with features locally: danceability, gender, genre_rosamerica, moods_claurier, moods_mirex, timbre_bright_dark # classes with features locally: tonal_atonal, voice_instrumental # classes with features online: genre_dortmund, genre_electronic, genre_tzanetakis, ismir04_rhythm, -class_dir: class_name: exports_directory: logging_level: # logging level @@ -66,14 +65,14 @@ processing: - transfo: gaussianize # QuantileTransformer params: { descriptorNames: ['lowlevel.*'] } - mfcc: - # an MFCC only baseline - - transfo: remove - params: { descriptorNames: *unusedDescs } - - transfo: enumerate - params: { descriptorNames: *stringDescs } - - transfo: select - params: { descriptorNames: ['lowlevel.mfcc*'] } +# mfcc: +# # an MFCC only baseline +# - transfo: remove +# params: { descriptorNames: *unusedDescs } +# - transfo: enumerate +# params: { descriptorNames: *stringDescs } +# - transfo: select +# params: { descriptorNames: ['lowlevel.mfcc*'] } ## ML SETTINGS # train kind: grid, svm, deep_learning, supervised_lb @@ -93,14 +92,14 @@ verbose: # 0: no verbose, 1: simple information about the tasks completed, 2: classifiers: svm: # first svm test combinations - - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized', 'mfcc' ] -# - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized'] +# - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized', 'mfcc' ] + - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized'] type: [ 'C-SVC' ] kernel: [ 'poly', 'RBF' ] C: [ -5, -3, -1, 1, 3, 5, 7, 9, 11 ] # will actually be 2**x gamma: [ 3, 1, -1, -3, -5, -7, -9, -11 ] # will actually be 2**x # if True, weight classes based on the number of elements - balanceClasses: [False, True] + balance_classes: [False, True] # descriptorNames: [ ['*.mean', '*.var'] ] # more svm params combinations # ... 
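
As an aside (not part of the patch), here is a minimal sketch of how one `svm` entry of this template expands into an actual SVM parameter grid, mirroring `TrainingProcesses.training_processes()` and `change_weights_val()` from `helper_functions/utils.py`; the `entry` dict simply restates the template values above:

```python
# Sketch only: expand one template entry into grid values, as utils.py does.
entry = {
    "preprocessing": ["basic", "lowlevel", "nobands", "normalized", "gaussianized"],
    "type": ["C-SVC"],
    "kernel": ["poly", "RBF"],
    "C": [-5, -3, -1, 1, 3, 5, 7, 9, 11],      # exponents: actual C = 2 ** x
    "gamma": [3, 1, -1, -3, -5, -7, -9, -11],  # exponents: actual gamma = 2 ** x
    "balance_classes": [False, True],
}

grid = {
    "kernel": [k.lower() for k in entry["kernel"]],   # ["poly", "rbf"]
    "C": [2 ** x for x in entry["C"]],                # 0.03125 ... 2048
    "gamma": [2 ** x for x in entry["gamma"]],        # 8 ... ~0.00049
    "class_weight": ["balanced" if b else None for b in entry["balance_classes"]],
}
print(grid)
```

One such grid is built per pre-processing step, so this entry alone yields five training processes per n-fold evaluation.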
diff --git a/models/sklearn/predict.py b/models/sklearn/model/predict.py similarity index 64% rename from models/sklearn/predict.py rename to models/sklearn/model/predict.py index 2ecbd6e73..c34e4f0a4 100644 --- a/models/sklearn/predict.py +++ b/models/sklearn/model/predict.py @@ -5,10 +5,10 @@ import joblib import json import pandas as pd -from utils import load_yaml, FindCreateDirectory -from transformation.utils_preprocessing import flatten_dict_full -from transformation.transform_predictions import TransformPredictions -from logging_tool import LoggerSetup +from ..helper_functions.utils import load_yaml, FindCreateDirectory +from ..transformation.utils_preprocessing import flatten_dict_full +from ..transformation.transform_predictions import TransformPredictions +from ..helper_functions.logging_tool import LoggerSetup class Predict: @@ -33,7 +33,7 @@ def __init__(self, config, track_low_level, log_level): def load_best_model(self): self.class_name = self.config["class_name"] self.exports_path = self.config["exports_path"] - self.exports_dir = "{}_{}".format(self.config["exports_directory"], self.class_name) + self.exports_dir = self.config["exports_directory"] # self.exports_path = os.path.join(self.exports_path, "{}_{}".format(self.exports_dir, self.class_name)) best_model_path = os.path.join(self.exports_path, @@ -44,7 +44,6 @@ def load_best_model(self): self.best_model = json.load(json_file) def preprocessing(self): - # set up logger self.logger = LoggerSetup(config=self.config, exports_path=self.exports_path, name="predict_{}".format(self.class_name), @@ -81,9 +80,15 @@ def preprocessing(self): log_level=self.log_level ).post_processing() self.logger.debug("Features shape after preparation: {}".format(features_prepared.shape)) - models_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "models")).inspect_directory() - best_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(self.best_model["preprocessing"])) + + # load the best grid model that is trained with a k-fold cross validation + # models_path = FindCreateDirectory(self.exports_path, + # os.path.join(self.exports_dir, "models")).inspect_directory() + # best_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(self.best_model["preprocessing"])) + + # load the best model that is trained to the whole dataset + models_path = FindCreateDirectory(self.exports_path, self.exports_dir).inspect_directory() + best_model_path = os.path.join(models_path, "best_clf_model.pkl") clf_loaded = joblib.load(best_model_path) predicted = clf_loaded.predict(features_prepared) predicted_prob = clf_loaded.predict_proba(features_prepared) @@ -106,28 +111,26 @@ def preprocessing(self): return predict_list -def prediction(exports_path, project_file, track_api, log_level): +def prediction(exports_path, project_file, mbid, log_level="logging.INFO"): # if empty, path is declared as the app's main directory - if exports_path is None: - exports_path = os.getcwd() try: - project_data = load_yaml("{}.yaml".format(project_file)) + project_data = load_yaml(exports_path, "{}.yaml".format(project_file)) except Exception as e: print('Unable to open project configuration file:', e) raise - response = requests.get(track_api) - - track = response.json() - if track["metadata"]["tags"]["artist"][0]: - print("Artist:", track["metadata"]["tags"]["artist"][0]) - if track["metadata"]["tags"]["album"][0]: - print("Track:", track["metadata"]["tags"]["album"][0]) - if track["metadata"]["tags"]["title"][0]: - 
print("Track:", track["metadata"]["tags"]["album"][0]) + url_api = "https://acousticbrainz.org/api/v1/{}/low-level".format(mbid) + response = requests.get(url=url_api) + track_low_level_data = response.json() + if track_low_level_data["metadata"]["tags"]["artist"][0]: + print("Artist:", track_low_level_data["metadata"]["tags"]["artist"][0]) + if track_low_level_data["metadata"]["tags"]["album"][0]: + print("Album:", track_low_level_data["metadata"]["tags"]["album"][0]) + if track_low_level_data["metadata"]["tags"]["title"][0]: + print("Title:", track_low_level_data["metadata"]["tags"]["title"][0]) prediction_track = Predict(config=project_data, - track_low_level=track, + track_low_level=track_low_level_data, log_level=log_level ) prediction_track.preprocessing() @@ -136,31 +139,34 @@ def prediction(exports_path, project_file, track_api, log_level): if __name__ == '__main__': parser = argparse.ArgumentParser( - description='Predictions.') + description='Prediction of a track.') - parser.add_argument('-p', '--path', + parser.add_argument("-p", "--path", dest="exports_path", - help='Path where the project file is stored if not in the same file where the app is.') + help="Path where the project file (.yaml) is stored.", + required=True) - parser.add_argument('-f', '--file', + parser.add_argument("-f", "--file", dest="project_file", - help='Name prefix of the project configuration file (.yaml) that is stored.', + help="Name of the project configuration file (.yaml) that is to be loaded. The .yaml at the" + "end of the file is not necessary. Just put the name of the file.", required=True) - parser.add_argument('-t', '--track', - dest="track_api", - help='Low-level data link from the AcousticBrainz API.', + parser.add_argument("-t", "--track", + dest="mbid", + help="MBID of the the low-level data from the AcousticBrainz API.", required=True) - parser.add_argument('-l', '--logging', - dest='log_level', - default=1, - help='Path where the result files will be stored.', - type=int) + parser.add_argument("-l", "--logging", + dest="log_level", + default="logging.INFO", + help="The logging level that will be printed logging.DEBUG, logging.INFO, logging.WARNING, " + "logging.ERROR, logging.CRITICAL).", + type=str) args = parser.parse_args() prediction(exports_path=args.exports_path, project_file=args.project_file, - track_api=args.track_api, + mbid=args.mbid, log_level=args.log_level) diff --git a/models/sklearn/REQUIREMENTS.txt b/models/sklearn/requirements.txt similarity index 76% rename from models/sklearn/REQUIREMENTS.txt rename to models/sklearn/requirements.txt index be7c6de28..62242336b 100644 --- a/models/sklearn/REQUIREMENTS.txt +++ b/models/sklearn/requirements.txt @@ -1,4 +1,3 @@ -jupyter==1.0.0 matplotlib==3.1.3 numpy==1.18.1 pandas==1.0.3 @@ -6,8 +5,9 @@ PyYAML==5.3 scikit-learn==0.23.1 scipy==1.4.1 seaborn==0.10.0 -tensorflow==2.1.0 dask==2.11.0 dotty-dict==1.2.1 termcolor==1.1.0 -joblib==0.15.1 \ No newline at end of file +joblib==0.15.1 +six==1.15.0 +requests==2.23.0 \ No newline at end of file diff --git a/models/sklearn/transformation/__init__.py b/models/sklearn/transformation/__init__.py index 7c68785e9..40a96afc6 100644 --- a/models/sklearn/transformation/__init__.py +++ b/models/sklearn/transformation/__init__.py @@ -1 +1 @@ -# -*- coding: utf-8 -*- \ No newline at end of file +# -*- coding: utf-8 -*- diff --git a/models/sklearn/transformation/load_groung_truth.py b/models/sklearn/transformation/load_ground_truth.py similarity index 59% rename from 
models/sklearn/transformation/load_groung_truth.py
rename to models/sklearn/transformation/load_ground_truth.py
index fdd589551..c26b596d1 100644
--- a/models/sklearn/transformation/load_groung_truth.py
+++ b/models/sklearn/transformation/load_ground_truth.py
@@ -4,50 +4,51 @@
 from pprint import pprint
 from termcolor import colored
 import random
-from utils import load_yaml, FindCreateDirectory
-from transformation.load_low_level import FeaturesDf
-from logging_tool import LoggerSetup
+from ..helper_functions.utils import load_yaml, FindCreateDirectory
+from ..transformation.load_low_level import FeaturesDf
+from ..helper_functions.logging_tool import LoggerSetup


 class ListGroundTruthFiles:
    """
-
+    Lists the groundtruth yaml files that are detected in a folder specified in
+    the configuration file. The yaml files contain the target class and the tracks
+    to be analyzed.
    """
    def __init__(self, config):
        """
-
-        :param config:
+        Args:
+            config: The configuration data.
        """
        self.config = config
        self.dataset_dir = ""
-        self.class_dir = ""

    def list_gt_filenames(self):
        """
-
-        :return:
+        Returns:
+            A list of the detected groundtruth yaml files.
        """
        self.dataset_dir = self.config.get("ground_truth_directory")
-        self.class_dir = self.config.get("class_dir")
-        path = os.path.join(os.getcwd(), self.dataset_dir, self.class_dir, "metadata")
-        ground_truth_list = [filename for filename in os.listdir(os.path.join(path))
-                             if filename.startswith("groundtruth")]
+        ground_truth_list = list()
+        dirpath = os.path.join(os.getcwd(), self.dataset_dir)
+        for (dirpath, dirnames, filenames) in os.walk(dirpath):
+            ground_truth_list += [os.path.join(dirpath, file) for file in filenames if file.startswith("groundtruth")]
        return ground_truth_list


 class GroundTruthLoad:
    """
-    The Ground Truth data object which contains features to:
-     * counter the JSON low-level data
-     * Todo: create logger object
-
-    Attributes:
+    The Ground Truth data which contains the tracks and the corresponding
+    labels they belong to. The path to the related tracks' low-level data
+    (features in JSON format) can be extracted from this file too.
    """
    def __init__(self, config, gt_filename, exports_path, log_level):
        """
-
-        :param config:
-        :param gt_filename:
+        Args:
+            config: The configuration data (dict).
+            gt_filename: The path (str) to the groundtruth yaml file.
+            exports_path: The path (str) to the exports directory.
+            log_level: The logging level (str).
        """
        self.config = config
        self.gt_filename = gt_filename
@@ -66,15 +67,11 @@ def __init__(self, config, gt_filename, exports_path, log_level):

    def load_local_ground_truth(self):
        """
-        Loads the the ground truth file.
-        * The directory with the dataset should be located inside the app folder location.
-        :return:
+        Loads the ground truth file. The dataset directory is specified through
+        the parsed arguments of the create_classification_project method.
        """
-
-        self.dataset_dir = self.config.get("ground_truth_directory")
-        self.class_dir = self.config.get("class_dir")
-        with open(os.path.join(os.getcwd(), "{}/{}/metadata/{}".format(
-                self.dataset_dir, self.class_dir, self.gt_filename)), "r") as stream:
+        self.dataset_dir = self.config.get("dataset_dir")
+        with open(self.gt_filename, "r") as stream:
            try:
                self.ground_truth_data = yaml.safe_load(stream)
                print("Ground truth file loaded.")
@@ -84,14 +81,21 @@

    def export_train_class(self):
        """
-
-        :return:
+        Returns:
+            The target class to be modeled.
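+            The class name is read from the "className" key of the groundtruth
+            yaml file (e.g. "danceability").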
""" self.train_class = self.ground_truth_data["className"] print("EXPORT CLASS NAME: {}".format(self.train_class)) return self.train_class def export_gt_tracks(self): + """ + It takes a dictionary of the tracks from the groundtruth and it transforms it + to a list of tuples (track, label). Then it shuffles the list based on the seed + specified in the configuration file, and returns that shuffled list. + Returns: + A list of tuples with the tracks and their corresponding labels. + """ self.labeled_tracks = self.ground_truth_data["groundTruth"] tracks_list = [] for track, label in self.labeled_tracks.items(): @@ -99,19 +103,22 @@ def export_gt_tracks(self): print(colored("SEED is set to: {}".format(self.config.get("seed"), "cyan"))) random.seed(a=self.config.get("seed")) random.shuffle(tracks_list) + print("Listed tracks in GT file: {}".format(len(tracks_list))) return tracks_list def check_ground_truth_data(self): """ - Todo: description - :return: + Prints a dictionary of the groundtruth data in the corresponding yaml file. + It contains the target class and the tracks. """ pprint(self.ground_truth_data) def check_ground_truth_info(self): """ - Todo: description - :return: + Prints information about the groundtruth data that is loaded in a dictionary: + * The target class + * The tracks with their labels + * The tracks themselves """ len(self.ground_truth_data["groundTruth"].keys()) print("Ground truth data class/target: {}".format(self.ground_truth_data["className"])) @@ -120,8 +127,7 @@ def check_ground_truth_info(self): def check_tracks_folders(self): """ - Todo: function explanation docstring - :return: + Prints the directories that contain the low-level data. """ if len(self.labeled_tracks.keys()) is not 0: folders = [] @@ -139,7 +145,6 @@ def count_json_low_level_files(self): """ Prints the JSON low-level data that is contained inside the dataset directory (the dataset directory is declared in configuration file). - :return: """ counter = 0 for root, dirs, files in os.walk(os.path.join(os.getcwd(), self.dataset_dir)): @@ -151,6 +156,9 @@ def count_json_low_level_files(self): class DatasetExporter: + """ + TODO: Description + """ def __init__(self, config, tracks_list, train_class, exports_path, log_level): self.config = config self.tracks_list = tracks_list @@ -168,55 +176,51 @@ def __init__(self, config, tracks_list, train_class, exports_path, log_level): self.setting_logger() def setting_logger(self): - # set up logger self.logger = LoggerSetup(config=self.config, exports_path=self.exports_path, - name="dataset_exports_transformations_{}".format(self.train_class), + name="train_model_{}".format(self.train_class), train_class=self.train_class, - mode="w", + mode="a", level=self.log_level).setup_logger() def create_df_tracks(self): """ - Creates the pandas DataFrame with the tracks. - Todo: more comments - :return: - DataFrame or None: a DataFrame with the tracks included in the ground truth yaml file containing the track name, - the path to load the JSON low-level data, the label, etc. Else, it returns None. 
+        Creates the pandas DataFrame with the tracks that are found in both the
+        groundtruth file and the low-level JSON data.
+        Returns:
+            The features (DataFrame), the labels, and the track names, or
+            (None, None, None) if no low-level data is found.
        """
        self.logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----")
        # the class name from the ground truth data that is the target
        self.dataset_dir = self.config.get("ground_truth_directory")
-        self.class_dir = self.config.get("class_dir")
+        # self.class_dir = self.config.get("class_dir")
        print('DATASET-DIR', self.dataset_dir)
-        print('CLASS NAME PATH', self.class_dir)
-        # the path to the "features" directory that contains the rest of the low-level data sub-directories
-        path_features = os.path.join(os.getcwd(), self.dataset_dir, self.class_dir, "features")
-        # check if the "features" directory is empty or contains the "mp3" or the "orig" sub-directory
-        low_level_dir = ""
-        if len(os.listdir(path_features)) == 0:
-            print("Directory is empty")
-            self.logger.warning("Directory is empty.")
-        else:
-            print("Directory is not empty")
-            self.logger.info("Directory is not empty")
-            directory_contents = os.listdir(path_features)
-            if "mp3" in directory_contents:
-                low_level_dir = "mp3"
-            elif "orig" in directory_contents:
-                low_level_dir = "orig"
-            else:
-                low_level_dir = ""
-                print("There is no valid low-level data inside the features directory")
-                self.logger.warning("There is no valid low-level data inside the features directory")
-        # print which directory contains the low-level sub-directories (if exist)
-        self.logger.info("Low-level directory name that contains the data: {}".format(low_level_dir))
-        # path to the low-level data sub-directories
-        path_low_level = os.path.join(os.getcwd(), self.dataset_dir, self.class_dir, "features", low_level_dir)
-        self.logger.info("Path of low level data: {}".format(path_low_level))
-        # create a list with dictionaries that contain the information from each track in
-        if low_level_dir != "":
+        # print('CLASS NAME PATH', self.class_dir)
+        dirpath = os.path.join(os.getcwd(), self.dataset_dir)
+        low_level_list = list()
+        for (dirpath, dirnames, filenames) in os.walk(dirpath):
+            low_level_list += [os.path.join(dirpath, file) for file in filenames if file.endswith(".json")]
+        if len(low_level_list) != 0:
+            self.logger.info("Low-level features for the tracks found.")
+            # processing the names of the tracks that are inside both the GT file and the low-level json files
+            # list with the tracks that are included in the low-level json files
+            tracks_existing_list = [e for e in self.tracks_list for i in low_level_list if e[0] in i]
+            # list with the low-level json tracks' paths that are included in tracks list
+            tracks_existing_path_list = [i for e in self.tracks_list for i in low_level_list if e[0] in i]
+            self.logger.debug("existing tracks found: {}".format(len(tracks_existing_list)))
+            self.logger.debug("existing track paths found: {}".format(len(tracks_existing_path_list)))
+            self.logger.debug("{}".format(tracks_existing_list[:4]))
+            self.logger.debug("{}".format(tracks_existing_path_list[:4]))
+            self.logger.debug("The found tracks were listed successfully.")
+            self.logger.debug("Generate random number within a given range of listed tracks:")
+            # Random number between 0 and length of listed tracks
+            random_num = random.randrange(len(tracks_existing_list))
+            self.logger.debug("Check if the tracks are the same in the same random index in both lists")
+            self.logger.debug("{}".format(tracks_existing_list[random_num]))
+            self.logger.debug("{}".format(tracks_existing_path_list[random_num]))
+
+            self.tracks_list = tracks_existing_list
+            # create the dataframe with tracks that are both in low-level files and the GT
file self.df_tracks = pd.DataFrame(data=self.tracks_list, columns=["track", self.train_class]) self.logger.debug("Shape of tracks DF created before cleaning: {}".format(self.df_tracks.shape)) self.logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:") @@ -233,7 +237,7 @@ def create_df_tracks(self): self.logger.info("There are no NULL values found.") # export shuffled tracks to CSV format - exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + exports_dir = self.config.get("exports_directory") tracks_path = FindCreateDirectory(self.exports_path, os.path.join(exports_dir, "tracks_csv_format")).inspect_directory() self.df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(self.train_class))) @@ -244,7 +248,7 @@ def create_df_tracks(self): self.df_feats = FeaturesDf(df_tracks=self.df_tracks, train_class=self.train_class, - path_low_level=path_low_level, + list_path_tracks=tracks_existing_path_list, config=self.config, exports_path=self.exports_path, log_level=self.log_level, @@ -254,4 +258,5 @@ def create_df_tracks(self): self.logger.info("Features, Labels, and Tracks are exported successfully..") return self.df_feats, self.y, self.df_tracks["track"].values else: + self.logger.error("No low-level data found.") return None, None, None diff --git a/models/sklearn/transformation/load_low_level.py b/models/sklearn/transformation/load_low_level.py index dd5ab1d34..3a0ad3ab0 100644 --- a/models/sklearn/transformation/load_low_level.py +++ b/models/sklearn/transformation/load_low_level.py @@ -1,8 +1,8 @@ import os import json import pandas as pd -from transformation.utils_preprocessing import flatten_dict_full -from logging_tool import LoggerSetup +from ..transformation.utils_preprocessing import flatten_dict_full +from ..helper_functions.logging_tool import LoggerSetup class FeaturesDf: @@ -12,10 +12,10 @@ class FeaturesDf: df_tracks (Pandas DataFrame): The tracks DataFrame that contains the track name, track low-level path, label, etc. """ - def __init__(self, df_tracks, train_class, path_low_level, config, exports_path, log_level): + def __init__(self, df_tracks, train_class, list_path_tracks, config, exports_path, log_level): self.df_tracks = df_tracks self.train_class = train_class - self.path_low_level = path_low_level + self.list_path_tracks = list_path_tracks self.config = config self.exports_path = exports_path self.log_level = log_level @@ -29,10 +29,9 @@ def __init__(self, df_tracks, train_class, path_low_level, config, exports_path, self.setting_logger() def setting_logger(self): - # set up logger self.logger = LoggerSetup(config=self.config, exports_path=self.exports_path, - name="dataset_exports_transformations_{}".format(self.train_class), + name="train_model_{}".format(self.train_class), train_class=self.train_class, mode="a", level=self.log_level).setup_logger() @@ -41,17 +40,15 @@ def create_low_level_df(self): """ Creates the low-level DataFrame. Cleans also the low-level data from the unnecessary features before creating the DF. - - :return: - DataFrame: low-level features Daa=taFrame from all the tracks in the collection. + Returns: + The low-level features (pandas DataFrame) from all the tracks in the collection. 
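+            Each row contains one track's low-level descriptors, flattened into
+            scalar columns via flatten_dict_full.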
""" self.logger.info("---- CREATE LOW LEVEL DATAFRAME ----") # clear the list if it not empty self.list_feats_tracks.clear() - for index, row in self.df_tracks.iterrows(): - path_low_data = os.path.join(self.path_low_level, "{}.json".format(row["track"])) + for track_low_level_path in self.list_path_tracks: try: - f = open(path_low_data) + f = open(track_low_level_path) data_feats_item = json.load(f, strict=False) except Exception as e: print("Exception occurred in loading file:", e) @@ -73,15 +70,14 @@ def create_low_level_df(self): # The dictionary's keys list is transformed to type self.df_feats_tracks = pd.DataFrame(self.list_feats_tracks, columns=list(self.list_feats_tracks[0].keys())) - self.logger.info("COLUMNS CONTAIN OBJECTS: \n{}".format( + self.logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format( self.df_feats_tracks.select_dtypes(include=['object']).columns)) - self.logger.info("Exporting low-level data (dataframe)") + self.logger.info("Exporting low-level data (DataFrame)..") return self.df_feats_tracks def check_processing_info(self): """ Prints some information about the low-level data to DataFrame transformation step and its middle processes. - :return: """ self.logger.info('Items parsed and transformed: {}'.format(self.counter_items_transformed)) # The type of the dictionary's keys list is: @@ -92,8 +88,9 @@ def check_processing_info(self): def export_tracks_feats_df(self): """ - :return: - DataFrame: The tracks with all the ground truth data and the corresponding low-level data flattened. + Returns: + The tracks (pandas DataFrame) with all the ground truth data and the + corresponding low-level data flattened. """ self.logger.info("Concatenating the tracks/labels data DataFrame with the features DataFrame.") self.logger.info("TRACKS SHAPE: {}".format(self.df_tracks.shape)) diff --git a/models/sklearn/transformation/transform.py b/models/sklearn/transformation/transform.py index c0ba992c2..814f30e77 100644 --- a/models/sklearn/transformation/transform.py +++ b/models/sklearn/transformation/transform.py @@ -3,20 +3,21 @@ import collections import joblib import os +import six -from utils import FindCreateDirectory -from transformation.utils_preprocessing import list_descr_handler -from transformation.utils_preprocessing import feats_selector_list +from ..helper_functions.utils import FindCreateDirectory +from ..transformation.utils_preprocessing import list_descr_handler +from ..transformation.utils_preprocessing import feats_selector_list from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, QuantileTransformer from sklearn.pipeline import FeatureUnion from sklearn.pipeline import Pipeline -from logging_tool import LoggerSetup +from ..helper_functions.logging_tool import LoggerSetup # avoid the module's method call deprecation try: - collectionsAbc = collections.abc + collectionsAbc = six.moves.collections_abc except AttributeError: collectionsAbc = collections @@ -41,10 +42,9 @@ def __init__(self, config, df_feats, process, train_class, exports_path, log_lev self.setting_logger() def setting_logger(self): - # set up logger self.logger = LoggerSetup(config=self.config, exports_path=self.exports_path, - name="dataset_exports_transformations_{}".format(self.train_class), + name="train_model_{}".format(self.train_class), train_class=self.train_class, mode="a", level=self.log_level).setup_logger() @@ -57,7 +57,7 @@ def post_processing(self): self.list_features = list(self.df_feats.columns) - exports_dir = 
"{}_{}".format(self.config.get("exports_directory"), self.train_class) + exports_dir = self.config.get("exports_directory") models_path = FindCreateDirectory(self.exports_path, os.path.join(exports_dir, "models")).inspect_directory() @@ -262,4 +262,4 @@ def fit(self, X, y=None): return self def transform(self, X): - return X[self.attribute_names].values \ No newline at end of file + return X[self.attribute_names].values diff --git a/models/sklearn/transformation/transform_predictions.py b/models/sklearn/transformation/transform_predictions.py index 514c4e616..ec749db97 100644 --- a/models/sklearn/transformation/transform_predictions.py +++ b/models/sklearn/transformation/transform_predictions.py @@ -3,16 +3,17 @@ import collections import joblib import os +import six -from utils import FindCreateDirectory -from transformation.utils_preprocessing import list_descr_handler -from transformation.utils_preprocessing import feats_selector_list from sklearn.base import BaseEstimator, TransformerMixin -from logging_tool import LoggerSetup +from ..helper_functions.utils import FindCreateDirectory +from ..transformation.utils_preprocessing import list_descr_handler +from ..transformation.utils_preprocessing import feats_selector_list +from ..helper_functions.logging_tool import LoggerSetup # avoid the module's method call deprecation try: - collectionsAbc = collections.abc + collectionsAbc = six.moves.collections_abc except AttributeError: collectionsAbc = collections @@ -36,7 +37,6 @@ def __init__(self, config, df_feats, process, train_class, exports_path, log_lev self.setting_logger() def setting_logger(self): - # set up logger self.logger = LoggerSetup(config=self.config, exports_path=self.exports_path, name="predict_{}".format(self.train_class), @@ -53,7 +53,7 @@ def post_processing(self): self.list_features = list(self.df_feats.columns) - exports_dir = "{}_{}".format(self.config.get("exports_directory"), self.train_class) + exports_dir = self.config.get("exports_directory") models_path = FindCreateDirectory(self.exports_path, os.path.join(exports_dir, "models")).inspect_directory() @@ -178,4 +178,4 @@ def fit(self, X, y=None): return self def transform(self, X): - return X[self.attribute_names].values \ No newline at end of file + return X[self.attribute_names].values diff --git a/models/sklearn/transformation/utils_preprocessing.py b/models/sklearn/transformation/utils_preprocessing.py index 9375e739f..097beceab 100644 --- a/models/sklearn/transformation/utils_preprocessing.py +++ b/models/sklearn/transformation/utils_preprocessing.py @@ -1,18 +1,15 @@ -import os import re -import pandas as pd import collections -from sklearn.preprocessing import OneHotEncoder -import joblib -from utils import load_yaml, FindCreateDirectory, TrainingProcesses def flatten_dict_full(dictionary, sep="_"): """ + Args: + dictionary: + sep: + + Returns: - :param dictionary: - :param sep: - :return: """ obj = collections.OrderedDict() @@ -33,9 +30,11 @@ def recurse(t, parent_key=""): def list_descr_handler(descr_list): """ + Args: + descr_list: + + Returns: - :param descr_list: - :return: """ keys_list_handle = [] for item in descr_list: @@ -52,10 +51,12 @@ def list_descr_handler(descr_list): def feats_selector_list(df_feats_columns, feats_select_list): """ + Args: + df_feats_columns: + feats_select_list: + + Returns: - :param df_feats_columns: - :param feats_select_list: - :return: """ columns_list = list(df_feats_columns) columns_select_list = [] diff --git a/requirements.txt b/requirements.txt index 
6d9687498..61aef1103 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,4 @@ pyyaml==5.3.1 rauth == 0.7.3 setproctitle == 1.1.10 six==1.14.0 --r docs/requirements.txt +-r docs/requirements.txt \ No newline at end of file From 54ab98724862eee683d613d15752948b4fd3ff51 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Thu, 20 Aug 2020 12:08:50 +0300 Subject: [PATCH 03/64] readme added --- models/sklearn/README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/models/sklearn/README.md b/models/sklearn/README.md index b5b4ae443..4cda04d9d 100644 --- a/models/sklearn/README.md +++ b/models/sklearn/README.md @@ -1,6 +1,6 @@ # Machine Learning Infrastructure with scikit-learn (GSoC 2020) -This repository contains the tool that is built for training SVM models of +This folder contains the tool that is built for training SVM models of AcousticBrainz's datasets, as well as predicting where a single AcousticBrainz track instance can be classified based on the trained models. It is part of the *Google Summer of Code 2020* in collaboration with the **MetaBrainz** Open-Source @@ -62,7 +62,7 @@ verbose Controls the verbosity (int) of the Grid Search print me on the console: the higher, the more messages. ``` -For example, a path directory structure could be like this one: +For example, a dataset path directory structure could be like this one: dataset (e.g. danceability) |- features @@ -227,6 +227,3 @@ that are followed in this mode are: * the predicted class * the score of the predicted class * the probabilities for each class the model took to decide to which one the track will be classified. - - - From c394fc4bd8acc9f7c4700831b76799a3f233f3db Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Thu, 20 Aug 2020 18:06:00 +0300 Subject: [PATCH 04/64] python 3 installation --- docker/Dockerfile.py3 => Dockerfile.py3 | 19 +++---------------- docker/docker-compose.dev.yml | 13 +++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) rename docker/Dockerfile.py3 => Dockerfile.py3 (81%) diff --git a/docker/Dockerfile.py3 b/Dockerfile.py3 similarity index 81% rename from docker/Dockerfile.py3 rename to Dockerfile.py3 index 689ae742a..70018bcf5 100644 --- a/docker/Dockerfile.py3 +++ b/Dockerfile.py3 @@ -48,21 +48,8 @@ RUN useradd --create-home --shell /bin/bash --uid 901 --gid 901 acousticbrainz RUN chown acousticbrainz:acousticbrainz /code # Python dependencies -RUN mkdir /code/docs/ && chown acousticbrainz:acousticbrainz /code/docs/ -COPY --chown=acousticbrainz:acousticbrainz docs/requirements.txt /code/docs/requirements.txt -COPY --chown=acousticbrainz:acousticbrainz requirements.txt /code/requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - - -FROM acousticbrainz-base AS acousticbrainz-dev - -COPY --chown=acousticbrainz:acousticbrainz requirements_development.txt /code/requirements_development.txt -RUN pip install --no-cache-dir -r requirements_development.txt - -# install sklearn ML tool requirements -RUN mkdir /code/models/sklearn/ && chown acousticbrainz:acousticbrainz /code/models/sklearn/ COPY --chown=acousticbrainz:acousticbrainz models/sklearn/requirements.txt /code/models/sklearn/requirements.txt -RUN pip install --no-cache-dir -r requirements.txt +RUN pip install --no-cache-dir -r /code/models/sklearn/requirements.txt # We don't copy code to the dev image because it's added with a volume mount @@ -72,7 +59,7 @@ FROM acousticbrainz-dev AS acousticbrainz-test COPY . 
/code -FROM acousticbrainz-base AS acousticbrainz-prod +FROM acousticbrainz-sklearn AS acousticbrainz-prod USER root RUN pip install --no-cache-dir uWSGI==2.0.17.1 @@ -117,4 +104,4 @@ COPY --chown=acousticbrainz:acousticbrainz . /code RUN npm run build:prod # Our entrypoint runs as root -USER root +USER root \ No newline at end of file diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index c900ff68c..a38dedac1 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -62,3 +62,16 @@ services: - ../data/files:/data/files depends_on: - db + + dataset_evaluator_sklearn: + build: + context: .. + dockerfile: Dockerfile.py3 + target: acousticbrainz-sklearn + command: python2 worker_manage.py dataset_evaluator + volumes: + - ../:/code + - ../data/datasets:/data/datasets + - ../data/files:/data/files + depends_on: + - db From 7b81889021adf43827619ce59ded08ddfc24991d Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Fri, 21 Aug 2020 22:57:49 +0300 Subject: [PATCH 05/64] add init.py in the sklearn dir for relative importing --- models/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 models/__init__.py diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 000000000..40a96afc6 --- /dev/null +++ b/models/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- From 6d0957d96b7c7091ac3e8a3370f16c9a304ab025 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Mon, 24 Aug 2020 16:07:01 +0300 Subject: [PATCH 06/64] add environment variable for sklearn container --- dataset_eval/evaluate.py | 13 +++++++++---- docker/docker-compose.dev.yml | 2 ++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 90e0b4948..c5e20831d 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -18,7 +18,9 @@ import utils.path from dataset_eval import artistfilter from dataset_eval import gaia_wrapper -from models.sklearn.model.classification_project import create_classification_project +is_sklearn = os.getenv("MODEL_TRAINING_SKLEARN") +if is_sklearn == "1": + from models.sklearn.model.classification_project import create_classification_project SLEEP_DURATION = 30 # number of seconds to wait between runs @@ -103,9 +105,12 @@ def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, stora })) -def evaluate_sklearn(eval_location, groundtruth_path, filelist_path, storage_dir, eval_job): - # create_classification_project(ground_truth_directory=groundtruth_path) - pass +def evaluate_sklearn(eval_location, dataset_dir, storage_dir, eval_job): + create_classification_project(ground_truth_directory=dataset_dir, + project_file=eval_job["id"], + exports_directory=eval_job["id"], + exports_path=eval_location + ) def create_groundtruth_dict(name, datadict): diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index a38dedac1..0a59df257 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -68,6 +68,8 @@ services: context: .. 
dockerfile: Dockerfile.py3 target: acousticbrainz-sklearn + environment: + MODEL_TRAINING_SKLEARN: 1 command: python2 worker_manage.py dataset_evaluator volumes: - ../:/code From 2ca4d41a9910ccf7a78458fcd6d266cdff22ceed Mon Sep 17 00:00:00 2001 From: Pantelis Date: Tue, 25 Aug 2020 13:51:15 +0300 Subject: [PATCH 07/64] add c_values, gamma_values, preprocessing_values arguments in training --- dataset_eval/evaluate.py | 9 ++++-- models/sklearn/README.md | 14 ++++++++++ models/sklearn/classification/train_class.py | 28 ++++++++++++++++++- .../sklearn/model/classification_project.py | 3 +- 4 files changed, 49 insertions(+), 5 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index c5e20831d..ce6e00e21 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -92,7 +92,7 @@ def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, stora filelist_file=filelist_path, c_values=options.get("c_values", []), gamma_values=options.get("gamma_values", []), - preprocessing_values=options.get("preprocessing_values", []), + preprocessing_values=options.get("preprocessing_values", []) ) logging.info("Saving results...") save_history_file(storage_dir, results["history_path"], eval_job["id"]) @@ -105,11 +105,14 @@ def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, stora })) -def evaluate_sklearn(eval_location, dataset_dir, storage_dir, eval_job): +def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job): create_classification_project(ground_truth_directory=dataset_dir, project_file=eval_job["id"], exports_directory=eval_job["id"], - exports_path=eval_location + exports_path=eval_location, + c_values=options.get("c_values", []), + gamma_values=options.get("gamma_values", []), + preprocessing_values=options.get("preprocessing_values", []) ) diff --git a/models/sklearn/README.md b/models/sklearn/README.md index 4cda04d9d..3287d075c 100644 --- a/models/sklearn/README.md +++ b/models/sklearn/README.md @@ -46,6 +46,20 @@ path Path where the project results will be stored. If empty, optional parameters: +c_values The C values parameter (list) for the SVM Grid Search + (e.g. [-2, 3, 5, 10]). In case of None, the values will be set up + by the specified in the configuration template. + +gamma_values The gamma values parameter (list) for the SVM Grid Search + (e.g. [ 3, 1, -1, -3]). In case of None, the values will be set up + by the specified in the configuration template. + +preprocessing_values: The preprocessing values parameter (list) for the + SVM Grid Search. They must be one or more of the following list: + ["basic", "lowlevel", "nobands", "normalized", "gaussianized"] + In case of None, the values will be set up + by the specified in the configuration template. + logging The logging level (int) that will be printed (0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR, 4: CRITICAL). 
Can be set only in the prescribed integer values (0, 1, 2, 3, 4) diff --git a/models/sklearn/classification/train_class.py b/models/sklearn/classification/train_class.py index 8e3f07d70..d164e41a7 100644 --- a/models/sklearn/classification/train_class.py +++ b/models/sklearn/classification/train_class.py @@ -8,7 +8,7 @@ from ..helper_functions.logging_tool import LoggerSetup -def train_class(config, gt_file, exports_directory, log_level): +def train_class(config, gt_file, exports_directory, c_values, gamma_values, preprocessing_values, log_level): exports_path = config["exports_path"] gt_data = GroundTruthLoad(config, gt_file, exports_path, log_level) # tracks shuffled and exported @@ -25,6 +25,11 @@ def train_class(config, gt_file, exports_directory, log_level): else: config["exports_directory"] = exports_directory + config = update_parameters(config=config, + c_values=c_values, + gamma_values=gamma_values, + preprocessing_values=preprocessing_values) + logger = LoggerSetup(config=config, exports_path=exports_path, name="train_model_{}".format(class_name), @@ -72,3 +77,24 @@ def train_class(config, gt_file, exports_directory, log_level): classification_time = model_manage.apply_processing() print(colored("Classification ended successfully in {} minutes.".format(classification_time), "green")) logger.info("Classification ended successfully in {} minutes.".format(classification_time)) + + +def update_parameters(config, c_values, gamma_values, preprocessing_values): + """Update the project file with user-provided preferences + + Args: + config: The config data to be updated. + c_values: C value to be updated. + gamma_values: gamma value to be updated. + preprocessing_values: preprocessing values to be updated. + """ + for pref in config['classifiers']['svm']: + if c_values: + pref['C'] = c_values + if gamma_values: + pref['gamma'] = gamma_values + if preprocessing_values: + pref['preprocessing'] = preprocessing_values + + return config + diff --git a/models/sklearn/model/classification_project.py b/models/sklearn/model/classification_project.py index 59a0b8d9b..340b077e6 100644 --- a/models/sklearn/model/classification_project.py +++ b/models/sklearn/model/classification_project.py @@ -7,6 +7,7 @@ def create_classification_project(ground_truth_directory, project_file=None, exports_directory=None, exports_path=None, + c_values=None, gamma_values=None, preprocessing_values=None, seed=None, jobs=-1, verbose=1, logging="logging.INFO"): """ Args: @@ -74,7 +75,7 @@ def create_classification_project(ground_truth_directory, project_file=None, exp print(gt_files_list) print("LOAD GROUND TRUTH") for gt_file in gt_files_list: - train_class(project_template, gt_file, exports_directory, logging) + train_class(project_template, gt_file, exports_directory, c_values, gamma_values, preprocessing_values, logging) if __name__ == '__main__': From a988a3dc452ce4357bd33842cf4bcc2d82a495a2 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Wed, 26 Aug 2020 12:11:04 +0300 Subject: [PATCH 08/64] add field of sklearn choice in evaluation mode --- config.py.example | 3 +++ dataset_eval/evaluate.py | 10 ++++++++-- db/dataset_eval.py | 9 ++++++--- webserver/forms.py | 8 ++++++++ webserver/templates/datasets/evaluate.html | 8 ++++++++ webserver/views/datasets.py | 1 + 6 files changed, 34 insertions(+), 5 deletions(-) diff --git a/config.py.example b/config.py.example index 8c5ff4073..624af73ec 100644 --- a/config.py.example +++ b/config.py.example @@ -77,4 +77,7 @@ FEATURE_EVAL_FILTERING = True # Choose settings used 
for model training FEATURE_EVAL_MODEL_SELECTION = False +# Choose the ML tool used for model training (gaia/sklearn) +FEATURE_EVAL_TOOL_SELECTION = False + DEBUG_TB_INTERCEPT_REDIRECTS = False diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index ce6e00e21..a1bee42d3 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -62,9 +62,15 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): with open(groundtruth_path, "w") as f: yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) + evaluation_tool_selection = eval_job["options"].get("evaluation_tool_value", "gaia") + logging.info("TOOL: {}".format(evaluation_tool_selection)) + if evaluation_tool_selection == "gaia": + logging.info("Training GAIA model...") + evaluate_gaia(eval_job["options"], eval_location, groundtruth_path, filelist_path, storage_dir, eval_job) + elif evaluation_tool_selection == "sklearn": + logging.info("Training SKLEARN model...") + evaluate_sklearn(eval_job["options"], eval_location, dataset_dir, storage_dir, eval_job) - logging.info("Training GAIA model...") - evaluate_gaia(eval_job["options"], eval_location, groundtruth_path, filelist_path, storage_dir, eval_job) db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE) logging.info("Evaluation job %s has been completed." % eval_job["id"]) diff --git a/db/dataset_eval.py b/db/dataset_eval.py index aed4e97d3..ef7e69ef9 100644 --- a/db/dataset_eval.py +++ b/db/dataset_eval.py @@ -46,7 +46,7 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_values=None, - preprocessing_values=None, filter_type=None): + preprocessing_values=None, filter_type=None, evaluation_tool_value="gaia"): """Add dataset into evaluation queue. Args: @@ -67,6 +67,7 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_ filter_type: Optional filtering that will be applied to the dataset. See FILTER_* variables in this module for a list of existing filters. + evaluation_tool_value (optional): A string choice between two strings, gaia or sklearn. 
Raises: JobExistsException: if the dataset has already been submitted for evaluation @@ -90,7 +91,8 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_ # Validate dataset contents validate_dataset_contents(db.dataset.get(dataset_id)) return _create_job(connection, dataset_id, normalize, eval_location, - c_values, gamma_values, preprocessing_values, filter_type) + c_values, gamma_values, preprocessing_values, filter_type, + evaluation_tool_value) def job_exists(dataset_id): @@ -330,7 +332,7 @@ def add_dataset_eval_set(connection, data): def _create_job(connection, dataset_id, normalize, eval_location, c_value, - gamma_value, preprocessing_values, filter_type): + gamma_value, preprocessing_values, filter_type, evaluation_tool_value): if not isinstance(normalize, bool): raise ValueError("Argument 'normalize' must be a boolean.") if filter_type is not None: @@ -345,6 +347,7 @@ def _create_job(connection, dataset_id, normalize, eval_location, c_value, "c_values": c_value, "gamma_values": gamma_value, "preprocessing_values": preprocessing_values, + "evaluation_tool_value": evaluation_tool_value } snapshot_id = db.dataset.create_snapshot(dataset_id) diff --git a/webserver/forms.py b/webserver/forms.py index d13b533b6..4b2b50dd3 100644 --- a/webserver/forms.py +++ b/webserver/forms.py @@ -9,6 +9,9 @@ DATASET_EVAL_LOCAL = "local" DATASET_EVAL_REMOTE = "remote" +DATASET_TOOL_EVALUATION_GAIA = "gaia" +DATASET_TOOL_EVALUATION_SKLEARN = "sklearn" + DATASET_PENDING = "pending" DATASET_RUNNING = "running" DATASET_DONE = "done" @@ -59,6 +62,11 @@ class DatasetEvaluationForm(FlaskForm): render_kw={"data-toggle": "collapse", "data-target": "#collapseSvmOptions"}) + evaluation_tool_value = SelectField("What tool do you want to use (sklearn/gaia)", choices=[ + (DATASET_TOOL_EVALUATION_GAIA, "gaia"), + (DATASET_TOOL_EVALUATION_SKLEARN, "sklearn")], + default=DATASET_TOOL_EVALUATION_GAIA) + # C parameter to SVM c_value = StringField('C Values', default=DATASET_C_VALUE, render_kw={"data-default": DATASET_C_VALUE}) diff --git a/webserver/templates/datasets/evaluate.html b/webserver/templates/datasets/evaluate.html index 177c20b05..3793915f1 100644 --- a/webserver/templates/datasets/evaluate.html +++ b/webserver/templates/datasets/evaluate.html @@ -86,6 +86,14 @@

Evaluate dataset "{{ dataset['name'] }}"

 {% endif %}
+  {% if config.get('FEATURE_EVAL_TOOL_SELECTION') %}
+    <div class="form-group">
+      {{ form.evaluation_tool_value.label }}
+      <div>
+        {{ form.evaluation_tool_value(class="form-control", required="required") }}
+      </div>
+    </div>
+  {% endif %}
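
To see the whole option flow end to end, here is a hedged sketch of queuing an evaluation with the sklearn tool selected, mirroring the `evaluate_dataset` signature introduced in this patch series; the dataset id below is a hypothetical placeholder:

```python
import db.dataset_eval

# Sketch only: queue an evaluation job that the sklearn worker will pick up.
db.dataset_eval.evaluate_dataset(
    dataset_id="some-dataset-uuid",              # hypothetical placeholder
    normalize=True,
    eval_location="local",
    c_values=[-3, -1, 1, 3],                     # exponents of 2, as in the template
    gamma_values=[3, 1, -1, -3],                 # exponents of 2
    preprocessing_values=["basic", "normalized"],
    filter_type=None,
    evaluation_tool_value="sklearn",             # routes the job to evaluate_sklearn()
)
```

The worker then reads `evaluation_tool_value` from the job options and dispatches to the gaia or sklearn path accordingly.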
diff --git a/webserver/views/datasets.py b/webserver/views/datasets.py index 931734219..9015117d5 100644 --- a/webserver/views/datasets.py +++ b/webserver/views/datasets.py @@ -253,6 +253,7 @@ def evaluate(dataset_id): gamma_values=gamma_values, preprocessing_values=preprocessing_values, filter_type=form.filter_type.data, + evaluation_tool_value=form.evaluation_tool_value.data ) flash.info("Dataset %s has been added into evaluation queue." % ds["id"]) except db.dataset_eval.IncompleteDatasetException as e: From ab3657fbeae93a7f0a039867d7a88e535c40105b Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Thu, 27 Aug 2020 17:28:09 +0300 Subject: [PATCH 09/64] add all requirements.txt --- Dockerfile.py3 | 15 ++++++++++++++- requirements.txt | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Dockerfile.py3 b/Dockerfile.py3 index 70018bcf5..aedff5c76 100644 --- a/Dockerfile.py3 +++ b/Dockerfile.py3 @@ -48,10 +48,23 @@ RUN useradd --create-home --shell /bin/bash --uid 901 --gid 901 acousticbrainz RUN chown acousticbrainz:acousticbrainz /code # Python dependencies +RUN mkdir /code/docs/ && chown acousticbrainz:acousticbrainz /code/docs/ +COPY --chown=acousticbrainz:acousticbrainz docs/requirements.txt /code/docs/requirements.txt +COPY --chown=acousticbrainz:acousticbrainz requirements.txt /code/requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + + +FROM acousticbrainz-sklearn AS acousticbrainz-dev + +COPY --chown=acousticbrainz:acousticbrainz requirements_development.txt /code/requirements_development.txt +RUN pip install --no-cache-dir -r requirements_development.txt + +# Python dependencies for sklearn COPY --chown=acousticbrainz:acousticbrainz models/sklearn/requirements.txt /code/models/sklearn/requirements.txt RUN pip install --no-cache-dir -r /code/models/sklearn/requirements.txt + # We don't copy code to the dev image because it's added with a volume mount # during development, however it's needed for tests. Add it here. FROM acousticbrainz-dev AS acousticbrainz-test @@ -104,4 +117,4 @@ COPY --chown=acousticbrainz:acousticbrainz . 
/code RUN npm run build:prod # Our entrypoint runs as root -USER root \ No newline at end of file +USER root diff --git a/requirements.txt b/requirements.txt index 87f82f554..5c7a3b1ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ Flask-Login==0.5.0 Flask-SQLAlchemy==2.4.1 Flask-Testing==0.8.0 Flask-WTF == 0.14.3 -futures==3.3.0 +futures == 3.3.0; python_version < '3.0' mock==3.0.5 musicbrainzngs==0.7.1 ndg-httpsclient==0.5.1 From 255d310583e921bf1941c84005e7e1fcf48683bf Mon Sep 17 00:00:00 2001 From: Pantelis Date: Fri, 28 Aug 2020 16:28:00 +0300 Subject: [PATCH 10/64] add new env in dataset_evaluator for gaia --- dataset_eval/evaluate.py | 10 ++++++++-- docker/docker-compose.dev.yml | 4 +++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index a1bee42d3..5d09866d1 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -7,7 +7,6 @@ import tempfile import time -import gaia2.fastyaml as yaml from flask import current_app import db @@ -16,11 +15,18 @@ import db.dataset_eval import db.exceptions import utils.path +import yaml from dataset_eval import artistfilter -from dataset_eval import gaia_wrapper + is_sklearn = os.getenv("MODEL_TRAINING_SKLEARN") if is_sklearn == "1": from models.sklearn.model.classification_project import create_classification_project + +is_gaia = os.getenv("MODEL_TRAINING_GAIA") +if is_gaia == "1": + # import gaia2.fastyaml as yaml + from dataset_eval import gaia_wrapper + SLEEP_DURATION = 30 # number of seconds to wait between runs diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 0a59df257..ec4945f9d 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -55,6 +55,8 @@ services: context: .. 
dockerfile: Dockerfile target: acousticbrainz-dev + environment: + MODEL_TRAINING_GAIA: 1 command: python2 worker_manage.py dataset_evaluator volumes: - ../:/code @@ -70,7 +72,7 @@ services: target: acousticbrainz-sklearn environment: MODEL_TRAINING_SKLEARN: 1 - command: python2 worker_manage.py dataset_evaluator + command: python3 worker_manage.py dataset_evaluator volumes: - ../:/code - ../data/datasets:/data/datasets From cb08299a6fdd521a556dbd6cfea0745d81651ed0 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Fri, 28 Aug 2020 17:48:26 +0300 Subject: [PATCH 11/64] PEP* issues fixed --- models/sklearn/classification/classification_task_manager.py | 2 +- models/sklearn/classification/train_class.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/models/sklearn/classification/classification_task_manager.py b/models/sklearn/classification/classification_task_manager.py index 935bce944..94f78ce02 100644 --- a/models/sklearn/classification/classification_task_manager.py +++ b/models/sklearn/classification/classification_task_manager.py @@ -85,7 +85,7 @@ def files_existence(self): os.path.join(self.exports_dir, "images")).inspect_directory() # reports self.reports_path = FindCreateDirectory(self.exports_path, - os.path.join(self.exports_dir, "reports")).inspect_directory() + os.path.join(self.exports_dir, "reports")).inspect_directory() def config_file_analysis(self): """ diff --git a/models/sklearn/classification/train_class.py b/models/sklearn/classification/train_class.py index d164e41a7..74c7eaba3 100644 --- a/models/sklearn/classification/train_class.py +++ b/models/sklearn/classification/train_class.py @@ -36,9 +36,7 @@ def train_class(config, gt_file, exports_directory, c_values, gamma_values, prep train_class=class_name, mode="w", level=log_level).setup_logger() - logger.info("---- TRAINING FOR THE {} MODEL HAS JUST STARTED ----".format(class_name)) - logger.debug("Type of exported GT data exported: {}".format(type(tracks_listed_shuffled))) # name the project file @@ -97,4 +95,3 @@ def update_parameters(config, c_values, gamma_values, preprocessing_values): pref['preprocessing'] = preprocessing_values return config - From 330f880b440dae113e5c77235a9db02bfe42e194 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Fri, 28 Aug 2020 18:08:39 +0300 Subject: [PATCH 12/64] modify print --- dataset_eval/artistfilter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_eval/artistfilter.py b/dataset_eval/artistfilter.py index a8162ac2c..5a6fde0d0 100644 --- a/dataset_eval/artistfilter.py +++ b/dataset_eval/artistfilter.py @@ -28,7 +28,7 @@ def print_datadict_summary(datadict): for r, cls in datadict.items(): counter[cls] += 1 for cls, count in counter.most_common(): - print "%s\t\t%s" % (cls, count) + print("%s\t\t%s" % (cls, count)) def normalise_datadict(datadict, cut_to): """Take a dictionary of groundtruth and cut all classes to From a23a40a393d86b0a7d94b7b7dd7cfc66be791b7b Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Fri, 28 Aug 2020 21:07:58 +0300 Subject: [PATCH 13/64] change section where sklearn dependencies are loaded --- Dockerfile.py3 | 9 ++++----- dataset_eval/artistfilter.py | 1 + webserver/__init__.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile.py3 b/Dockerfile.py3 index aedff5c76..5460cf91f 100644 --- a/Dockerfile.py3 +++ b/Dockerfile.py3 @@ -53,17 +53,16 @@ COPY --chown=acousticbrainz:acousticbrainz docs/requirements.txt /code/docs/requ COPY 
--chown=acousticbrainz:acousticbrainz requirements.txt /code/requirements.txt RUN pip install --no-cache-dir -r requirements.txt +# Python dependencies for sklearn +COPY --chown=acousticbrainz:acousticbrainz models/sklearn/requirements.txt /code/models/sklearn/requirements.txt +RUN pip install --no-cache-dir -r /code/models/sklearn/requirements.txt + FROM acousticbrainz-sklearn AS acousticbrainz-dev COPY --chown=acousticbrainz:acousticbrainz requirements_development.txt /code/requirements_development.txt RUN pip install --no-cache-dir -r requirements_development.txt -# Python dependencies for sklearn -COPY --chown=acousticbrainz:acousticbrainz models/sklearn/requirements.txt /code/models/sklearn/requirements.txt -RUN pip install --no-cache-dir -r /code/models/sklearn/requirements.txt - - # We don't copy code to the dev image because it's added with a volume mount # during development, however it's needed for tests. Add it here. diff --git a/dataset_eval/artistfilter.py b/dataset_eval/artistfilter.py index 5a6fde0d0..c534edaf3 100644 --- a/dataset_eval/artistfilter.py +++ b/dataset_eval/artistfilter.py @@ -1,3 +1,4 @@ +from __future__ import print_function import collections import json import logging diff --git a/webserver/__init__.py b/webserver/__init__.py index 796dc14ca..88dc1e476 100644 --- a/webserver/__init__.py +++ b/webserver/__init__.py @@ -8,7 +8,7 @@ import os import time -import urlparse +from six.moves import urllib API_PREFIX = '/api/' @@ -137,7 +137,7 @@ def after_request_callbacks(response): def prod_https_login_redirect(): """ Redirect to HTTPS in production except for the API endpoints """ - if urlparse.urlsplit(request.url).scheme == 'http' \ + if urllib.parse.urlsplit(request.url).scheme == 'http' \ and app.config['DEBUG'] == False \ and app.config['TESTING'] == False \ and request.blueprint not in ('api', 'api_v1_core', 'api_v1_datasets', 'api_v1_dataset_eval'): From daaaf23e50291bc92e74ef40cf7c0f628bf2f376 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Mon, 31 Aug 2020 19:06:36 +0300 Subject: [PATCH 14/64] Python 3 compatibility issues fix - GSoC commit --- webserver/__init__.py | 2 +- webserver/views/datasets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/webserver/__init__.py b/webserver/__init__.py index 88dc1e476..b8a826875 100644 --- a/webserver/__init__.py +++ b/webserver/__init__.py @@ -110,7 +110,7 @@ def after_request_callbacks(response): init_error_handlers(app) # Static files - import static_manager + from webserver import static_manager # Template utilities app.jinja_env.add_extension('jinja2.ext.do') diff --git a/webserver/views/datasets.py b/webserver/views/datasets.py index 9015117d5..8f5ab45a7 100644 --- a/webserver/views/datasets.py +++ b/webserver/views/datasets.py @@ -15,7 +15,7 @@ import csv import math import six -import StringIO +from six import StringIO from webserver.views.api.exceptions import APIUnauthorized # Below values are defined in 'classification_project_template.yaml' file. 
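Patches 15-18 below all iterate on one idea: the submitted tool choice is stored in the JSONB options column of dataset_eval_jobs, and Postgres' ->> operator extracts it as text so a worker can ask only for jobs meant for it. After two broken intermediate queries the series settles on a bound parameter; a sketch of that final shape (patch 18), with stand-ins flagged in comments:

    import sqlalchemy

    STATUS_PENDING = "pending"  # assumed to match db.dataset_eval.STATUS_PENDING
    EVAL_COLUMNS_COMMA_SEPARATED = "dataset_eval_jobs.*"  # stand-in for the real column list

    def get_next_pending_job(connection, evaluation_tool_value="gaia"):
        # ->> pulls a JSONB field out as text, so the stored tool name can be
        # compared through an ordinary bind parameter, not string interpolation.
        query = sqlalchemy.text("""
            SELECT %s
              FROM dataset_eval_jobs
              JOIN dataset_snapshot
                ON dataset_snapshot.id = dataset_eval_jobs.snapshot_id
             WHERE status = :status
               AND eval_location = 'local'
               AND options->>'evaluation_tool_value' = :evaluation_tool_value
             ORDER BY created ASC
             LIMIT 1
        """ % EVAL_COLUMNS_COMMA_SEPARATED)
        result = connection.execute(query, {"status": STATUS_PENDING,
                                            "evaluation_tool_value": evaluation_tool_value})
        row = result.fetchone()
        return dict(row) if row else None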
From a3cf6cb883fbe09f1c15fedd7c1616a2bf5db21e Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Thu, 3 Sep 2020 18:24:20 +0300 Subject: [PATCH 15/64] querying dataset_eval_jobs with evaluation_tool_value --- db/dataset_eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/db/dataset_eval.py b/db/dataset_eval.py index ef7e69ef9..72bcdce10 100644 --- a/db/dataset_eval.py +++ b/db/dataset_eval.py @@ -166,7 +166,7 @@ def validate_dataset_contents(dataset): ) -def get_next_pending_job(): +def get_next_pending_job(evaluation_tool_value="gaia"): """ Get the earliest submitted job which is still in the pending state. @@ -181,9 +181,10 @@ ON dataset_snapshot.id = dataset_eval_jobs.snapshot_id WHERE status = :status AND eval_location = 'local' + AND options->>'evaluation_tool_value' = %s ORDER BY created ASC LIMIT 1 - """ % EVAL_COLUMNS_COMMA_SEPARATED) + """ % (EVAL_COLUMNS_COMMA_SEPARATED, evaluation_tool_value)) result = connection.execute(query, {"status": STATUS_PENDING}) row = result.fetchone() return dict(row) if row else None From ad75c7c7b0eb46bcae737e25bcf7eb12ef311e71 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Thu, 3 Sep 2020 20:10:17 +0300 Subject: [PATCH 16/64] query dataset_eval_jobs with tool value fixed --- db/dataset_eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/dataset_eval.py b/db/dataset_eval.py index 72bcdce10..f945a2689 100644 --- a/db/dataset_eval.py +++ b/db/dataset_eval.py @@ -181,11 +181,11 @@ def get_next_pending_job(evaluation_tool_value="gaia"): ON dataset_snapshot.id = dataset_eval_jobs.snapshot_id WHERE status = :status AND eval_location = 'local' - AND options->>'evaluation_tool_value' = %s + AND options->>AND options->>'evaluation_tool_value' = :evaluation_tool_value ORDER BY created ASC LIMIT 1 - """ % (EVAL_COLUMNS_COMMA_SEPARATED, evaluation_tool_value)) - result = connection.execute(query, {"status": STATUS_PENDING}) + """ % EVAL_COLUMNS_COMMA_SEPARATED) + result = connection.execute(query, {"status": STATUS_PENDING, "evaluation_tool_value": evaluation_tool_value}) row = result.fetchone() return dict(row) if row else None From 12ce6f921a8aefc5261f623f37d4229cf5b9d30c Mon Sep 17 00:00:00 2001 From: Pantelis Date: Fri, 4 Sep 2020 14:16:53 +0300 Subject: [PATCH 17/64] add evaluation tool choice from environment variable --- dataset_eval/evaluate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 5d09866d1..f46512852 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -18,14 +18,17 @@ import yaml from dataset_eval import artistfilter +eval_tool_use = "gaia" is_sklearn = os.getenv("MODEL_TRAINING_SKLEARN") if is_sklearn == "1": from models.sklearn.model.classification_project import create_classification_project + eval_tool_use = "sklearn" is_gaia = os.getenv("MODEL_TRAINING_GAIA") if is_gaia == "1": # import gaia2.fastyaml as yaml from dataset_eval import gaia_wrapper + eval_tool_use = "gaia" SLEEP_DURATION = 30 # number of seconds to wait between runs @@ -35,7 +38,7 @@ def main(): dataset_dir = current_app.config["DATASET_DIR"] storage_dir = os.path.join(current_app.config["FILE_STORAGE_DIR"], "history") while True: - pending_job = db.dataset_eval.get_next_pending_job() + pending_job = db.dataset_eval.get_next_pending_job(eval_tool_use) if pending_job: logging.info("Processing job %s..."
% pending_job["id"]) evaluate_dataset(pending_job, dataset_dir, storage_dir) From cbf4c85155f69d0c1b2a3d11d9a3cb7a22380357 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Fri, 4 Sep 2020 18:42:23 +0300 Subject: [PATCH 18/64] fix query for getting the next pending job --- db/dataset_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/dataset_eval.py b/db/dataset_eval.py index f945a2689..16a9cb69f 100644 --- a/db/dataset_eval.py +++ b/db/dataset_eval.py @@ -181,7 +181,7 @@ def get_next_pending_job(evaluation_tool_value="gaia"): ON dataset_snapshot.id = dataset_eval_jobs.snapshot_id WHERE status = :status AND eval_location = 'local' - AND options->>AND options->>'evaluation_tool_value' = :evaluation_tool_value + AND options->>'evaluation_tool_value' = :evaluation_tool_value ORDER BY created ASC LIMIT 1 """ % EVAL_COLUMNS_COMMA_SEPARATED) From a323fce69ed26671738e6a6a4195a54850e84ee7 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Mon, 7 Sep 2020 18:49:06 +0300 Subject: [PATCH 19/64] dump yaml with SafeDumper --- dataset_eval/evaluate.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index f46512852..10d242d11 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -69,7 +69,8 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): logging.info("Generating groundtruth.yaml...") groundtruth_path = os.path.join(eval_location, "groundtruth.yaml") with open(groundtruth_path, "w") as f: - yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) + # yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) + yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f, Dumper=yaml.SafeDumper) evaluation_tool_selection = eval_job["options"].get("evaluation_tool_value", "gaia") logging.info("TOOL: {}".format(evaluation_tool_selection)) @@ -135,13 +136,13 @@ def create_groundtruth_dict(name, datadict): groundtruth = { "type": "unknown", # TODO: See if that needs to be modified. "version": 1.0, - "className": db.dataset._slugify(unicode(name)), + "className": db.dataset._slugify(name), "groundTruth": {}, } for r, cls in datadict.items(): - if isinstance(r, unicode): - r = r.encode("UTF-8") - groundtruth["groundTruth"][r] = cls.encode("UTF-8") + # if isinstance(r, unicode): + # r = r.encode("UTF-8") + groundtruth["groundTruth"][r] = cls return groundtruth @@ -150,12 +151,12 @@ def create_groundtruth(dataset): groundtruth = { "type": "unknown", # TODO: See if that needs to be modified. 
"version": 1.0, - "className": db.dataset._slugify(unicode(dataset["name"])), + "className": db.dataset._slugify(dataset["name"]), "groundTruth": {}, } for cls in dataset["classes"]: for recording_mbid in cls["recordings"]: - groundtruth["groundTruth"][recording_mbid] = cls["name"].encode("UTF-8") + groundtruth["groundTruth"][recording_mbid] = cls["name"] return groundtruth From db33b66f6f9d2849da3b4f34bec3d775d3347adc Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Fri, 11 Sep 2020 17:12:32 +0300 Subject: [PATCH 20/64] store locally json files of low-data - 01 --- dataset_eval/evaluate.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 10d242d11..63c16675e 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -54,17 +54,23 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): utils.path.create_path(eval_location) temp_dir = tempfile.mkdtemp() + evaluation_tool_selection = eval_job["options"].get("evaluation_tool_value", "gaia") + logging.info("TOOL: {}".format(evaluation_tool_selection)) + try: snapshot = db.dataset.get_snapshot(eval_job["snapshot_id"]) train, test = artistfilter.filter(eval_job["snapshot_id"], eval_job["options"]) db.dataset_eval.add_sets_to_job(eval_job["id"], train, test) - logging.info("Generating filelist.yaml and copying low-level data for evaluation...") - filelist_path = os.path.join(eval_location, "filelist.yaml") - filelist = dump_lowlevel_data(train.keys(), temp_dir) - with open(filelist_path, "w") as f: - yaml.dump(filelist, f) + if evaluation_tool_selection == "gaia": + logging.info("Generating filelist.yaml and copying low-level data for evaluation...") + filelist_path = os.path.join(eval_location, "filelist.yaml") + filelist = dump_lowlevel_data(train.keys(), temp_dir) + with open(filelist_path, "w") as f: + yaml.dump(filelist, f) + elif evaluation_tool_selection == "sklearn": + dump_lowlevel_data_sklearn(train.keys(), temp_dir) logging.info("Generating groundtruth.yaml...") groundtruth_path = os.path.join(eval_location, "groundtruth.yaml") @@ -72,8 +78,6 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): # yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f, Dumper=yaml.SafeDumper) - evaluation_tool_selection = eval_job["options"].get("evaluation_tool_value", "gaia") - logging.info("TOOL: {}".format(evaluation_tool_selection)) if evaluation_tool_selection == "gaia": logging.info("Training GAIA model...") evaluate_gaia(eval_job["options"], eval_location, groundtruth_path, filelist_path, storage_dir, eval_job) @@ -195,6 +199,22 @@ def lowlevel_data_to_yaml(data): return yaml.dump(data) +def dump_lowlevel_data_sklearn(recordings, location): + """Dumps low-level data to JSON for all recordings into specified location. + + Args: + recordings: List of MBIDs of recordings. + location: Path to directory where low-level data will be saved. 
+ + """ + utils.path.create_path(location) + filelist = {} + for recording in recordings: + filelist[recording] = os.path.join(location, "%s.json" % recording) + with open(filelist[recording], 'w') as outfile: + json.dump(lowlevel_data_to_yaml(db.data.load_low_level(recording)), outfile) + + def extract_recordings(dataset): """Extracts set of recordings used in a dataset.""" recordings = set() From 44c7390bebd02bf5ffd7b262262c8671e5dd4e1f Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Mon, 14 Sep 2020 15:56:40 +0300 Subject: [PATCH 21/64] dump recordings low-level in json files --- dataset_eval/evaluate.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 63c16675e..541fd17f8 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -178,6 +178,7 @@ def dump_lowlevel_data(recordings, location): filelist = {} for recording in recordings: filelist[recording] = os.path.join(location, "%s.yaml" % recording) + logging.INFO("RECORDING PATH: {}".format(filelist[recording])) with open(filelist[recording], "w") as f: f.write(lowlevel_data_to_yaml(db.data.load_low_level(recording))) return filelist @@ -212,7 +213,22 @@ def dump_lowlevel_data_sklearn(recordings, location): for recording in recordings: filelist[recording] = os.path.join(location, "%s.json" % recording) with open(filelist[recording], 'w') as outfile: - json.dump(lowlevel_data_to_yaml(db.data.load_low_level(recording)), outfile) + json.dump(lowlevel_data_cleaning(db.data.load_low_level(recording)), outfile) + logging.info("JSON data stored successfully.") + +def lowlevel_data_cleaning(data): + """Prepares dictionary with low-level data about recording for processing. + """ + # Removing descriptors, that will otherwise break gaia_fusion due to + # incompatibility of layouts (see Gaia implementation for more details). 
+ if "tags" in data["metadata"]: + del data["metadata"]["tags"] + if "sample_rate" in data["metadata"]["audio_properties"]: + del data["metadata"]["audio_properties"]["sample_rate"] + if 'lossless' in data['metadata']['audio_properties']: + del data['metadata']['audio_properties']['lossless'] + + return data def extract_recordings(dataset): From ad1114a28e830271c2dbf29ab022352789b2c659 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Mon, 14 Sep 2020 16:46:45 +0300 Subject: [PATCH 22/64] adding prints for paths of data storage --- dataset_eval/evaluate.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 541fd17f8..bb3f07a72 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -36,7 +36,9 @@ def main(): logging.info("Starting dataset evaluator...") dataset_dir = current_app.config["DATASET_DIR"] + logging.info("Dataset dir path: {}".format(dataset_dir)) storage_dir = os.path.join(current_app.config["FILE_STORAGE_DIR"], "history") + logging.info("Storage dir path: {}".format(storage_dir)) while True: pending_job = db.dataset_eval.get_next_pending_job(eval_tool_use) if pending_job: @@ -83,7 +85,11 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): evaluate_gaia(eval_job["options"], eval_location, groundtruth_path, filelist_path, storage_dir, eval_job) elif evaluation_tool_selection == "sklearn": logging.info("Training SKLEARN model...") - evaluate_sklearn(eval_job["options"], eval_location, dataset_dir, storage_dir, eval_job) + evaluate_sklearn(options=eval_job["options"], + eval_location=eval_location, + dataset_dir=temp_dir, + storage_dir=storage_dir, + eval_job=eval_job) db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE) logging.info("Evaluation job %s has been completed." 
% eval_job["id"]) @@ -178,7 +184,6 @@ def dump_lowlevel_data(recordings, location): filelist = {} for recording in recordings: filelist[recording] = os.path.join(location, "%s.yaml" % recording) - logging.INFO("RECORDING PATH: {}".format(filelist[recording])) with open(filelist[recording], "w") as f: f.write(lowlevel_data_to_yaml(db.data.load_low_level(recording))) return filelist @@ -211,7 +216,9 @@ def dump_lowlevel_data_sklearn(recordings, location): utils.path.create_path(location) filelist = {} for recording in recordings: + logging.info("Recording: {}".format(recording)) filelist[recording] = os.path.join(location, "%s.json" % recording) + logging.info("Recoding path: {}".format(filelist[recording])) with open(filelist[recording], 'w') as outfile: json.dump(lowlevel_data_cleaning(db.data.load_low_level(recording)), outfile) logging.info("JSON data stored successfully.") @@ -227,7 +234,7 @@ def lowlevel_data_cleaning(data): del data["metadata"]["audio_properties"]["sample_rate"] if 'lossless' in data['metadata']['audio_properties']: del data['metadata']['audio_properties']['lossless'] - + # logging.info("Data: {}".format(data)) return data From 840a9c60e6285c9cf0534bbe367bd21c5f13445d Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Mon, 14 Sep 2020 18:07:32 +0300 Subject: [PATCH 23/64] experimenting with the inputs for sklearn --- dataset_eval/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index bb3f07a72..00ec62aa1 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -87,7 +87,7 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): logging.info("Training SKLEARN model...") evaluate_sklearn(options=eval_job["options"], eval_location=eval_location, - dataset_dir=temp_dir, + dataset_dir=dataset_dir, storage_dir=storage_dir, eval_job=eval_job) From a8ae48e8ea06a4acd5c591aaf8d822c205c35f79 Mon Sep 17 00:00:00 2001 From: Alastair Porter Date: Thu, 24 Sep 2020 17:35:20 +0200 Subject: [PATCH 24/64] Rename evaluation_tool to training_tool Also move the selection of the tool to the advanced svm settings --- admin/sql/create_indexes.sql | 2 ++ .../20200924-dataset-eval-job-tool-index.sql | 3 +++ dataset_eval/evaluate.py | 11 +++++------ db/dataset_eval.py | 16 ++++++++-------- webserver/forms.py | 2 +- webserver/templates/datasets/evaluate.html | 16 +++++++--------- webserver/views/datasets.py | 2 +- 7 files changed, 27 insertions(+), 25 deletions(-) create mode 100644 admin/updates/20200924-dataset-eval-job-tool-index.sql diff --git a/admin/sql/create_indexes.sql b/admin/sql/create_indexes.sql index 1d5d681ab..4d71b9dd5 100644 --- a/admin/sql/create_indexes.sql +++ b/admin/sql/create_indexes.sql @@ -26,4 +26,6 @@ CREATE UNIQUE INDEX lower_musicbrainz_id_ndx_user ON "user" (lower(musicbrainz_i CREATE INDEX collected_ndx_statistics ON statistics (collected); +CREATE INDEX training_tool_dataset_eval_jobs ON dataset_eval_jobs((options->>'training_tool')); + COMMIT; diff --git a/admin/updates/20200924-dataset-eval-job-tool-index.sql b/admin/updates/20200924-dataset-eval-job-tool-index.sql new file mode 100644 index 000000000..4667ed757 --- /dev/null +++ b/admin/updates/20200924-dataset-eval-job-tool-index.sql @@ -0,0 +1,3 @@ +BEGIN; +CREATE INDEX training_tool_dataset_eval_jobs ON dataset_eval_jobs((options->>'training_tool')); +COMMIT; \ No newline at end of file diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 00ec62aa1..1ff9acb12 100644 --- 
a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -56,8 +56,7 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): utils.path.create_path(eval_location) temp_dir = tempfile.mkdtemp() - evaluation_tool_selection = eval_job["options"].get("evaluation_tool_value", "gaia") - logging.info("TOOL: {}".format(evaluation_tool_selection)) + training_tool = eval_job["options"].get("training_tool", "gaia") try: snapshot = db.dataset.get_snapshot(eval_job["snapshot_id"]) @@ -65,13 +64,13 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): train, test = artistfilter.filter(eval_job["snapshot_id"], eval_job["options"]) db.dataset_eval.add_sets_to_job(eval_job["id"], train, test) - if evaluation_tool_selection == "gaia": + if training_tool == "gaia": logging.info("Generating filelist.yaml and copying low-level data for evaluation...") filelist_path = os.path.join(eval_location, "filelist.yaml") filelist = dump_lowlevel_data(train.keys(), temp_dir) with open(filelist_path, "w") as f: yaml.dump(filelist, f) - elif evaluation_tool_selection == "sklearn": + elif training_tool == "sklearn": dump_lowlevel_data_sklearn(train.keys(), temp_dir) logging.info("Generating groundtruth.yaml...") @@ -80,10 +79,10 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): # yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f, Dumper=yaml.SafeDumper) - if evaluation_tool_selection == "gaia": + if training_tool == "gaia": logging.info("Training GAIA model...") evaluate_gaia(eval_job["options"], eval_location, groundtruth_path, filelist_path, storage_dir, eval_job) - elif evaluation_tool_selection == "sklearn": + elif training_tool == "sklearn": logging.info("Training SKLEARN model...") evaluate_sklearn(options=eval_job["options"], eval_location=eval_location, diff --git a/db/dataset_eval.py b/db/dataset_eval.py index 16a9cb69f..f8723deb8 100644 --- a/db/dataset_eval.py +++ b/db/dataset_eval.py @@ -46,7 +46,7 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_values=None, - preprocessing_values=None, filter_type=None, evaluation_tool_value="gaia"): + preprocessing_values=None, filter_type=None, training_tool="gaia"): """Add dataset into evaluation queue. Args: @@ -67,7 +67,7 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_ filter_type: Optional filtering that will be applied to the dataset. See FILTER_* variables in this module for a list of existing filters. - evaluation_tool_value (optional): A string choice between two strings, gaia or sklearn. + training_tool (optional): The tool to use to train the model (gaia or sklearn) Raises: JobExistsException: if the dataset has already been submitted for evaluation @@ -92,7 +92,7 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_ validate_dataset_contents(db.dataset.get(dataset_id)) return _create_job(connection, dataset_id, normalize, eval_location, c_values, gamma_values, preprocessing_values, filter_type, - evaluation_tool_value) + training_tool) def job_exists(dataset_id): @@ -166,7 +166,7 @@ def validate_dataset_contents(dataset): ) -def get_next_pending_job(evaluation_tool_value="gaia"): +def get_next_pending_job(training_tool="gaia"): """ Get the earliest submitted job which is still in the pending state. 
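Read together with patch 10's docker-compose change, the rename gives each dataset-evaluator container its own queue: the gaia worker runs with MODEL_TRAINING_GAIA=1, the sklearn worker with MODEL_TRAINING_SKLEARN=1, and dataset_eval/evaluate.py turns that environment into the training_tool value matched by the query in the next hunk. Condensed (not a verbatim excerpt):

    # Condensed from dataset_eval/evaluate.py: pick the queue from the environment.
    import os

    eval_tool_use = "gaia"  # default queue
    if os.getenv("MODEL_TRAINING_SKLEARN") == "1":
        eval_tool_use = "sklearn"  # set in the sklearn worker container
    if os.getenv("MODEL_TRAINING_GAIA") == "1":
        eval_tool_use = "gaia"  # set in the gaia worker container

    # main() then polls only for matching jobs:
    # pending_job = db.dataset_eval.get_next_pending_job(eval_tool_use)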
@@ -181,11 +181,11 @@ def get_next_pending_job(evaluation_tool_value="gaia"): ON dataset_snapshot.id = dataset_eval_jobs.snapshot_id WHERE status = :status AND eval_location = 'local' - AND options->>'evaluation_tool_value' = :evaluation_tool_value + AND options->>'training_tool' = :training_tool ORDER BY created ASC LIMIT 1 """ % EVAL_COLUMNS_COMMA_SEPARATED) - result = connection.execute(query, {"status": STATUS_PENDING, "evaluation_tool_value": evaluation_tool_value}) + result = connection.execute(query, {"status": STATUS_PENDING, "training_tool": training_tool}) row = result.fetchone() return dict(row) if row else None @@ -333,7 +333,7 @@ def add_dataset_eval_set(connection, data): def _create_job(connection, dataset_id, normalize, eval_location, c_value, - gamma_value, preprocessing_values, filter_type, evaluation_tool_value): + gamma_value, preprocessing_values, filter_type, training_tool): if not isinstance(normalize, bool): raise ValueError("Argument 'normalize' must be a boolean.") if filter_type is not None: @@ -348,7 +348,7 @@ def _create_job(connection, dataset_id, normalize, eval_location, c_value, "c_values": c_value, "gamma_values": gamma_value, "preprocessing_values": preprocessing_values, - "evaluation_tool_value": evaluation_tool_value + "training_tool": training_tool } snapshot_id = db.dataset.create_snapshot(dataset_id) diff --git a/webserver/forms.py b/webserver/forms.py index 4b2b50dd3..eb8db147f 100644 --- a/webserver/forms.py +++ b/webserver/forms.py @@ -62,7 +62,7 @@ class DatasetEvaluationForm(FlaskForm): render_kw={"data-toggle": "collapse", "data-target": "#collapseSvmOptions"}) - evaluation_tool_value = SelectField("What tool do you want to use (sklearn/gaia)", choices=[ + training_tool = SelectField("Model training tool", choices=[ (DATASET_TOOL_EVALUATION_GAIA, "gaia"), (DATASET_TOOL_EVALUATION_SKLEARN, "sklearn")], default=DATASET_TOOL_EVALUATION_GAIA) diff --git a/webserver/templates/datasets/evaluate.html b/webserver/templates/datasets/evaluate.html index 3793915f1..bcaab9cde 100644 --- a/webserver/templates/datasets/evaluate.html +++ b/webserver/templates/datasets/evaluate.html @@ -56,6 +56,12 @@

Evaluate dataset "{{ dataset['name'] }}"

+ {% if config.get('FEATURE_EVAL_TOOL_SELECTION') %} +
+ +
{{ form.training_tool(class="form-control", required="required") }}
+
+ {% endif %}
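The expression index added above in admin/sql/create_indexes.sql matters because every idle worker re-issues the pending-job lookup each SLEEP_DURATION (30 seconds); without an index on options->>'training_tool', each poll would have to scan the jobs table to evaluate that filter. An illustrative way to inspect the plan from Python (the connection URL is a placeholder, not the project's real settings):

    # Illustrative only: show Postgres' plan for the worker's polling query.
    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql://user:pass@localhost/acousticbrainz")  # placeholder DSN
    with engine.connect() as connection:
        plan = connection.execute(text(
            "EXPLAIN SELECT id FROM dataset_eval_jobs "
            "WHERE status = 'pending' AND eval_location = 'local' "
            "AND options->>'training_tool' = :tool "
            "ORDER BY created ASC LIMIT 1"), {"tool": "sklearn"})
        for row in plan:
            print(row[0])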
@@ -85,16 +91,8 @@

Evaluate dataset "{{ dataset['name'] }}"

{{ form.preprocessing_values(required="required") }}
+ {% endif %} - {% if config.get('FEATURE_EVAL_TOOL_SELECTION') %} -
-
- -
{{ form.evaluation_tool_value(class="form-control", required="required") }}
-
-
- {% endif %} -
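The view change below completes the round trip: the form value travels into db.dataset_eval.evaluate_dataset(), which stores it in the job's options JSON, where the matching worker's poll later finds it. In outline (condensed from the surrounding diffs; the normalize and eval_location form fields are assumed names, the rest appear in this series):

    # Outline of the submission path after this patch (not a verbatim excerpt).
    job_id = db.dataset_eval.evaluate_dataset(
        dataset_id=ds["id"],
        normalize=form.normalize.data,                # assumed field name
        eval_location=form.evaluation_location.data,  # assumed field name
        c_values=c_values,
        gamma_values=gamma_values,
        preprocessing_values=preprocessing_values,
        filter_type=form.filter_type.data,
        training_tool=form.training_tool.data,
    )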
diff --git a/webserver/views/datasets.py b/webserver/views/datasets.py index 8f5ab45a7..9fb20e8c1 100644 --- a/webserver/views/datasets.py +++ b/webserver/views/datasets.py @@ -253,7 +253,7 @@ def evaluate(dataset_id): gamma_values=gamma_values, preprocessing_values=preprocessing_values, filter_type=form.filter_type.data, - evaluation_tool_value=form.evaluation_tool_value.data + training_tool=form.training_tool.data ) flash.info("Dataset %s has been added into evaluation queue." % ds["id"]) except db.dataset_eval.IncompleteDatasetException as e: From 5426688e836b2726c34c5bbd81f229bc3d11439e Mon Sep 17 00:00:00 2001 From: Alastair Porter Date: Thu, 24 Sep 2020 17:40:28 +0200 Subject: [PATCH 25/64] Fix failing tests --- db/test/test_dataset_eval.py | 42 ++++++++++++++++++------------------ webserver/views/datasets.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/db/test/test_dataset_eval.py b/db/test/test_dataset_eval.py index 794bc4f87..20eb6412a 100644 --- a/db/test/test_dataset_eval.py +++ b/db/test/test_dataset_eval.py @@ -87,7 +87,7 @@ def test_create_job_nonormalize(self): # No dataset normalization job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, False, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -98,7 +98,7 @@ def test_create_job_normalize(self): # dataset normalization job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -109,7 +109,7 @@ def test_create_job_artistfilter(self): # Artist filtering as an option job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, False, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=dataset_eval.FILTER_ARTIST) + filter_type=dataset_eval.FILTER_ARTIST, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -120,7 +120,7 @@ def test_create_job_svm_params(self): # C, gamma, and preprocessing values job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=dataset_eval.FILTER_ARTIST) + filter_type=dataset_eval.FILTER_ARTIST, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -134,27 +134,27 @@ def test_create_job_badfilter(self): with self.assertRaises(ValueError): dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type="test") + filter_type="test", training_tool="gaia") def test_create_job_badlocation(self): # an invalid eval_location with self.assertRaises(ValueError): dataset_eval._create_job(self.conn, self.test_dataset_id, True, "not_a_location", c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") def test_job_exists(self): self.assertFalse(dataset_eval.job_exists(self.test_dataset_id)) dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], 
preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") self.assertTrue(dataset_eval.job_exists(self.test_dataset_id)) def test_get_job(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") random_id = "f47ac10b-58cc-4372-a567-0e02b2c3d479" # just in case self.assertNotEqual(random_id, job_id) @@ -164,7 +164,7 @@ def test_get_job(self): def test_set_job_result(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") result = { u"accuracy": 1, @@ -182,7 +182,7 @@ def test_set_job_result(self): def test_set_job_status(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertEqual(job["status"], dataset_eval.STATUS_PENDING) @@ -196,12 +196,12 @@ def test_set_job_status(self): def test_get_next_pending_job(self): job1_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job1 = dataset_eval.get_job(job1_id) job2_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job2 = dataset_eval.get_job(job2_id) next_pending = dataset_eval.get_next_pending_job() @@ -218,12 +218,12 @@ def test_get_next_pending_job_remote(self): # If we have a remote pending job with the most recent timestamp, skip it job1_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job1 = dataset_eval.get_job(job1_id) job2_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job2 = dataset_eval.get_job(job2_id) next_pending = dataset_eval.get_next_pending_job() @@ -235,7 +235,7 @@ def test_delete_job(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") snapshots = dataset.get_snapshots_for_dataset(self.test_dataset_id) self.assertEqual(len(snapshots), 1) self.assertIsNotNone(dataset_eval.get_job(job_id)) @@ -247,13 +247,13 @@ def test_delete_job(self): def test_eval_job_location(self): job1_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job1 = dataset_eval.get_job(job1_id) self.assertEqual(job1["eval_location"], dataset_eval.EVAL_REMOTE) job2_id 
= dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job2 = dataset_eval.get_job(job2_id) self.assertEqual(job2["eval_location"], dataset_eval.EVAL_LOCAL) @@ -262,7 +262,7 @@ def test_get_remote_pending_jobs_for_user(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job_details = db.dataset_eval.get_job(job_id) response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) @@ -277,7 +277,7 @@ def test_get_pending_jobs_for_user_local(self): """ Check that a local eval dataset for this user doesn't show """ job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job_details = db.dataset_eval.get_job(job_id) response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) @@ -290,7 +290,7 @@ def test_get_pending_jobs_for_user_other_user(self): another_dataset_id = dataset.create_from_dict(self.test_data, author_id=another_user_id) job_id = dataset_eval._create_job(self.conn, another_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) self.assertEqual(response, []) @@ -299,7 +299,7 @@ def test_get_pending_jobs_for_user_done(self): """ Check that a remote eval job with a done status doesn't show """ job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_DONE) response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) diff --git a/webserver/views/datasets.py b/webserver/views/datasets.py index 9fb20e8c1..a85ebebda 100644 --- a/webserver/views/datasets.py +++ b/webserver/views/datasets.py @@ -127,7 +127,7 @@ def _convert_dataset_to_csv_stringio(dataset): # - dataset description, class names, class descriptions # TODO: On upgrade to python 3, check that stringio accepts the correct data # (may have to change to bytesio if we encode this data) - fp = StringIO.StringIO() + fp = StringIO() writer = csv.writer(fp) # write dataset description only if it is set From 8e083f8232c665dee3cd18fcf305b0159fee82ed Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Thu, 1 Oct 2020 18:55:22 +0300 Subject: [PATCH 26/64] change the location where the .json low-level data is saved to --- dataset_eval/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 1ff9acb12..fe91c80af 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -71,7 +71,7 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): with open(filelist_path, "w") as f: yaml.dump(filelist, f) elif training_tool == "sklearn": - dump_lowlevel_data_sklearn(train.keys(), temp_dir) + dump_lowlevel_data_sklearn(train.keys(), dataset_dir) 
logging.info("Generating groundtruth.yaml...") groundtruth_path = os.path.join(eval_location, "groundtruth.yaml") @@ -222,6 +222,7 @@ def dump_lowlevel_data_sklearn(recordings, location): json.dump(lowlevel_data_cleaning(db.data.load_low_level(recording)), outfile) logging.info("JSON data stored successfully.") + def lowlevel_data_cleaning(data): """Prepares dictionary with low-level data about recording for processing. """ From 053cd113c09a9e4e057580f2be4a0a78f496e398 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Tue, 1 Dec 2020 08:59:07 +0200 Subject: [PATCH 27/64] add results params and accuracy, add dataset eval in sklearn cm results --- dataset_eval/evaluate.py | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index fe91c80af..f18a74970 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -140,6 +140,49 @@ def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job) preprocessing_values=options.get("preprocessing_values", []) ) + logging.info("Saving results...") + results = load_best_results_sklearn(exported_path=eval_location, project_file=eval_job["id"]) + db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ + "project_path": eval_location, + "parameters": results["parameters"], + "accuracy": results["accuracy"], + "confusion_matrix": results["confusion_matrix"], + "history_path": results["history_path"], + })) + + +def load_best_results_sklearn(exported_path, project_file): + project_conf_file_path = os.path.join(exported_path, f"{project_file}.yaml") + logging.info(f"Config file path: {project_conf_file_path}") + with open(project_conf_file_path) as fp: + project_data = yaml.load(fp, Loader=yaml.FullLoader) + logging.info(f"Model: {project_data['class_name']}") + + # load the best model dictionary + best_model_path = os.path.join(exported_path, project_file, f"best_model_{project_data['class_name']}.json") + logging.info(f"Best model path: {best_model_path}") + with open(best_model_path) as json_file: + data_best_model = json.load(json_file) + + # load the best model's instances and matrix dictionary + fold_matrix_path = os.path.join(exported_path, project_file, "folded_dataset_instances_cm.json") + logging.info(f"Best Instances and Matrix JSON path: {fold_matrix_path}") + with open(fold_matrix_path) as json_file_cm: + data_fold_matrix = json.load(json_file_cm) + + # load the best model's simplified matrix dictionary + # fold_simplified_matrix_path = os.path.join(exported_path, project_file, "folded_simplified_matrix.json") + # logging.info(f"Best models simplified matrix JSON path: {fold_simplified_matrix_path}") + # with open(fold_simplified_matrix_path) as json_file_simple_cm: + # data_fold_simplified_matrix = json.load(json_file_simple_cm) + + return { + "parameters": data_best_model["params"], + "accuracy": round(data_best_model["score"], 2), + # "confusion_matrix": data_fold_simplified_matrix, + "history_path": "Does not exist because of sklearn training usage" + } + def create_groundtruth_dict(name, datadict): groundtruth = { From 86ac4e0256e9c8a5e8a2fc4db9d5c0a8660b0a85 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Tue, 1 Dec 2020 09:29:27 +0200 Subject: [PATCH 28/64] export simplified cm in sklearn model --- dataset_eval/evaluate.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index f18a74970..4b04bbde0 100644 --- 
a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -22,6 +22,7 @@ is_sklearn = os.getenv("MODEL_TRAINING_SKLEARN") if is_sklearn == "1": from models.sklearn.model.classification_project import create_classification_project + from models.sklearn.classification.matrix_creation import simplified_matrix_export eval_tool_use = "sklearn" is_gaia = os.getenv("MODEL_TRAINING_GAIA") @@ -141,7 +142,9 @@ def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job) ) logging.info("Saving results...") - results = load_best_results_sklearn(exported_path=eval_location, project_file=eval_job["id"]) + results = load_best_results_sklearn(exported_path=eval_location, + project_file=eval_job["id"], + exports_directory=eval_job["id"]) db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ "project_path": eval_location, "parameters": results["parameters"], @@ -151,7 +154,7 @@ def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job) })) -def load_best_results_sklearn(exported_path, project_file): +def load_best_results_sklearn(exported_path, project_file, exports_directory): project_conf_file_path = os.path.join(exported_path, f"{project_file}.yaml") logging.info(f"Config file path: {project_conf_file_path}") with open(project_conf_file_path) as fp: @@ -176,10 +179,18 @@ def load_best_results_sklearn(exported_path, project_file): # with open(fold_simplified_matrix_path) as json_file_simple_cm: # data_fold_simplified_matrix = json.load(json_file_simple_cm) + # export the matrix dictionary from the folded dataset + folded_results_matrix_path = os.path.join(exported_path, exports_directory) + simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", + logger=logging, + export_save_path=folded_results_matrix_path, + export_name="simplified_cm.json", + write_mode=False) + return { "parameters": data_best_model["params"], "accuracy": round(data_best_model["score"], 2), - # "confusion_matrix": data_fold_simplified_matrix, + "confusion_matrix": simplified_cm, "history_path": "Does not exist because of sklearn training usage" } From b4dcbb9afc4441aa0da65aedfee139eda8f4c9f3 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Tue, 1 Dec 2020 10:44:33 +0200 Subject: [PATCH 29/64] add CM simplified export and best model's results to dataset_eval --- dataset_eval/evaluate.py | 12 +- .../classification/confusion_matrix_export.py | 245 ++++++++++++++++++ models/sklearn/classification/evaluation.py | 189 ++++++++++++-- .../sklearn/classification/matrix_creation.py | 73 ++++++ 4 files changed, 489 insertions(+), 30 deletions(-) create mode 100644 models/sklearn/classification/confusion_matrix_export.py create mode 100644 models/sklearn/classification/matrix_creation.py diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 4b04bbde0..22c9cdf9e 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -155,21 +155,21 @@ def evaluate_sklearn(options, eval_location, dataset_dir, storage_dir, eval_job) def load_best_results_sklearn(exported_path, project_file, exports_directory): - project_conf_file_path = os.path.join(exported_path, f"{project_file}.yaml") - logging.info(f"Config file path: {project_conf_file_path}") + project_conf_file_path = os.path.join(exported_path, "{}.yaml".format(project_file)) + logging.info("Config file path: {}".format(project_conf_file_path)) with open(project_conf_file_path) as fp: project_data = yaml.load(fp, Loader=yaml.FullLoader) - logging.info(f"Model: 
{project_data['class_name']}") + logging.info("Model: {}".format(project_data['class_name'])) # load the best model dictionary - best_model_path = os.path.join(exported_path, project_file, f"best_model_{project_data['class_name']}.json") - logging.info(f"Best model path: {best_model_path}") + best_model_path = os.path.join(exported_path, project_file, "best_model_{}.json".format(project_data['class_name'])) + logging.info("Best model path: {}".format(best_model_path)) with open(best_model_path) as json_file: data_best_model = json.load(json_file) # load the best model's instances and matrix dictionary fold_matrix_path = os.path.join(exported_path, project_file, "folded_dataset_instances_cm.json") - logging.info(f"Best Instances and Matrix JSON path: {fold_matrix_path}") + logging.info("Best Instances and Matrix JSON path: {}".format(fold_matrix_path)) with open(fold_matrix_path) as json_file_cm: data_fold_matrix = json.load(json_file_cm) diff --git a/models/sklearn/classification/confusion_matrix_export.py b/models/sklearn/classification/confusion_matrix_export.py new file mode 100644 index 000000000..3822e4403 --- /dev/null +++ b/models/sklearn/classification/confusion_matrix_export.py @@ -0,0 +1,245 @@ +# encoding: utf-8 +from collections import defaultdict +import json +from math import sqrt + + +class ConfusionMatrixCreation: + + def __init__(self): + self.matrix = defaultdict(lambda: defaultdict(list)) + self.folds = dict() + + def load(self, filename): + with open(filename) as f: + data = json.load(f) + # print(data) + + # convert to a defaultdict the data we just loaded + self.matrix = defaultdict(lambda: defaultdict(list)) + for k, v in data['matrix'].items(): + self.matrix[k] = defaultdict(list, v) + # print(self.matrix[k]) + + if "fold" in data: + self.folds = data['fold'] + + def save(self, filename): + # convert to "normal" dicts before saving + data = { + 'matrix': dict((k, dict(v)) for k, v in self.matrix.items()), + 'fold': self.folds + } + # with open(filename, 'w') as f: + # yaml.dump(data, f) + + with open(filename, 'w') as f: + json.dump(data, f) + + def add(self, expected, predicted, name=''): + self.matrix[expected][predicted] += [name] + + def addNfold(self, expected, predicted, name, nfold): + self.matrix[expected][predicted] += [name] + self.folds[name] = nfold + + def matrixNfold(self, nfold): + nfoldDict = defaultdict(lambda: defaultdict(list)) + for c in self.matrix: + for d in self.matrix[c]: + for e in self.matrix[c][d]: + if self.folds[e] == nfold: + nfoldDict[c][d].append(e) + return nfoldDict + + def stdNfold(self, normalizedAccuracies=False): + """Return standard deviation of the accuracies across folds.""" + + if normalizedAccuracies: + accuracies = self.normalizedAccuraciesNFold() + else: + accuracies = self.accuraciesNFold() + + # TODO the following lines compute standard deviation. 
In + # the future we can use stdev method from the statistics + # package, shipped by default since Python 3.4 + acc_mean = sum(accuracies) / len(accuracies) + + return sqrt(sum([(x - acc_mean) * (x - acc_mean) + for x in accuracies]) / len(accuracies)) + + def classes(self): + allClasses = set() + + for c in self.matrix: + allClasses.add(c) + for d in self.matrix[c]: + allClasses.add(d) + + return allClasses + + def total(self): + """Return the total number of classification instances.""" + result = 0 + for c in self.matrix: + for d in self.matrix[c]: + result += len(self.matrix[c][d]) + return result + + def totalNfold(self, fold): + """Return the total number of classification instances for a given fold.""" + matrix = self.matrixNfold(fold) + result = 0 + for c in matrix: + for d in matrix[c]: + result += len(matrix[c][d]) + return result + + def correct(self): + """Return the number of correctly classified instances.""" + result = 0 + for c in self.matrix: + result += len(self.matrix[c][c]) + return result + + def correctNfold(self, fold): + """Return the number of correctly classified instances for a given fold.""" + matrix = self.matrixNfold(fold) + result = 0 + for c in matrix: + result += len(matrix[c][c]) + return result + + def toDict(self): + """Format nicely the confusion matrix as normal dict, replace list of + instances by number of them.""" + allClasses = self.classes() + + # build resulting dict + result = {} + for c in allClasses: + result[c] = {} + for d in allClasses: + result[c][d] = len(self.matrix[c][d]) + + return result + + def results(self): + good = self.correct() + total = self.total() + return 'Correctly classified: %d out of %d (%d%%)' % (good, total, 100*good//total) + + def accuraciesNFold(self): + '''Return accuracies per fold.''' + folds = set(self.folds.values()) + + if not bool(folds): + raise('This matrix does not contain information about folds') + + return [self.correctNfold(f) * 100. / self.totalNfold(f) + for f in folds] + + def normalizedAccuraciesNFold(self): + '''Returns the normalized accuracy.''' + folds = set(self.folds.values()) + + if not bool(folds): + raise('This matrix does not contain information about folds') + + foldAccuracies = [] + + for f in folds: + classAccuracies = [] + matrix = self.matrixNfold(f) + + for c in matrix: + classElements = 0 + for e in matrix[c]: + classElements += len(matrix[c][e]) + + classAccuracies.append(len(matrix[c][c]) * 100. / classElements) + + foldAccuracies.append(sum(classAccuracies) / len(classAccuracies)) + + return foldAccuracies + + def accuracy(self): + accuracies = self.accuraciesNFold() + return sum(accuracies) / len(accuracies) + + def normalizedAccuracy(self): + accuracies = self.normalizedAccuraciesNFold() + return sum(accuracies) / len(accuracies) + + def toHtml(self, standAlone = True, embedStyleSheet = True): + html = '' + html += '' + html += '' + html += '' + html += '' + html += '' + html += '' + html += '' + html += '' + html += '

Predicted (%)

' + html += '' + + html += '' + + labels = sorted(self.classes()) + for predicted in labels: + html += '' + + html += '' + html += '' + html += '' + + for actual in labels: + html += '' + html += '' + + classInstances = 0 + for predicted in self.matrix[actual].values(): + classInstances += len(predicted) + + proportion = 100.0 * classInstances / self.total() + + for predicted in labels: + correct = len(self.matrix[actual][predicted]) + if classInstances: + percentage = correct * 100.0 / classInstances + else: + percentage = 0 + + if actual == predicted: + if percentage > 0: + html += '' % (percentage, correct, actual, classInstances, predicted) + html += '' + html += '" + html += '' + html += '
'+predicted+'Proportion
' + actual + '' + else: + html += '' + else: + if percentage >= 10: # 10%, hard-coded value + html += '' + else: + html += '' + + html += '%.2f %d %s (out of %d) classified as %s' + actual + '' + "%.2f" % proportion + " %

Actual (%)

' + html += '
' + + + if standAlone: + if embedStyleSheet: + html = ''' + + + ''' + html + '' + else: + html = ''' + + + ''' + html + '' + + return html diff --git a/models/sklearn/classification/evaluation.py b/models/sklearn/classification/evaluation.py index 082baef99..299fe73b8 100644 --- a/models/sklearn/classification/evaluation.py +++ b/models/sklearn/classification/evaluation.py @@ -15,6 +15,7 @@ from ..transformation.transform import Transform from ..classification.report_files_export import export_report from ..helper_functions.logging_tool import LoggerSetup +from ..classification.matrix_creation import matrix_creation, simplified_matrix_export def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, log_level): @@ -42,10 +43,10 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, images_path = FindCreateDirectory(exports_path, os.path.join(exports_dir, "images")).inspect_directory() - # load best model - load_model_params_path = os.path.join(exports_path, exports_dir, "best_model_{}.json".format(class_name)) - with open(load_model_params_path) as model_params_file: - model_params_data = json.load(model_params_file) + # load best model params and score data + load_best_model_params_score_path = os.path.join(exports_path, exports_dir, "best_model_{}.json".format(class_name)) + with open(load_best_model_params_score_path) as model_params_score_file: + best_params_score_data = json.load(model_params_score_file) logger.info("Best model preprocessing step: {}".format(process)) # load the saved classifier @@ -89,7 +90,24 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, dataset_path=dataset_path, logger=logger) - # ACCURACIES in each fold + logger.debug("PRINT THE WHOLE GESTURES DF:\n{}".format(df_predictions)) + + # list of each column from the dataframe for the folded indexed tracks, y, adn predictions + tracks_folded_list = df_predictions["track"].to_list() + y_folded_list = df_predictions[class_name].to_list() + pred_folded_list = df_predictions["predictions"].to_list() + + # export the matrix dictionary from the folded dataset + folded_results_matrix_path = os.path.join(exports_path, exports_dir) + folded_matrix_dict = matrix_creation(classes=clf.classes_, + tracks=tracks_folded_list, + y_actual=y_folded_list, + y_hat=pred_folded_list, + logger=logger, + export_save_path=folded_results_matrix_path, + export_name="folded_dataset_results_matrix.json") + + # ACCURACIES for each fold export_accuracies(accuracy_model=accuracy_model, config=config, class_name=class_name, @@ -97,11 +115,26 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, images_path=images_path, logger=logger) - # Folded Tracks Dictionary - export_folded_instances(tracks_fold_indexing_dict=tracks_fold_indexing_dict, - class_name=class_name, - dataset_path=dataset_path, - logger=logger) + # Folded Tracks Dictionary --> export also the Folded instances dictionary + folded_instances_dict = export_folded_instances(tracks_fold_indexing_dict=tracks_fold_indexing_dict, + class_name=class_name, + dataset_path=dataset_path, + logger=logger) + + concat_save_model_instances_matrix_json(instances_dict=folded_instances_dict, + cm_dict=folded_matrix_dict, + exports_path=exports_path, + exports_dir=exports_dir, + logger=logger, + export_name="folded_dataset_instances_cm.json") + + simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", + logger=logger, + 
# Evaluation to the folded Dataset export_evaluation_results(config=config, @@ -113,7 +146,7 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, logger=logger ) - # Train to the whole dataset + # ---------- TRAIN TO THE WHOLE DATASET WITH THE BEST CLASSIFIER ---------- logger.info("Train the classifier with the whole dataset..") clf.fit(features_prepared, y) # prediction for the whole dataset @@ -122,6 +155,29 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, best_model_path = os.path.join(exports_path, exports_dir, "best_clf_model.pkl") joblib.dump(clf, best_model_path) logger.info("Best model saved.") + + # export the matrix dictionary from the whole dataset + whole_results_matrix_path = os.path.join(exports_path, exports_dir) + whole_matrix_dict = matrix_creation(classes=clf.classes_, + tracks=tracks, + y_actual=y, + y_hat=predictions_all, + logger=logger, + export_save_path=whole_results_matrix_path, + export_name="whole_dataset_results_matrix.json") + + matrix_export(best_result_file="whole_dataset_results_matrix.json", + logger=logger, + export_save_path=whole_results_matrix_path, + export_name="whole_dataset_cm_dict.json") + + concat_save_model_instances_matrix_json(instances_dict=None, + cm_dict=whole_matrix_dict, + exports_path=exports_path, + exports_dir=exports_dir, + logger=logger, + export_name="whole_dataset_instances_cm.json") + # Evaluation to the whole Dataset export_evaluation_results(config=config, set_name="Whole", @@ -133,7 +189,56 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, ) +def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, exports_dir, logger, export_name): + """ + Save the best model's folded instances and confusion matrix dictionary merged into one dictionary. + + Args: + instances_dict: the folded instances dictionary (None when exporting for the whole dataset) + cm_dict: the confusion matrix dictionary + exports_path: path to the exports directory + exports_dir: name of the directory inside the exports path + logger: the logger object + export_name: file name of the merged JSON export + + Returns: + None. The merged dictionary is written to disk as JSON. + """ + best_folds_cm_merge_dict_path = os.path.join(exports_path, exports_dir) + + if instances_dict: + # in case of the folded dataset where folds exist + best_folds_cm_merge_dict = {**instances_dict, **cm_dict} + else: + # in case of the whole dataset where no folds exist + best_folds_cm_merge_dict = cm_dict + + # Serializing json + json_object_folds_cm = json.dumps(best_folds_cm_merge_dict, indent=4) + # Writing to json + load_file_path = os.path.join(best_folds_cm_merge_dict_path, export_name) + with open(load_file_path, "w") as outfile: + outfile.write(json_object_folds_cm) + logger.info("Merged instances and matrix dictionary stored successfully.") + + def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logger): + """ + Run the cross-validated predictions over each fold. + + Args: + clf: the classifier model object + inner_cv: the KFold object + feats_prepared: the features after the preprocessing/transformation steps + y: the true values + tracks: the list of the dataset's tracks + class_name: the name of the class (target) + logger: the logger object + + Returns: + tracks_fold_indexing_dict: dictionary of which tracks were tested in which fold + accuracy_model: list with the accuracy of each fold + predictions_df_list: list of the per-fold predictions DataFrames + """ tracks_fold_indexing_dict = {} accuracy_model = [] predictions_df_list = [] @@ -158,14 +263,14 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logge clf.fit(X_train, y_train) logger.debug("Classifier classes: {}".format(clf.classes_)) # create a df for this fold with the predictions - df_pred_general = fold_predictions(clf=clf, - class_name=class_name, -
X_test=X_test, - test_index=test_index, - tracks_list=tracks_list, - y_test=y_test, - logger=logger) - # Append the folded dataset to a list that will contain all the folded datasets: + df_pred_general = create_fold_predictions(clf=clf, + class_name=class_name, + X_test=X_test, + test_index=test_index, + tracks_list=tracks_list, + y_test=y_test, + logger=logger) + # Append the folded dataset to a list that will contain all the folded datasets predictions_df_list.append(df_pred_general) # Append each accuracy of the folded model to a list that contains all the accuracies resulted from each fold accuracy_model.append(accuracy_score(y_test, clf.predict(X_test), normalize=True) * 100) @@ -174,7 +279,7 @@ def predictions_fold(clf, inner_cv, feats_prepared, y, tracks, class_name, logge return predictions_df_list, accuracy_model, tracks_fold_indexing_dict -def fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test, logger): +def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test, logger): """ Creates a pandas DataFrame from each fold with the predictions in order later to extract the shuffled dataset with the tracks, the percentage @@ -215,6 +320,19 @@ def export_accuracies(accuracy_model, config, class_name, exports_path, images_path, logger): + """ + Export the fold accuracies and their distribution plot. + + Args: + accuracy_model: list with the accuracy of each fold + config: the configuration dictionary + class_name: the name of the class (target) + exports_path: path to the exports directory + images_path: path to the directory where the images are exported + logger: the logger object + """ logger.info("Accuracies in each fold: {}".format(accuracy_model)) logger.info("Mean of accuracies: {}".format(np.mean(accuracy_model))) logger.info("Standard Deviation of accuracies: {}".format(np.std(accuracy_model))) @@ -234,6 +352,16 @@ def create_dataset_predictions(list_df_predictions, class_name, dataset_path, logger): + """ + Concatenate the per-fold predictions DataFrames into a single DataFrame. + + Args: + list_df_predictions: list of the per-fold predictions DataFrames + class_name: the name of the class (target) + dataset_path: path to the dataset directory + logger: the logger object + + Returns: + The concatenated DataFrame with the predictions from all folds. + """ logger.info("Make Predictions DataFrame for all the folded instances together.") df_concat_predictions = pd.concat(list_df_predictions) logger.debug("\n{}".format(df_concat_predictions.head())) @@ -266,10 +394,23 @@ def create_accuracies_dist_plot(accuracies_list, images_path, logger): def export_folded_instances(tracks_fold_indexing_dict, class_name, dataset_path, logger): logger.info("Writing Folded Tracks Dictionary locally to check where each track is folded..") logger.debug("length of keys: {}".format(len(tracks_fold_indexing_dict.keys()))) - folded_dataset_path = os.path.join(dataset_path, "{}.yaml".format(class_name)) - with open(folded_dataset_path, 'w') as file: - folded_dataset = yaml.dump(tracks_fold_indexing_dict, file) - logger.info("Folded dataset written successfully to disk.") + fold_dict = {"fold": tracks_fold_indexing_dict} + + # writing to yaml + folded_dataset_path_yml = os.path.join(dataset_path, "{}.yaml".format(class_name)) + with open(folded_dataset_path_yml, 'w') as file: + yaml.dump(fold_dict, file) + + # Serializing json + json_object = json.dumps(fold_dict, indent=4) + # Writing to json + folded_dataset_path_json = os.path.join(dataset_path, "{}.json".format(class_name)) + with open(folded_dataset_path_json, "w") as outfile: + outfile.write(json_object) + + logger.info("Folded dataset written successfully to disk both in yaml and json format.") + + return fold_dict def export_evaluation_results(config, set_name, y_true_values, predictions, class_name, exports_path, logger):
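A minimal sketch of the fold exports added above; the fold keys and track IDs are assumptions, since the hunks do not show how tracks_fold_indexing_dict is keyed:

    # Illustrative only -- fold keys and track IDs are made up.
    # {class_name}.yaml / {class_name}.json, as written by export_folded_instances():
    fold_dict = {
        "fold": {
            "fold_0": ["track-001", "track-013"],  # tracks evaluated in fold 0
            "fold_1": ["track-007", "track-042"],
        }
    }

    # folded_dataset_instances_cm.json, as written by
    # concat_save_model_instances_matrix_json(): the fold dictionary and the
    # confusion-matrix dictionary merged at the top level.
    merged = {
        "fold": fold_dict["fold"],
        "matrix": {"danceable": {"danceable": ["track-001"], "not_danceable": ["track-042"]}},
    }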
diff --git a/models/sklearn/classification/matrix_creation.py b/models/sklearn/classification/matrix_creation.py new file mode 100644 index 000000000..8732a9f9f --- /dev/null +++ b/models/sklearn/classification/matrix_creation.py @@ -0,0 +1,73 @@ +import os +import json +import numpy as np +from ..classification.confusion_matrix_export import ConfusionMatrixCreation + + +def matrix_creation(classes, tracks, y_actual, y_hat, logger, export_save_path, export_name): + logger.info("MATRIX DICTIONARY CREATION") + # classes numpy array to list conversion + logger.info("CLASSES BEFORE CONVERSION: {}".format(type(classes))) + classes = classes.tolist() + logger.info("CLASSES AFTER CONVERSION: {}".format(type(classes))) + logger.info("CLASSES: {}".format(classes)) + matrix_dict = {} + for pred_class in classes: + logger.info("Class process: {}".format(pred_class)) + class_item_dict = {} + for track, actual, pred in zip(tracks, y_actual, y_hat): + # numpy integer labels are cast to plain ints so they serialize to JSON + if isinstance(actual, (int, np.int64)): + actual = int(actual) + if isinstance(pred, (int, np.int64)): + pred = int(pred) + if pred_class == actual == pred: + if actual not in class_item_dict: + class_item_dict[actual] = [] + class_item_dict[actual].append(track) + elif pred_class == actual and actual != pred: + if pred not in class_item_dict: + class_item_dict[pred] = [] + class_item_dict[pred].append(track) + matrix_dict[pred_class] = class_item_dict + logger.info("Matrix classified..") + matrix_general_dict = {"matrix": matrix_dict} + logger.debug("The whole matrix dictionary:\n{}".format(matrix_general_dict)) + + # Serializing json + json_object = json.dumps(matrix_general_dict, indent=4) + # Writing to the export file + load_file_path = os.path.join(export_save_path, export_name) + with open(load_file_path, "w") as outfile: + outfile.write(json_object) + logger.info("Best results matrix stored successfully.") + + return matrix_general_dict + + +def simplified_matrix_export(best_result_file, logger, export_save_path, export_name, write_mode=False): + cm = ConfusionMatrixCreation() + load_file_path = os.path.join(export_save_path, best_result_file) + # best model data load from JSON + logger.info("load best model results from JSON format file") + cm.load(load_file_path) + logger.info("Best model results loaded..") + simplified_cm = {} + for key, val in cm.matrix.items(): + simplified_cm[key] = {} + for predicted_key, predicted_val in val.items(): + simplified_cm[key][predicted_key] = len(predicted_val) + # export simplified matrix to JSON file + if write_mode is True: + # Serializing json + json_object = json.dumps(simplified_cm, indent=4) + # Writing to the export file + load_file_path = os.path.join(export_save_path, export_name) + with open(load_file_path, "w") as outfile: + outfile.write(json_object) + logger.info("Best simplified matrix stored successfully.") + + return simplified_cm From 5198443880a9ce1f3aad0252f9c61e89e9156653 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Tue, 1 Dec 2020 10:46:56 +0200 Subject: [PATCH 30/64] delete unnecessary functionalities from load_best_results_sklearn() --- dataset_eval/evaluate.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 22c9cdf9e..42e33cea2 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -167,18 +167,6 @@ def load_best_results_sklearn(exported_path, project_file,
exports_directory): with open(best_model_path) as json_file: data_best_model = json.load(json_file) - # load the best model's instances and matrix dictionary - fold_matrix_path = os.path.join(exported_path, project_file, "folded_dataset_instances_cm.json") - logging.info("Best Instances and Matrix JSON path: {}".format(fold_matrix_path)) - with open(fold_matrix_path) as json_file_cm: - data_fold_matrix = json.load(json_file_cm) - - # load the best model's simplified matrix dictionary - # fold_simplified_matrix_path = os.path.join(exported_path, project_file, "folded_simplified_matrix.json") - # logging.info(f"Best models simplified matrix JSON path: {fold_simplified_matrix_path}") - # with open(fold_simplified_matrix_path) as json_file_simple_cm: - # data_fold_simplified_matrix = json.load(json_file_simple_cm) - # export the matrix dictionary from the folded dataset folded_results_matrix_path = os.path.join(exported_path, exports_directory) simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", From bc9a247932532fbc31aeda8075b4833a7947a1c6 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Tue, 1 Dec 2020 10:57:04 +0200 Subject: [PATCH 31/64] previous setup to check --- dataset_eval/evaluate.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 42e33cea2..22c9cdf9e 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -167,6 +167,18 @@ def load_best_results_sklearn(exported_path, project_file, exports_directory): with open(best_model_path) as json_file: data_best_model = json.load(json_file) + # load the best model's instances and matrix dictionary + fold_matrix_path = os.path.join(exported_path, project_file, "folded_dataset_instances_cm.json") + logging.info("Best Instances and Matrix JSON path: {}".format(fold_matrix_path)) + with open(fold_matrix_path) as json_file_cm: + data_fold_matrix = json.load(json_file_cm) + + # load the best model's simplified matrix dictionary + # fold_simplified_matrix_path = os.path.join(exported_path, project_file, "folded_simplified_matrix.json") + # logging.info(f"Best models simplified matrix JSON path: {fold_simplified_matrix_path}") + # with open(fold_simplified_matrix_path) as json_file_simple_cm: + # data_fold_simplified_matrix = json.load(json_file_simple_cm) + # export the matrix dictionary from the folded dataset folded_results_matrix_path = os.path.join(exported_path, exports_directory) simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", From 72e1d7ff48639eb2aa5bba0b33522eb527562ad4 Mon Sep 17 00:00:00 2001 From: Pantelis Tzamalis Date: Tue, 1 Dec 2020 23:27:16 +0200 Subject: [PATCH 32/64] simplified_matrix_export() reference correction --- models/sklearn/classification/evaluation.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/models/sklearn/classification/evaluation.py b/models/sklearn/classification/evaluation.py index 299fe73b8..3cc78881c 100644 --- a/models/sklearn/classification/evaluation.py +++ b/models/sklearn/classification/evaluation.py @@ -166,10 +166,13 @@ def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, export_save_path=whole_results_matrix_path, export_name="whole_dataset_results_matrix.json") - matrix_export(best_result_file="whole_dataset_results_matrix.json", - logger=logger, - export_save_path=whole_results_matrix_path, - export_name="whole_dataset_cm_dict.json") + simplified_cm_whole = 
simplified_matrix_export(best_result_file="whole_dataset_results_matrix.json", + logger=logger, + export_save_path=whole_results_matrix_path, + export_name="whole_dataset_cm_dict.json", + write_mode=True) + + logger.info("Simplified CM of the evaluated whole dataset:\n{}".format(simplified_cm_whole)) concat_save_model_instances_matrix_json(instances_dict=None, cm_dict=whole_matrix_dict, From 62ab8ec76fc08f815904cc9bcd03b3a2caa999c1 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Tue, 29 Jun 2021 17:13:45 +0530 Subject: [PATCH 33/64] Move models to acousticbrainz package --- {models => acousticbrainz}/__init__.py | 0 {models/sklearn => acousticbrainz/models}/__init__.py | 0 {models => acousticbrainz/models}/sklearn/README.md | 0 .../classification => acousticbrainz/models/sklearn}/__init__.py | 0 .../models/sklearn/classification}/__init__.py | 0 .../models}/sklearn/classification/classification_task.py | 0 .../sklearn/classification/classification_task_manager.py | 0 .../models}/sklearn/classification/classifier_basic.py | 0 .../models}/sklearn/classification/classifier_grid.py | 0 .../models}/sklearn/classification/confusion_matrix_export.py | 0 .../models}/sklearn/classification/evaluation.py | 0 .../models}/sklearn/classification/matrix_creation.py | 0 .../models}/sklearn/classification/report_files_export.py | 0 .../models}/sklearn/classification/train_class.py | 0 .../models/sklearn/helper_functions}/__init__.py | 0 .../models}/sklearn/helper_functions/logging_tool.py | 0 .../models}/sklearn/helper_functions/utils.py | 0 .../models/sklearn/model}/__init__.py | 0 .../models}/sklearn/model/classification_project.py | 0 .../models}/sklearn/model/configuration_template.yaml | 0 {models => acousticbrainz/models}/sklearn/model/predict.py | 0 {models => acousticbrainz/models}/sklearn/requirements.txt | 0 acousticbrainz/models/sklearn/transformation/__init__.py | 1 + .../models}/sklearn/transformation/load_ground_truth.py | 0 .../models}/sklearn/transformation/load_low_level.py | 0 .../models}/sklearn/transformation/transform.py | 0 .../models}/sklearn/transformation/transform_predictions.py | 0 .../models}/sklearn/transformation/utils_preprocessing.py | 0 28 files changed, 1 insertion(+) rename {models => acousticbrainz}/__init__.py (100%) rename {models/sklearn => acousticbrainz/models}/__init__.py (100%) rename {models => acousticbrainz/models}/sklearn/README.md (100%) rename {models/sklearn/classification => acousticbrainz/models/sklearn}/__init__.py (100%) rename {models/sklearn/helper_functions => acousticbrainz/models/sklearn/classification}/__init__.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/classification_task.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/classification_task_manager.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/classifier_basic.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/classifier_grid.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/confusion_matrix_export.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/evaluation.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/matrix_creation.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/report_files_export.py (100%) rename {models => acousticbrainz/models}/sklearn/classification/train_class.py (100%) rename {models/sklearn/model => acousticbrainz/models/sklearn/helper_functions}/__init__.py (100%) rename 
{models => acousticbrainz/models}/sklearn/helper_functions/logging_tool.py (100%) rename {models => acousticbrainz/models}/sklearn/helper_functions/utils.py (100%) rename {models/sklearn/transformation => acousticbrainz/models/sklearn/model}/__init__.py (100%) rename {models => acousticbrainz/models}/sklearn/model/classification_project.py (100%) rename {models => acousticbrainz/models}/sklearn/model/configuration_template.yaml (100%) rename {models => acousticbrainz/models}/sklearn/model/predict.py (100%) rename {models => acousticbrainz/models}/sklearn/requirements.txt (100%) create mode 100644 acousticbrainz/models/sklearn/transformation/__init__.py rename {models => acousticbrainz/models}/sklearn/transformation/load_ground_truth.py (100%) rename {models => acousticbrainz/models}/sklearn/transformation/load_low_level.py (100%) rename {models => acousticbrainz/models}/sklearn/transformation/transform.py (100%) rename {models => acousticbrainz/models}/sklearn/transformation/transform_predictions.py (100%) rename {models => acousticbrainz/models}/sklearn/transformation/utils_preprocessing.py (100%) diff --git a/models/__init__.py b/acousticbrainz/__init__.py similarity index 100% rename from models/__init__.py rename to acousticbrainz/__init__.py diff --git a/models/sklearn/__init__.py b/acousticbrainz/models/__init__.py similarity index 100% rename from models/sklearn/__init__.py rename to acousticbrainz/models/__init__.py diff --git a/models/sklearn/README.md b/acousticbrainz/models/sklearn/README.md similarity index 100% rename from models/sklearn/README.md rename to acousticbrainz/models/sklearn/README.md diff --git a/models/sklearn/classification/__init__.py b/acousticbrainz/models/sklearn/__init__.py similarity index 100% rename from models/sklearn/classification/__init__.py rename to acousticbrainz/models/sklearn/__init__.py diff --git a/models/sklearn/helper_functions/__init__.py b/acousticbrainz/models/sklearn/classification/__init__.py similarity index 100% rename from models/sklearn/helper_functions/__init__.py rename to acousticbrainz/models/sklearn/classification/__init__.py diff --git a/models/sklearn/classification/classification_task.py b/acousticbrainz/models/sklearn/classification/classification_task.py similarity index 100% rename from models/sklearn/classification/classification_task.py rename to acousticbrainz/models/sklearn/classification/classification_task.py diff --git a/models/sklearn/classification/classification_task_manager.py b/acousticbrainz/models/sklearn/classification/classification_task_manager.py similarity index 100% rename from models/sklearn/classification/classification_task_manager.py rename to acousticbrainz/models/sklearn/classification/classification_task_manager.py diff --git a/models/sklearn/classification/classifier_basic.py b/acousticbrainz/models/sklearn/classification/classifier_basic.py similarity index 100% rename from models/sklearn/classification/classifier_basic.py rename to acousticbrainz/models/sklearn/classification/classifier_basic.py diff --git a/models/sklearn/classification/classifier_grid.py b/acousticbrainz/models/sklearn/classification/classifier_grid.py similarity index 100% rename from models/sklearn/classification/classifier_grid.py rename to acousticbrainz/models/sklearn/classification/classifier_grid.py diff --git a/models/sklearn/classification/confusion_matrix_export.py b/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py similarity index 100% rename from 
models/sklearn/classification/confusion_matrix_export.py rename to acousticbrainz/models/sklearn/classification/confusion_matrix_export.py diff --git a/models/sklearn/classification/evaluation.py b/acousticbrainz/models/sklearn/classification/evaluation.py similarity index 100% rename from models/sklearn/classification/evaluation.py rename to acousticbrainz/models/sklearn/classification/evaluation.py diff --git a/models/sklearn/classification/matrix_creation.py b/acousticbrainz/models/sklearn/classification/matrix_creation.py similarity index 100% rename from models/sklearn/classification/matrix_creation.py rename to acousticbrainz/models/sklearn/classification/matrix_creation.py diff --git a/models/sklearn/classification/report_files_export.py b/acousticbrainz/models/sklearn/classification/report_files_export.py similarity index 100% rename from models/sklearn/classification/report_files_export.py rename to acousticbrainz/models/sklearn/classification/report_files_export.py diff --git a/models/sklearn/classification/train_class.py b/acousticbrainz/models/sklearn/classification/train_class.py similarity index 100% rename from models/sklearn/classification/train_class.py rename to acousticbrainz/models/sklearn/classification/train_class.py diff --git a/models/sklearn/model/__init__.py b/acousticbrainz/models/sklearn/helper_functions/__init__.py similarity index 100% rename from models/sklearn/model/__init__.py rename to acousticbrainz/models/sklearn/helper_functions/__init__.py diff --git a/models/sklearn/helper_functions/logging_tool.py b/acousticbrainz/models/sklearn/helper_functions/logging_tool.py similarity index 100% rename from models/sklearn/helper_functions/logging_tool.py rename to acousticbrainz/models/sklearn/helper_functions/logging_tool.py diff --git a/models/sklearn/helper_functions/utils.py b/acousticbrainz/models/sklearn/helper_functions/utils.py similarity index 100% rename from models/sklearn/helper_functions/utils.py rename to acousticbrainz/models/sklearn/helper_functions/utils.py diff --git a/models/sklearn/transformation/__init__.py b/acousticbrainz/models/sklearn/model/__init__.py similarity index 100% rename from models/sklearn/transformation/__init__.py rename to acousticbrainz/models/sklearn/model/__init__.py diff --git a/models/sklearn/model/classification_project.py b/acousticbrainz/models/sklearn/model/classification_project.py similarity index 100% rename from models/sklearn/model/classification_project.py rename to acousticbrainz/models/sklearn/model/classification_project.py diff --git a/models/sklearn/model/configuration_template.yaml b/acousticbrainz/models/sklearn/model/configuration_template.yaml similarity index 100% rename from models/sklearn/model/configuration_template.yaml rename to acousticbrainz/models/sklearn/model/configuration_template.yaml diff --git a/models/sklearn/model/predict.py b/acousticbrainz/models/sklearn/model/predict.py similarity index 100% rename from models/sklearn/model/predict.py rename to acousticbrainz/models/sklearn/model/predict.py diff --git a/models/sklearn/requirements.txt b/acousticbrainz/models/sklearn/requirements.txt similarity index 100% rename from models/sklearn/requirements.txt rename to acousticbrainz/models/sklearn/requirements.txt diff --git a/acousticbrainz/models/sklearn/transformation/__init__.py b/acousticbrainz/models/sklearn/transformation/__init__.py new file mode 100644 index 000000000..40a96afc6 --- /dev/null +++ b/acousticbrainz/models/sklearn/transformation/__init__.py @@ -0,0 +1 @@ +# -*- coding: 
utf-8 -*- diff --git a/models/sklearn/transformation/load_ground_truth.py b/acousticbrainz/models/sklearn/transformation/load_ground_truth.py similarity index 100% rename from models/sklearn/transformation/load_ground_truth.py rename to acousticbrainz/models/sklearn/transformation/load_ground_truth.py diff --git a/models/sklearn/transformation/load_low_level.py b/acousticbrainz/models/sklearn/transformation/load_low_level.py similarity index 100% rename from models/sklearn/transformation/load_low_level.py rename to acousticbrainz/models/sklearn/transformation/load_low_level.py diff --git a/models/sklearn/transformation/transform.py b/acousticbrainz/models/sklearn/transformation/transform.py similarity index 100% rename from models/sklearn/transformation/transform.py rename to acousticbrainz/models/sklearn/transformation/transform.py diff --git a/models/sklearn/transformation/transform_predictions.py b/acousticbrainz/models/sklearn/transformation/transform_predictions.py similarity index 100% rename from models/sklearn/transformation/transform_predictions.py rename to acousticbrainz/models/sklearn/transformation/transform_predictions.py diff --git a/models/sklearn/transformation/utils_preprocessing.py b/acousticbrainz/models/sklearn/transformation/utils_preprocessing.py similarity index 100% rename from models/sklearn/transformation/utils_preprocessing.py rename to acousticbrainz/models/sklearn/transformation/utils_preprocessing.py From 21ce596311652c6387ba3b9e73389315f54339d0 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 30 Jun 2021 13:17:30 +0530 Subject: [PATCH 34/64] Fix location of sklearn module in Dockerfile --- Dockerfile.py3 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.py3 b/Dockerfile.py3 index 5460cf91f..7e6eaa9af 100644 --- a/Dockerfile.py3 +++ b/Dockerfile.py3 @@ -54,8 +54,8 @@ COPY --chown=acousticbrainz:acousticbrainz requirements.txt /code/requirements.t RUN pip install --no-cache-dir -r requirements.txt # Python dependencies for sklearn -COPY --chown=acousticbrainz:acousticbrainz models/sklearn/requirements.txt /code/models/sklearn/requirements.txt -RUN pip install --no-cache-dir -r /code/models/sklearn/requirements.txt +COPY --chown=acousticbrainz:acousticbrainz acousticbrainz/models/sklearn/requirements.txt /code/acousticbrainz/models/sklearn/requirements.txt +RUN pip install --no-cache-dir -r /code/acousticbrainz/models/sklearn/requirements.txt FROM acousticbrainz-sklearn AS acousticbrainz-dev From eed4b2c3e1cf233c86b8390773aa05a7732504b4 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 30 Jun 2021 14:00:35 +0530 Subject: [PATCH 35/64] Fix sklearn imports --- dataset_eval/evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 22c9cdf9e..1dc057131 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -21,8 +21,8 @@ eval_tool_use = "gaia" is_sklearn = os.getenv("MODEL_TRAINING_SKLEARN") if is_sklearn == "1": - from models.sklearn.model.classification_project import create_classification_project - from models.sklearn.classification.matrix_creation import simplified_matrix_export + from acousticbrainz.models.sklearn.model.classification_project import create_classification_project + from acousticbrainz.models.sklearn.classification.matrix_creation import simplified_matrix_export eval_tool_use = "sklearn" is_gaia = os.getenv("MODEL_TRAINING_GAIA") From 89d327620644eddd487c27d35b676f43eb455100 Mon Sep 17 00:00:00 2001 From: 
Kartik Ohri Date: Wed, 30 Jun 2021 14:52:25 +0530 Subject: [PATCH 36/64] Add failOnUnmatched: False to gaia project file --- dataset_eval/gaia_wrapper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dataset_eval/gaia_wrapper.py b/dataset_eval/gaia_wrapper.py index c841a7ec2..42fd0699c 100644 --- a/dataset_eval/gaia_wrapper.py +++ b/dataset_eval/gaia_wrapper.py @@ -73,6 +73,8 @@ def update_parameters(project_file, c_values, gamma_values, preprocessing_values pref['gamma'] = gamma_values if preprocessing_values: pref['preprocessing'] = preprocessing_values + # Temporarily disable failing on unmatched descriptors to avoid "didn't match any descriptor" errors while training + project['failOnUnmatched'] = False with open(project_file, "w") as pfile: yaml.dump(project, pfile) From 0ff83db47c48d8bd2f85494cdd008be4dde00f38 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 30 Jun 2021 14:59:59 +0530 Subject: [PATCH 37/64] Revert "Add failOnUnmatched: False to gaia project file" This reverts commit 89d327620644eddd487c27d35b676f43eb455100. --- dataset_eval/gaia_wrapper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dataset_eval/gaia_wrapper.py b/dataset_eval/gaia_wrapper.py index 42fd0699c..c841a7ec2 100644 --- a/dataset_eval/gaia_wrapper.py +++ b/dataset_eval/gaia_wrapper.py @@ -73,8 +73,6 @@ def update_parameters(project_file, c_values, gamma_values, preprocessing_values pref['gamma'] = gamma_values if preprocessing_values: pref['preprocessing'] = preprocessing_values - # Temporarily disable failing on unmatched descriptors to avoid "didn't match any descriptor" errors while training - project['failOnUnmatched'] = False with open(project_file, "w") as pfile: yaml.dump(project, pfile) From 35aafb47feaed884216538af9b436d146d4a716a Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 30 Jun 2021 22:50:55 +0530 Subject: [PATCH 38/64] Do not delete temp files for debugging --- dataset_eval/evaluate.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 1dc057131..38af1033d 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -55,7 +55,8 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): eval_location = os.path.join(os.path.abspath(dataset_dir), eval_job["id"]) utils.path.create_path(eval_location) - temp_dir = tempfile.mkdtemp() + temp_dir = os.path.join(eval_location, 'temp') + utils.path.create_path(temp_dir) training_tool = eval_job["options"].get("training_tool", "gaia") @@ -104,12 +105,6 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): ) logging.info(e) - finally: - # Clean up the source files used to generate this model. - # We can recreate them from the database if we need them - # at a later stage.
- shutil.rmtree(temp_dir) - def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, storage_dir, eval_job): results = gaia_wrapper.train_model( From 82c9f1200c80ee55531b3040d02e16fe32bcf106 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Thu, 1 Jul 2021 00:12:37 +0530 Subject: [PATCH 39/64] Use yaml.safe_dump --- dataset_eval/evaluate.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 38af1033d..ee8a6a71e 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -71,15 +71,14 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): filelist_path = os.path.join(eval_location, "filelist.yaml") filelist = dump_lowlevel_data(train.keys(), temp_dir) with open(filelist_path, "w") as f: - yaml.dump(filelist, f) + yaml.safe_dump(filelist, f) elif training_tool == "sklearn": dump_lowlevel_data_sklearn(train.keys(), dataset_dir) logging.info("Generating groundtruth.yaml...") groundtruth_path = os.path.join(eval_location, "groundtruth.yaml") with open(groundtruth_path, "w") as f: - # yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) - yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f, Dumper=yaml.SafeDumper) + yaml.safe_dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) if training_tool == "gaia": logging.info("Training GAIA model...") @@ -250,7 +249,7 @@ def lowlevel_data_to_yaml(data): if 'lossless' in data['metadata']['audio_properties']: del data['metadata']['audio_properties']['lossless'] - return yaml.dump(data) + return yaml.safe_dump(data) def dump_lowlevel_data_sklearn(recordings, location): From edf3d4f5f149ceb08ed4df333d77400e1a615e99 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 Jul 2021 14:40:31 +0530 Subject: [PATCH 40/64] Store sklearn accuracy on 0 to 100 scale --- dataset_eval/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index ee8a6a71e..eed1f0c42 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -183,7 +183,8 @@ def load_best_results_sklearn(exported_path, project_file, exports_directory): return { "parameters": data_best_model["params"], - "accuracy": round(data_best_model["score"], 2), + # for consistency with gaia which reports accuracy on scale of 0 to 100 + "accuracy": round(data_best_model["score"] * 100, 2), "confusion_matrix": simplified_cm, "history_path": "Does not exist because of sklearn training usage" } From c09b0c7ef32aa5fabcfc141954c8e4b9fc3b0d2b Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Mon, 5 Jul 2021 15:39:11 +0530 Subject: [PATCH 41/64] Display training_tool in evaluation jobs list --- webserver/static/scripts/datasets/eval-jobs-viewer.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/webserver/static/scripts/datasets/eval-jobs-viewer.js b/webserver/static/scripts/datasets/eval-jobs-viewer.js index 3e89c52bd..03411d5c8 100644 --- a/webserver/static/scripts/datasets/eval-jobs-viewer.js +++ b/webserver/static/scripts/datasets/eval-jobs-viewer.js @@ -191,6 +191,7 @@ class JobList extends React.Component { id={cls.id} created={cls.created} status={cls.status} + training_tool={cls.options.training_tool ?? 
"gaia"} outdated={cls.outdated} showDelete={this.props.showDelete} onViewDetails={this.props.onViewDetails} @@ -227,6 +228,7 @@ class JobRow extends React.Component { id: PropTypes.string.isRequired, created: PropTypes.string.isRequired, status: PropTypes.string.isRequired, + training_tool: PropTypes.string.isRequired, outdated: PropTypes.string.isRequired, showDelete: PropTypes.bool.isRequired, onViewDetails: PropTypes.func.isRequired, @@ -283,6 +285,7 @@ class JobRow extends React.Component {
{this.props.created} {this.props.training_tool} {controls}
{this.props.created} {this.props.training_tool}{this.props.training_tool} {controls}
Job ID Status Creation timeTraining Tool
[listing garbled here: the headers of the intervening patches are lost; the surviving fragment removes the HTML-rendering code from the confusion matrix module -- the "Predicted (%)" / "Actual (%)" table markup, the per-cell "%.2f %d %s (out of %d) classified as %s" tooltips, the hard-coded 10% highlight threshold, the per-row proportions, and the standAlone/embedStyleSheet page wrapper -- so that loading a stored result returns the matrix dictionary itself] - return html + return matrix diff --git a/acousticbrainz/models/sklearn/classification/matrix_creation.py b/acousticbrainz/models/sklearn/classification/matrix_creation.py index 8732a9f9f..536acb5f4 100644 --- a/acousticbrainz/models/sklearn/classification/matrix_creation.py +++ b/acousticbrainz/models/sklearn/classification/matrix_creation.py @@ -1,7 +1,7 @@ import os import json import numpy as np -from ..classification.confusion_matrix_export import ConfusionMatrixCreation +from ..classification.confusion_matrix_export import load_as_confusion_matrix def matrix_creation(classes, tracks, y_actual, y_hat, logger, export_save_path, export_name): @@ -49,14 +49,13 @@ def matrix_creation(classes, tracks, y_actual, y_hat, logger, export_save_path, def simplified_matrix_export(best_result_file, logger, export_save_path, export_name, write_mode=False): - cm = ConfusionMatrixCreation() load_file_path = os.path.join(export_save_path, best_result_file) # best model data load from JSON logger.info("load best model results from JSON format file") - cm.load(load_file_path) + confusion_matrix = load_as_confusion_matrix(load_file_path) logger.info("Best model results loaded..") simplified_cm = {} - for key, val in cm.matrix.items(): + for key, val in confusion_matrix.items(): simplified_cm[key] = {} for predicted_key, predicted_val in val.items(): simplified_cm[key][predicted_key] = len(predicted_val) From 98a5dfeab308e38e1146af029934c367a295a3a3 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Thu, 8 Jul 2021 00:00:20 +0530 Subject: [PATCH 63/64] Remove unnecessary default dicts from matrix creation We never really access these dicts other than iterating over them by calling `.items()`, so the defaultdicts are redundant and only add confusion.
--- .../classification/confusion_matrix_export.py | 14 -------------- .../sklearn/classification/matrix_creation.py | 7 ++++--- 2 files changed, 4 insertions(+), 17 deletions(-) delete mode 100644 acousticbrainz/models/sklearn/classification/confusion_matrix_export.py diff --git a/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py b/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py deleted file mode 100644 index aa382c2ae..000000000 --- a/acousticbrainz/models/sklearn/classification/confusion_matrix_export.py +++ /dev/null @@ -1,14 +0,0 @@ -# encoding: utf-8 -from collections import defaultdict -import json - -def load_as_confusion_matrix(filename): - with open(filename) as f: - data = json.load(f) - - # convert to a defaultdict the data we just loaded - matrix = defaultdict(lambda: defaultdict(list)) - for k, v in data['matrix'].items(): - matrix[k] = defaultdict(list, v) - - return matrix diff --git a/acousticbrainz/models/sklearn/classification/matrix_creation.py b/acousticbrainz/models/sklearn/classification/matrix_creation.py index 536acb5f4..2c6f1e71f 100644 --- a/acousticbrainz/models/sklearn/classification/matrix_creation.py +++ b/acousticbrainz/models/sklearn/classification/matrix_creation.py @@ -1,7 +1,6 @@ import os import json import numpy as np -from ..classification.confusion_matrix_export import load_as_confusion_matrix def matrix_creation(classes, tracks, y_actual, y_hat, logger, export_save_path, export_name): @@ -50,10 +49,12 @@ def matrix_creation(classes, tracks, y_actual, y_hat, logger, export_save_path, def simplified_matrix_export(best_result_file, logger, export_save_path, export_name, write_mode=False): load_file_path = os.path.join(export_save_path, best_result_file) - # best model data load from JSON logger.info("load best model results from JSON format file") - confusion_matrix = load_as_confusion_matrix(load_file_path) + with open(load_file_path) as f: + data = json.load(f) + confusion_matrix = data['matrix'] logger.info("Best model results loaded..") + simplified_cm = {} for key, val in confusion_matrix.items(): simplified_cm[key] = {} From ade079587c4ee9019d5b95f338d7a2d4dbb8e742 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Thu, 8 Jul 2021 16:57:35 +0530 Subject: [PATCH 64/64] Save best model for sklearn to result->>'model' --- dataset_eval/evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 0934afa3f..d08258d34 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -144,7 +144,7 @@ def evaluate_sklearn(options, eval_location, ground_truth_file, dataset_dir, sto "parameters": results["parameters"], "accuracy": results["accuracy"], "confusion_matrix": results["confusion_matrix"], - "history_path": results["history_path"], + "model": results["model"], })) @@ -185,7 +185,7 @@ def load_best_results_sklearn(exported_path, project_file): # for consistency with gaia which reports accuracy on scale of 0 to 100 "accuracy": round(data_best_model["score"] * 100, 2), "confusion_matrix": simplified_cm, - "history_path": "Does not exist because of sklearn training usage" + "model": os.path.join(exported_path, "best_clf_model.pkl") # path to best model pickle file }
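Taken together with the accuracy-scale change in PATCH 40, a minimal sketch of the dict load_best_results_sklearn() now returns for storage in the job's result column; every value below is invented for illustration:

    # Illustrative only -- parameters, accuracy, counts, and path are made up.
    sklearn_job_result = {
        "parameters": {"C": 5, "gamma": 0.1, "kernel": "rbf"},
        "accuracy": 92.35,  # score * 100, matching gaia's 0-100 scale
        "confusion_matrix": {  # simplified matrix: actual -> predicted -> count
            "danceable": {"danceable": 120, "not_danceable": 7},
            "not_danceable": {"not_danceable": 98, "danceable": 11},
        },
        "model": "/data/datasets/<job-id>/best_clf_model.pkl",  # pickled best classifier
    }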