diff --git a/Dockerfile.py3 b/Dockerfile.py3 new file mode 100644 index 000000000..7e6eaa9af --- /dev/null +++ b/Dockerfile.py3 @@ -0,0 +1,119 @@ +FROM metabrainz/python:3.7 AS acousticbrainz-sklearn + +# Dockerize +ENV DOCKERIZE_VERSION v0.6.1 +RUN wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \ + && tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz + +# Install dependencies +# Hadolint DL4006 +SHELL ["/bin/bash", "-o", "pipefail", "-c"] +# Node +RUN wget -q -O - https://deb.nodesource.com/setup_12.x | bash - && apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + git \ + ipython \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ + libavresample-dev \ + libffi-dev \ + libfftw3-dev \ + libpq-dev \ + libsamplerate0-dev \ + libqt4-dev \ + libssl-dev \ + libtag1-dev \ + libxml2-dev \ + libxslt1-dev \ + libyaml-dev \ + nodejs \ + pkg-config \ + pxz \ + python-dev \ + python-numpy-dev \ + python-numpy \ + swig2.0 \ + && rm -rf /var/lib/apt/lists/* + +RUN mkdir /code +RUN mkdir /data +WORKDIR /code + +RUN groupadd --gid 901 acousticbrainz +RUN useradd --create-home --shell /bin/bash --uid 901 --gid 901 acousticbrainz + +RUN chown acousticbrainz:acousticbrainz /code + +# Python dependencies +RUN mkdir /code/docs/ && chown acousticbrainz:acousticbrainz /code/docs/ +COPY --chown=acousticbrainz:acousticbrainz docs/requirements.txt /code/docs/requirements.txt +COPY --chown=acousticbrainz:acousticbrainz requirements.txt /code/requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Python dependencies for sklearn +COPY --chown=acousticbrainz:acousticbrainz acousticbrainz/models/sklearn/requirements.txt /code/acousticbrainz/models/sklearn/requirements.txt +RUN pip install --no-cache-dir -r /code/acousticbrainz/models/sklearn/requirements.txt + + +FROM acousticbrainz-sklearn AS 
acousticbrainz-dev + +COPY --chown=acousticbrainz:acousticbrainz requirements_development.txt /code/requirements_development.txt +RUN pip install --no-cache-dir -r requirements_development.txt + + +# We don't copy code to the dev image because it's added with a volume mount +# during development, however it's needed for tests. Add it here. +FROM acousticbrainz-dev AS acousticbrainz-test + +COPY . /code + + +FROM acousticbrainz-sklearn AS acousticbrainz-prod +USER root + +RUN pip install --no-cache-dir uWSGI==2.0.17.1 + +RUN mkdir /cache_namespaces && chown -R acousticbrainz:acousticbrainz /cache_namespaces + +# Consul template service is already set up, just need to copy the configuration +COPY ./docker/consul-template.conf /etc/consul-template.conf + +# runit service files +# All services are created with a `down` file, preventing them from starting +# rc.local removes the down file for the specific service we want to run in a container +# http://smarden.org/runit/runsv.8.html + +# uwsgi service files +COPY ./docker/uwsgi/uwsgi.service /etc/service/uwsgi/run +COPY ./docker/uwsgi/uwsgi.ini /etc/uwsgi/uwsgi.ini +RUN touch /etc/service/uwsgi/down + +# hl_extractor service files +COPY ./docker/hl_extractor/hl_extractor.service /etc/service/hl_extractor/run +RUN touch /etc/service/hl_extractor/down + +# dataset evaluator service files +COPY ./docker/dataset_eval/dataset_eval.service /etc/service/dataset_eval/run +RUN touch /etc/service/dataset_eval/down + +# Add cron jobs +COPY docker/crontab /etc/cron.d/acousticbrainz +RUN chmod 0644 /etc/cron.d/acousticbrainz +RUN touch /etc/service/cron/down + +COPY ./docker/rc.local /etc/rc.local + +COPY --chown=acousticbrainz:acousticbrainz package.json /code + +USER acousticbrainz +RUN npm install + +COPY --chown=acousticbrainz:acousticbrainz . 
/code + +RUN npm run build:prod + +# Our entrypoint runs as root +USER root diff --git a/acousticbrainz/__init__.py b/acousticbrainz/__init__.py new file mode 100644 index 000000000..40a96afc6 --- /dev/null +++ b/acousticbrainz/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/acousticbrainz/models/__init__.py b/acousticbrainz/models/__init__.py new file mode 100644 index 000000000..40a96afc6 --- /dev/null +++ b/acousticbrainz/models/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/acousticbrainz/models/sklearn/README.md b/acousticbrainz/models/sklearn/README.md new file mode 100644 index 000000000..3287d075c --- /dev/null +++ b/acousticbrainz/models/sklearn/README.md @@ -0,0 +1,243 @@ +# Machine Learning Infrastructure with scikit-learn (GSoC 2020) + +This folder contains the tool that is built for training SVM models of +AcousticBrainz's datasets, as well as predicting where a single AcousticBrainz +track instance can be classified based on the trained models. It is part of the +*Google Summer of Code 2020* in collaboration with the **MetaBrainz** Open-Source +organization. + +Given a dataset, a Grid Search algorithm using n-fold cross-validation is executed +for an exhaustive search over specified parameter values for an estimator. + +A final model is trained with all the data (without a validation set) featuring +the best parameter combination in terms of accuracy. + +Finally, a prediction functionality is part of the tool, which gives the user the +capability of predicting where a track instance is classified based on a trained model. + + +## Functionalities + +### Train +The main model training function is the `create_classification_project` which is located in +the `model.classification_project.py` Python script. It can be imported as a module. 
+It requires a path to the dataset directory that contains sub-folders +composed of the groundtruth yaml file/s (tracks, tracks paths, labels, target class), and +the features (low-level data) in JSON format. + +``` +create_classification_project() + +Generates a model trained using descriptor files specified in the groundtruth yaml file. + +positional parameters: +groundtruth Path of the main dataset directory containing the + groundtruth yaml file/s. (required) + +file Name of the project configuration file (.yaml) will be stored. + If not specified it takes automatically the name ." + +exportsdir Name of the exports directory that the project's results + will be stored (best model, grid models, transformation + pipelines, folded and shuffled dataset). + +path Path where the project results will be stored. If empty, + the results will be saved in the main app directory. + +optional parameters: + +c_values The C values parameter (list) for the SVM Grid Search + (e.g. [-2, 3, 5, 10]). In case of None, the values will be set up + by the specified in the configuration template. + +gamma_values The gamma values parameter (list) for the SVM Grid Search + (e.g. [ 3, 1, -1, -3]). In case of None, the values will be set up + by the specified in the configuration template. + +preprocessing_values: The preprocessing values parameter (list) for the + SVM Grid Search. They must be one or more of the following list: + ["basic", "lowlevel", "nobands", "normalized", "gaussianized"] + In case of None, the values will be set up + by the specified in the configuration template. + +logging The logging level (int) that will be printed (0: DEBUG, 1: INFO, + 2: WARNING, 3: ERROR, 4: CRITICAL). Can be set only in the + prescribed integer values (0, 1, 2, 3, 4) + +seed Seed (int) is used to generate the random shuffled dataset + applied later to folding. If no seed is specified, the seed + will be automatically set to current clock value. + +jobs Parallel jobs (int). 
Set a value of cores to be used. + The default is -1, which means that all the available cores + will be used. + +verbose Controls the verbosity (int) of the Grid Search print messages + on the console: the higher, the more messages. +``` + +For example, a dataset path directory structure could be like this one: + + dataset (e.g. danceability) + |- features + |  |-happy + |  |  |- 1.json + |  |  |- 2.json + |  |  |- 3.json + |  |  |- 4.json + |  |-sad + |  |  |- 1.json + |  |  |- 2.json + |  |  |- 3.json + |- metadata + |  |- groundtruth.yaml + +The tool will train a model with 2 classes (happy, sad), with 4 and 3 files in each class, respectively. + +The tool generates a `.yaml` project file to the path and exports directory specified or by the +arguments or automatically by the tool itself. This project file contains information about the +preprocessing steps that are followed through the training process, as well as the path and directory +where the results after the model training will be stored to. + + +### How the Training mode works + +There are several steps which are followed in the training phase. First of all, the project +configuration template file is loaded. Then, based on the arguments that are specified via the +`create_classification_project` function invoke, the`ListGroundTruthFiles` class searches for +the available `.yaml` file/s which contain the target class and the *groundtruth* data. These files +are inside the specified dataset directory. + +Afterwards, for each target class, the following actions take place inside the +`train_class` function: + +1. It starts with the `GroundTruthLoad` class that loads the *groundtruth* data from the related `.yaml` file. By + using its included methods, the tracks with their labels shuffled, in tuples, are exported as well as the + target class exploiting the `export_gt_tracks()` and the `export_train_class()` accordingly. The shuffled + dataset is also exported and saved locally in `.csv` format. 
A logger object is also set up and the logging + results are exported into the relevant `.log` file. + +2. It creates a project configuration file based on the specified paths for the exported results, as well as + a relevant directory that these results will be stored to. The training model results comprise: + +3. The `DatasetExporter` class is used then to load the tracks' features and exports them in a `pandas DataFrame`. + The tracks and the labels are also exported in separate `NumPy arrays` too. + +4. The `ClassificationTaskManager` class is invoked which is used for extracting the different classification tasks + that are specified in the configuration file. This is done by calling the `TrainingProcesses` class, which reads + the configuration file, and extracts the available training processes in a list. Each item of the list is + composed of a Python dictionary that comprises the evaluation that will take place with its: a) the classifier used, + b) the preprocess steps (features selection, scaling type, etc.), c) the k-fold cross-validation (number of folds), + and finally, d) the combination parameters that a Grid Search algorithm will use to find the best model that will + be assigned to the classifier. + +5. For each evaluation, the `ClassificationTask` class is used. The class loads the list of process dictionaries, with + their corresponding training steps as described above that contain also the features with their labels, as well as + the specified in the configuration file classifier that will be used for training the model. + +6. The whole specified classification task (i.e. the preprocessing, the training of the model for the selected + features, and the evaluation) takes place inside the `ClassificationTask` class. The `TrainGridClassifier` is + responsible for the classifier training by using a Grid Search algorithm which, in our case loads a + `Support Vector Machines` Machine Learning model from sklearn with a grid of parameters. + +7. 
For each preprocessing step, the `Transform` class is responsible for doing the appropriate preprocess, like the + data cleaning, the features selection, the enumeration, and the scaling, when it is available. For each + preprocessing step, the corresponding transformation pipeline (in `.pkl` format) is extracted and saved locally + for later use in the predictions mode. + +8. The transformed features data is loaded then to the `train_grid` function where the training of the model takes place. + The results of the training phase are extracted by using the `save_grid_results` function. Such results are the best + parameters that did best in each training phase (i.e. in each training step), as well as the best model from this + training step which is saved locally in `.pkl` format. Finally, the best extracted + models from each training process are compared and the best one is chosen. The information about the best model + parameters, with the preprocess step that was followed are exported and saved in a `.json` file locally, and + include: + * Best model's score, the parameters, the preprocess (data cleaning, features selection, enumeration, scaling), + and the number of folds that the dataset was split into through the cross-validation training procedure. + +9. The `evaluation` function is used to evaluate the best model and the relevant reports are + exported. The best model and the corresponding preprocessing step pipeline are loaded, and a k-fold + cross-validation training takes place. The results from this process are: + * A `yaml` file that contains the tracks' instances and the fold that were classified is exported in this phase. + * A `.csv` file that includes the tracks, the prediction that took place in the relevant fold, the true label, + and the probability of the classifier's decision function that took for each class prediction. + * The plot that depicts the accuracy score delivered from each fold training. 
+ * A `.txt` file that contains detailed information about each fold's training score, the *mean* of all the + accuracies exported from each fold, as well as the *standard deviation* of these accuracies. + * The `.txt` files that contain the confusion matrix and the classification report of the cross-validation + training. + +10. Finally, the `evaluation` function executes a training to the whole dataset by using the best model that is + extracted from the grid search algorithm. After applying predictions to the whole dataset, the related `.txt` + files with the confusion matrix and the classification report are exported and saved locally to the disk. The + trained model, after this training phase is saved locally in `.pkl` format for later use from the + predictions mode of the tool. + + + +### Predict + +The `model.predict.py` script contains the `prediction` function. This function can be invoked by +importing the function in a separate script and invoking it with its corresponding parameters. The +project `.yaml` file with project's configuration metadata is a required field in the function's +parameters, as well as the **MBID** of the track to be called for predicting to which trained model's +class will be classified. The MBID is actually the MusicBrainz ID which is the unique track's ID +stored in the MusicBrainz and AcousticBrainz database. For example, the following link: +* https://acousticbrainz.org/232b8e6e-0aa5-4310-8df3-583047af3126 +has the MBID: `232b8e6e-0aa5-4310-8df3-583047af3126` + +This is the only necessary information for the related argument of the `prediction` function to +make the relevant classification. + +``` +$ python predict.py --help +usage: predict.py [-h] [--path] [--file] [--track] [--logging] + +positional arguments: +path Path where the project file (.yaml) is stored (required). + +file Name of the project configuration file (.yaml) that + is to be loaded. (required) + The .yaml at the end of the file is not necessary. 
+ Just put the name of the file. + +track MBID of the low-level data from the AcousticBrainz API. + (required) + +optional arguments: + +logging The logging level (int) that will be printed (0: DEBUG, 1: INFO, + 2: WARNING, 3: ERROR, 4: CRITICAL). Can be set only in the + prescribed integer values (0, 1, 2, 3, 4) +``` + +### How the Predictions mode works + +The function and the class that are used in this phase are the `prediction` and the `Predict` respectively. The steps +that are followed in this mode are: + +1. The `prediction` function loads the project configuration file that was created by the training of the + corresponding model. This `.yaml` file includes all the relevant information about the paths that the + trained model and the preprocessing pipelines were saved to (in `.pkl` format). + +2. Then, by using the MBID that was inserted as an argument, it downloads the low-level data from AcousticBrainz API, + using the `requests` library. + +3. The data, which are in JSON format are then loaded to the `Predict` class, with the built model's configuration + data (training results' location, etc.). + +4. The `Predict` loads the best model's JSON file that was saved from the training mode, and checks the preprocessing + step that resulted in the best model. + +5. After checking which was the preprocessing step that was specified inside the best model's metadata, the + `TransformPredictions` class is invoked and does the necessary data transformation by loading the corresponding + preprocessing pipeline that was saved in `.pkl` format during the training mode. + +6. After that, it loads the best trained model that was saved in `.pkl` format. + +7. It does the prediction. + +8. It returns a dictionary that includes: + * the predicted class + * the score of the predicted class + * the probabilities for each class the model took to decide to which one the track will be classified. 
class ClassificationTask:
    """
    Core of the model classification for a single classifier and target class.

    It keeps the configuration data, the classifier name, the features, the labels and
    the tracks, trains the classifier with the training kind declared in the
    configuration (only grid search is implemented here) and then evaluates the best
    model that the training step exported.
    """
    def __init__(self, config, classifier, train_class, training_processes, X, y, exports_path, tracks, logger):
        """
        Args:
            config: The configuration data that contain the settings from the configuration
                template with the parsed arguments in classification project.
            classifier: The classifier name (e.g. svm) that is declared in the classifiers
                list of the configuration data.
            train_class: The class name that is defined in the groundtruth yaml file. It is
                actually the model that will be trained.
            training_processes: The training processes (list) where each item of the list
                contains the set of parameters that will be used in the classifier:
                (Evaluation, classifier, preprocess, kernel, C, gamma, balanceClasses, n_fold)
            X: The features (pandas DataFrame) of the exported data from the DatasetExporter class
            y: The labels (NumPy array) of the target class
            exports_path: Path to where the classification project's results will be stored to.
            tracks: The tracks (numpy.ndarray) that are exported from the Groundtruth file.
            logger: The logger object that the task reports its progress to.
        """
        self.config = config
        self.classifier = classifier
        self.train_class = train_class

        self.X = X
        self.y = y
        self.training_processes = training_processes
        self.exports_path = exports_path
        self.tracks = tracks
        self.logger = logger

    def run(self):
        """
        Train the classifier and evaluate the best model that the training exported.

        Raises:
            ValueError: If config["train_kind"] is not "grid", the only kind handled here.
        """
        # Fail fast: without a training step no best-model file is written, so going on
        # would either crash on open() or silently evaluate a stale file.
        if self.config["train_kind"] != "grid":
            self.logger.error("Use a valid classifier in the configuration file.")
            raise ValueError("Unsupported train_kind: {!r}".format(self.config["train_kind"]))

        # grid search train
        self.logger.info("Train Classifier: Classifier with GridSearchCV")
        grid_svm_train = TrainGridClassifier(config=self.config,
                                             classifier=self.classifier,
                                             class_name=self.train_class,
                                             X=self.X,
                                             y=self.y,
                                             tr_processes=self.training_processes,
                                             exports_path=self.exports_path,
                                             logger=self.logger
                                             )
        grid_svm_train.train_grid_search_clf()
        grid_svm_train.export_best_classifier()
        self.logger.info("Training the classifier is completed successfully.")

        # load best model to check its parameters
        self.logger.debug("Loading the Best Model..")
        best_model_name = "best_model_{}.json".format(self.train_class)
        with open(os.path.join(self.exports_path, best_model_name)) as best_model_file:
            best_model = json.load(best_model_file)
        self.logger.debug("BEST MODEL: {}".format(best_model))

        # evaluation
        evaluation(config=self.config,
                   n_fold=best_model["n_fold"],
                   X=self.X, y=self.y,
                   class_name=self.train_class,
                   tracks=self.tracks,
                   process=best_model["preprocessing"],
                   exports_path=self.exports_path,
                   logger=self.logger
                   )
+ """ + self.config = config + self.train_class = train_class + self.X = X + self.y = y + self.tracks = tracks + self.exports_path = exports_path + self.logger = logger + + self.results_path = "" + self.logs_path = "" + self.tracks_path = "" + self.dataset_path = "" + self.models_path = "" + self.images_path = "" + self.reports_path = "" + + self.files_existence() + self.config_file_analysis() + + + def files_existence(self): + """ + Ensure that all the folders will exist before the training process starts. + """ + # main exports + # train results exports + self.results_path = create_directory(self.exports_path, "results") + # logs + self.logs_path = create_directory(self.exports_path, "logs") + # tracks + self.tracks_path = create_directory(self.exports_path, "tracks_csv_format") + # datasets + self.dataset_path = create_directory(self.exports_path, "dataset") + # models + self.models_path = create_directory(self.exports_path, "models") + # images + self.images_path = create_directory(self.exports_path, "images") + # reports + self.reports_path = create_directory(self.exports_path, "reports") + + def config_file_analysis(self): + """ + Check the keys of the configuration template file if they are set up correctly. 
+ """ + self.logger.info("---- CHECK FOR INAPPROPRIATE CONFIG FILE FORMAT ----") + if "processing" not in self.config: + self.logger.error("No preprocessing defined in config.") + + if "evaluations" not in self.config: + self.logger.error("No evaluations defined in config.") + self.logger.error("Setting default evaluation to 10-fold cross-validation") + self.config["evaluations"] = {"nfoldcrossvalidation": [{"nfold": [10]}]} + + for classifier in self.config['classifiers'].keys(): + if classifier not in validClassifiers: + self.logger.error("Not a valid classifier: {}".format(classifier)) + raise ValueError("The classifier name must be valid.") + + for evaluation in self.config['evaluations'].keys(): + if evaluation not in validEvaluations: + self.logger.error("Not a valid evaluation: {}".format(evaluation)) + raise ValueError("The evaluation must be valid.") + self.logger.info("No errors in config file format found.") + + def apply_processing(self): + """ + Evaluation steps extraction and classification task execution for each step. 
class TrainGridClassifier:
    """
    Runs a grid search for every training process and exports the best resulting model.
    """
    def __init__(self, config, classifier, class_name, X, y, tr_processes, exports_path, logger):
        """
        Args:
            config: The configuration data of the classification project.
            classifier: The classifier name; only "svm" is accepted by train_grid_search_clf().
            class_name: Name of the target class; it is used in the exported file names.
            X: The features that are handed to Transform before each training.
            y: The labels that each grid search is fitted on.
            tr_processes: Iterable of training process dictionaries.
            exports_path: Directory that holds the "results" and "models" folders and
                receives the best model JSON file.
            logger: Logger object used for the progress messages.
        """
        self.config = config
        self.classifier = classifier
        self.class_name = class_name
        self.X = X
        self.y = y
        self.tr_processes = tr_processes
        self.exports_path = exports_path

        self.logger = logger
        # one result dictionary per training process, filled by train_grid_search_clf()
        self.best_models_list = []

    def train_grid_search_clf(self):
        """
        Train one grid search per training process and collect the best result of each.

        Raises:
            ValueError: If self.classifier is not "svm".
        """
        for process_counter, tr_process in enumerate(self.tr_processes, start=1):
            print(colored("Train process {} - {}".format(process_counter, tr_process), "green"))
            self.logger.info("(Grid) - Train process {} - {}".format(process_counter, tr_process))
            # initiate SVM classifier object
            if self.classifier == "svm":
                grid_clf = SVC(gamma="auto", probability=True)
            # TODO: different classifier object (e.g. random forests, knn, etc) can be initiated here
            else:
                raise ValueError('The classifier name must be valid.')

            print("CLASSIFIER", tr_process["classifier"])
            # transformation of the data
            features_prepared = Transform(config=self.config,
                                          df_feats=self.X,
                                          process=tr_process["preprocess"],
                                          train_class=self.class_name,
                                          exports_path=self.exports_path,
                                          logger=self.logger).post_processing()

            # train the grid classifier and return the trained model
            gsvc = train_grid(tr_process=tr_process,
                              grid_clf=grid_clf,
                              features_prepared=features_prepared,
                              y=self.y,
                              config=self.config,
                              logger=self.logger)

            # save best results for each train process
            # paths declaration for saving the grid training results
            results_path = os.path.join(self.exports_path, "results")
            models_path = os.path.join(self.exports_path, "models")
            best_process_model_path = os.path.join(models_path, "model_grid_{}.pkl".format(tr_process["preprocess"]))

            # save the results from each train process step and return the results from that train in a dictionary
            # that contains: the best score, the best params, the number of folds, and the preprocessing step
            results_dict = save_grid_results(gsvc=gsvc,
                                             class_name=self.class_name,
                                             tr_process=tr_process,
                                             results_path=results_path,
                                             best_process_model_path=best_process_model_path,
                                             logger=self.logger)

            # keep the best result exported from each processing
            self.best_models_list.append(results_dict)

            print(colored("Next train process..", "yellow"))
            # NOTE(review): blank-line separators; whether these two calls sit inside or
            # after the loop could not be recovered from the collapsed source - confirm.
            print()
            print()
        print(colored("Finishing training processes..", "blue"))
        print()

    def export_best_classifier(self):
        """
        Write the best model of all training processes to "best_model_<class_name>.json".

        Raises:
            ValueError: If best_models_list is empty, i.e. nothing has been trained yet.
        """
        if not self.best_models_list:
            raise ValueError("No trained models to export; run train_grid_search_clf() first.")

        # compute the top score once instead of once per compared model
        best_score = max(model["score"] for model in self.best_models_list)
        self.logger.info("This is the max score of all the training processes: {}".format(best_score))

        # On ties the last top-scoring model is the one that is exported.
        best_model = [model for model in self.best_models_list if model["score"] == best_score][-1]

        self.logger.info("Best {} model parameters:".format(self.class_name))
        self.logger.info("{}".format(best_model))
        best_model_name = "best_model_{}.json".format(self.class_name)
        with open(os.path.join(self.exports_path, best_model_name), "w") as best_model_file:
            json.dump(best_model, best_model_file, indent=4)
        self.logger.info("Best {} model parameters saved successfully to disk.".format(self.class_name))
def save_grid_results(gsvc, class_name, tr_process, results_path, best_process_model_path, logger):
    """
    Store the outcome of one grid search on disk and return its summary.

    Two JSON files are written into results_path: the summary dictionary and the content
    of gsvc.cv_results_["params"]. The best estimator is dumped with joblib to
    best_process_model_path.

    Args:
        gsvc: The fitted grid search object; its best_score_, best_params_, cv_results_
            and best_estimator_ attributes are read.
        class_name: Name of the target class, used in the JSON file names.
        tr_process: Training process dictionary; its "preprocess" and "n_fold" keys are read.
        results_path: Directory that receives the two JSON files.
        best_process_model_path: File path that the best estimator is dumped to.
        logger: Logger object used for the confirmation message.

    Returns:
        A dictionary with the keys "score", "params", "n_fold" and "preprocessing".
    """
    preprocess_name = tr_process["preprocess"]
    top_score = gsvc.best_score_

    summary = {
        "score": top_score,
        "params": gsvc.best_params_,
        "n_fold": tr_process['n_fold'],
        "preprocessing": preprocess_name
    }
    summary_file = os.path.join(
        results_path,
        "result_{}_{}_best_{}.json".format(class_name, preprocess_name, top_score)
    )
    with open(summary_file, 'w') as summary_json:
        json.dump(summary, summary_json, indent=4)

    # export the parameters of this training step next to the summary
    params_file = os.path.join(
        results_path,
        "result_{}_{}_params_{}.json".format(class_name, preprocess_name, top_score)
    )
    with open(params_file, 'w') as params_json:
        json.dump(gsvc.cv_results_["params"], params_json, indent=0)

    joblib.dump(gsvc.best_estimator_, best_process_model_path)
    logger.info("Grid Best model for the {} process saved.".format(preprocess_name))

    return summary
def evaluation(config, n_fold, X, y, class_name, tracks, process, exports_path, logger):
    """
    Evaluate the best grid-search model with K-Fold cross-validation and then
    retrain it on the whole dataset.

    The function loads the classifier stored as ``models/model_grid_<process>.pkl``
    under ``exports_path``, runs a folded evaluation, exports the folded
    predictions, accuracies, confusion-matrix dictionaries and reports, and
    finally fits the classifier on all the data and saves it as
    ``best_clf_model.pkl``.

    Args:
        config: The project configuration dictionary; ``"k_fold_shuffle"`` and
            (when shuffling) ``"seed"`` are read here.
        n_fold: Number of folds for the K-Fold split.
        X: The features that are handed to ``Transform`` for post-processing.
        y: The true class values.
        class_name: Name of the class that is evaluated.
        tracks: The track identifiers, aligned with ``X`` and ``y``.
        process: Name of the preprocessing step of the best model.
        exports_path: Root directory of all exports.
        logger: Logger used for progress messages.
    """
    print(colored("------ EVALUATION and FOLDING ------", "yellow"))

    logger.info("---- Folded evaluation of the model in the dataset ----")
    logger.info("number of folds set to config: {}".format(n_fold))
    logger.debug("Sample of shuffled tracks tracks:")
    logger.debug("{}".format(tracks[:5]))
    logger.debug("Tracks list length: {}".format(len(tracks)))

    # load project directory and the corresponding save paths
    dataset_path = os.path.join(exports_path, "dataset")
    models_path = os.path.join(exports_path, "models")
    images_path = os.path.join(exports_path, "images")

    # The best-model summary is still opened and parsed so that a missing or
    # corrupt export fails here, exactly as before; its content is not needed.
    load_best_model_params_score_path = os.path.join(exports_path, "best_model_{}.json".format(class_name))
    with open(load_best_model_params_score_path) as model_params_score_file:
        json.load(model_params_score_file)

    logger.info("Best model preprocessing step: {}".format(process))
    # load the saved classifier
    clf = joblib.load(os.path.join(models_path, "model_grid_{}.pkl".format(process)))
    logger.info("Best model loaded.")

    # inner K-Fold cross-validation declaration: a seed is only meaningful
    # when the folds are shuffled
    shuffle = config["k_fold_shuffle"]
    random_seed = config["seed"] if shuffle is True else None
    logger.info("Fitting the data to the classifier with K-Fold cross-validation..")
    inner_cv = KFold(n_splits=n_fold,
                     shuffle=shuffle,
                     random_state=random_seed)

    # transformation of the data to proper features based on the preprocess step
    features_prepared = Transform(config=config,
                                  df_feats=X,
                                  process=process,
                                  train_class=class_name,
                                  exports_path=exports_path,
                                  logger=logger).post_processing()
    logger.debug("Features prepared shape: {}".format(features_prepared.shape))

    # Starting Training, Predictions for each fold
    logger.info("Starting fold-evaluation..")
    predictions_df_list, accuracy_model, tracks_fold_indexing_dict = predictions_fold(clf=clf,
                                                                                      inner_cv=inner_cv,
                                                                                      feats_prepared=features_prepared,
                                                                                      y=y,
                                                                                      tracks=tracks,
                                                                                      class_name=class_name,
                                                                                      logger=logger)

    # concatenate the folded predictions DFs
    df_predictions = create_dataset_predictions(list_df_predictions=predictions_df_list,
                                                class_name=class_name,
                                                dataset_path=dataset_path,
                                                logger=logger)

    logger.debug("PRINT THE WHOLE GESTURES DF:\n{}".format(df_predictions))

    # list of each column from the dataframe for the folded indexed tracks, y, and predictions
    tracks_folded_list = df_predictions["track"].to_list()
    y_folded_list = df_predictions[class_name].to_list()
    pred_folded_list = df_predictions["predictions"].to_list()

    # export the matrix dictionary from the folded dataset
    folded_matrix_dict = matrix_creation(classes=clf.classes_,
                                         tracks=tracks_folded_list,
                                         y_actual=y_folded_list,
                                         y_hat=pred_folded_list,
                                         logger=logger,
                                         export_save_path=exports_path,
                                         export_name="folded_dataset_results_matrix.json")

    # ACCURACIES for each fold
    export_accuracies(accuracy_model=accuracy_model,
                      config=config,
                      class_name=class_name,
                      exports_path=exports_path,
                      images_path=images_path,
                      logger=logger)

    # Folded Tracks Dictionary --> export also the Folded instances dictionary
    folded_instances_dict = export_folded_instances(tracks_fold_indexing_dict=tracks_fold_indexing_dict,
                                                    class_name=class_name,
                                                    dataset_path=dataset_path,
                                                    logger=logger)

    concat_save_model_instances_matrix_json(instances_dict=folded_instances_dict,
                                            cm_dict=folded_matrix_dict,
                                            exports_path=exports_path,
                                            logger=logger,
                                            export_name="folded_dataset_instances_cm.json")

    simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json",
                                             logger=logger,
                                             export_save_path=exports_path,
                                             export_name="folded_simplified_matrix.json",
                                             write_mode=True)

    logger.info("Simplified CM of the evaluated folded dataset:\n{}".format(simplified_cm))

    # Evaluation to the folded Dataset
    export_evaluation_results(config=config,
                              set_name="Folded",
                              y_true_values=df_predictions[class_name],
                              predictions=df_predictions["predictions"],
                              class_name=class_name,
                              exports_path=exports_path,
                              logger=logger)

    # ---------- TRAIN TO THE WHOLE DATASET WITH THE BEST CLASSIFIER ----------
    logger.info("Train the classifier with the whole dataset..")
    clf.fit(features_prepared, y)
    # prediction for the whole dataset
    predictions_all = clf.predict(features_prepared)
    # save the model that is trained to the whole dataset
    best_model_path = os.path.join(exports_path, "best_clf_model.pkl")
    joblib.dump(clf, best_model_path)
    logger.info("Best model saved.")

    # export the matrix dictionary from the whole dataset.
    # y_actual is the ground truth and y_hat the predictions, the same
    # orientation as the folded call above (they used to be swapped here,
    # which transposed the exported matrix).
    whole_matrix_dict = matrix_creation(classes=clf.classes_,
                                        tracks=tracks,
                                        y_actual=y,
                                        y_hat=predictions_all,
                                        logger=logger,
                                        export_save_path=exports_path,
                                        export_name="whole_dataset_results_matrix.json")

    simplified_cm_whole = simplified_matrix_export(best_result_file="whole_dataset_results_matrix.json",
                                                   logger=logger,
                                                   export_save_path=exports_path,
                                                   export_name="whole_dataset_cm_dict.json",
                                                   write_mode=True)

    logger.info("Simplified CM of the evaluated whole dataset:\n{}".format(simplified_cm_whole))

    concat_save_model_instances_matrix_json(instances_dict=None,
                                            cm_dict=whole_matrix_dict,
                                            exports_path=exports_path,
                                            logger=logger,
                                            export_name="whole_dataset_instances_cm.json")

    # Evaluation to the whole Dataset
    export_evaluation_results(config=config,
                              set_name="Whole",
                              y_true_values=y,
                              predictions=predictions_all,
                              class_name=class_name,
                              exports_path=exports_path,
                              logger=logger)


def concat_save_model_instances_matrix_json(instances_dict, cm_dict, exports_path, logger, export_name):
    """
    Save the best model's folded instances and confusion matrix dictionary merged into one dictionary

    Args:
        instances_dict: The folded-instances dictionary, or a falsy value
            (e.g. None) when there are no folds, as for the whole dataset.
        cm_dict: The confusion-matrix dictionary.
        exports_path: Directory that receives the JSON file.
        logger: Logger used to report the export.
        export_name: File name of the exported JSON.

    Returns:
        None. The merged dictionary is only written to disk.
    """
    if instances_dict:
        # in case of the folded dataset where folds exist; on a key clash the
        # confusion-matrix entry wins
        best_folds_cm_merge_dict = {**instances_dict, **cm_dict}
    else:
        # in case of the whole dataset where no folds exist
        best_folds_cm_merge_dict = cm_dict

    load_file_path = os.path.join(exports_path, export_name)
    with open(load_file_path, "w") as outfile:
        outfile.write(json.dumps(best_folds_cm_merge_dict, indent=4))
    logger.info("Whole folded instaces and matrix dictionary stored successfully.")
def create_fold_predictions(clf, class_name, X_test, test_index, tracks_list, y_test, logger):
    """
    Build the predictions DataFrame of a single fold.

    The resulting frame is indexed by ``test_index`` and holds, column-wise:
    the track, one probability column per entry of ``clf.classes_``, the
    predicted value ("predictions") and the true value (``class_name``).

    Args:
        clf: Fitted classifier exposing ``predict``, ``predict_proba`` and ``classes_``.
        class_name: Label of the column that stores the true values.
        X_test: Features of the fold's test split.
        test_index: Index values used for every row of the fold.
        tracks_list: Tracks that belong to the fold's test split.
        y_test: True values of the fold's test split.
        logger: Logger used for debug output.

    Returns:
        A pandas DataFrame with the predictions at each fold.
    """
    def _indexed_frame(values, column_labels):
        # every partial frame shares the fold's test index so that the
        # column-wise concatenation below aligns row by row
        return pd.DataFrame(data=values, index=test_index, columns=column_labels)

    predicted_frame = _indexed_frame(clf.predict(X_test), ["predictions"])
    probability_frame = _indexed_frame(clf.predict_proba(X_test), clf.classes_)
    tracks_frame = _indexed_frame(tracks_list, ["track"])
    logger.debug("\n{}".format(tracks_frame.head()))
    truth_frame = _indexed_frame(y_test, [class_name])

    # (tracks, predictions' probabilities, predictions, true)
    logger.debug("Concatenating DF..")
    fold_frames = [tracks_frame, probability_frame, predicted_frame, truth_frame]
    return pd.concat(fold_frames, axis=1, ignore_index=False)
def create_dataset_predictions(list_df_predictions, class_name, dataset_path, logger):
    """
    Concatenate the per-fold prediction DataFrames and save them as one CSV.

    Args:
        list_df_predictions: List with the predictions DataFrame of each fold.
        class_name: Name of the class; used in the CSV file name.
        dataset_path: Directory where ``predictions_<class_name>.csv`` is written.
        logger: Logger used for progress and debug messages.

    Returns:
        The concatenated pandas DataFrame.
    """
    import io  # local import: only needed to capture DataFrame.info() output

    logger.info("Make Predictions DataFrame for all the folded instances together.")
    df_concat_predictions = pd.concat(list_df_predictions)
    logger.debug("\n{}".format(df_concat_predictions.head()))
    logger.debug("Info:")
    # DataFrame.info() writes to stdout and returns None, so formatting its
    # return value logged the text "None"; capture the summary in a buffer.
    info_buffer = io.StringIO()
    df_concat_predictions.info(buf=info_buffer)
    logger.debug("\n{}".format(info_buffer.getvalue()))
    # save predictions df
    logger.info("Saving the unified predictions DataFrame locally.")
    df_concat_predictions.to_csv(os.path.join(dataset_path, "predictions_{}.csv".format(class_name)))

    return df_concat_predictions


def create_accuracies_dist_plot(accuracies_list, images_path, logger):
    """
    Plot the accuracy of each fold as a bar chart and save it as an image.

    Args:
        accuracies_list: The accuracy obtained in each fold, in fold order.
        images_path: Directory where ``accuracies_distribution.png`` is saved.
        logger: Logger used for progress messages.
    """
    logger.info("Visualize accuracy for each iteration.")
    # one label per fold: Fold0, Fold1, ...
    list_folds = ["Fold{}".format(fold_number) for fold_number, _ in enumerate(accuracies_list)]
    logger.debug("Exporting accuracies distribution to plot file..")
    scores = pd.DataFrame(accuracies_list, columns=['Scores'])
    sns.set(style="white", rc={"lines.linewidth": 3})
    sns.barplot(x=list_folds, y="Scores", data=scores)
    plt.savefig(os.path.join(images_path, "accuracies_distribution.png"))
    # restore the seaborn defaults and release the figure
    sns.set()
    plt.close()
    logger.info("Plot saved successfully.")
def export_evaluation_results(config, set_name, y_true_values, predictions, class_name, exports_path, logger):
    """
    Export the confusion matrix and the classification report of a dataset.

    Args:
        config: The project configuration, forwarded to ``export_report``.
        set_name: Name of the evaluated set; used in log messages, report
            titles and report file names.
        y_true_values: The true class values.
        predictions: The predicted class values.
        class_name: Name of the evaluated class, forwarded to ``export_report``.
        exports_path: Root directory of the exports, forwarded to ``export_report``.
        logger: Logger used for progress messages.
    """
    logger.info("---- Evaluation to the {} dataset ----".format(set_name))
    # Confusion Matrix
    logger.info("Exporting Confusion Matrix applied to the {} dataset..".format(set_name))
    cm = confusion_matrix(y_true=y_true_values, y_pred=predictions)
    logger.info("\n{}".format(cm))
    # Confusion Matrix Normalized
    logger.info("Exporting Normalized Confusion Matrix applied to the {} dataset..".format(set_name))
    # Each row (true class) is expressed as percentages of that row's total.
    # keepdims keeps the totals as a column so the division runs row-wise;
    # the former `cm / cm.sum(axis=1)` broadcast along columns instead, and
    # relied on the `np.float` alias that NumPy no longer provides.
    row_totals = cm.sum(axis=1, keepdims=True).astype(float)
    cm_normalized = np.divide(cm,
                              row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0) * 100  # rows without samples stay 0
    logger.info("\n{}".format(cm_normalized))
    cm_all = "Actual instances\n{}\n\nNormalized\n{}".format(cm, cm_normalized)
    # export the confusion matrix report
    export_report(config=config,
                  name="{} Data Confusion Matrix".format(set_name),
                  report=cm_all,
                  filename="confusion_matrix_{}".format(set_name),
                  train_class=class_name,
                  exports_path=exports_path)
    # Classification Report
    logger.info("Exporting Classification Report applied to the {} dataset..".format(set_name))
    cr = classification_report(y_true=y_true_values, y_pred=predictions)
    # export the classification report
    export_report(config=config,
                  name="{} Data Classification Report".format(set_name),
                  report=cr,
                  filename="classification_report_{}".format(set_name),
                  train_class=class_name,
                  exports_path=exports_path)
    logger.info("The {} dataset has been evaluated successfully.".format(set_name))
def matrix_creation(classes, tracks, y_actual, y_hat, logger, export_save_path, export_name):
    """
    Build the confusion-matrix dictionary of tracks and export it as JSON.

    The dictionary has the form ``{"matrix": {actual: {predicted: [tracks]}}}``:
    for every class in ``classes`` it lists, per predicted value, the tracks
    whose actual value is that class. Samples whose actual value is not one
    of ``classes`` are ignored.

    Args:
        classes: The classes of the classifier (a numpy array, or any iterable).
        tracks: The track identifiers.
        y_actual: The true values, aligned with ``tracks``.
        y_hat: The predicted values, aligned with ``tracks``.
        logger: Logger used for progress messages.
        export_save_path: Directory that receives the JSON file.
        export_name: File name of the exported JSON.

    Returns:
        The dictionary ``{"matrix": ...}`` that was written to disk.
    """
    def _plain_int(value):
        # NumPy integers of any width are not accepted by json as dictionary
        # keys, so they are turned into built-in ints (previously only
        # np.int64 was converted)
        if isinstance(value, (int, np.integer)):
            return int(value)
        return value

    logger.info("MATRIX DICTIONARY CREATION")
    # classes numpy array to list conversion
    logger.info("CLASSES BEFORE CONVERSION {}".format(type(classes)))
    classes = classes.tolist() if hasattr(classes, "tolist") else list(classes)
    logger.info("CLASSES AFTER CONVERSION: {}".format(type(classes)))
    logger.info("CLASSES: {}".format(classes))

    # every class gets an entry, even when no track belongs to it
    matrix_dict = {}
    for pred_class in classes:
        logger.info("Class process: {}".format(pred_class))
        matrix_dict[pred_class] = {}

    # single pass over the samples instead of one pass per class
    for track, actual, pred in zip(tracks, y_actual, y_hat):
        actual = _plain_int(actual)
        pred = _plain_int(pred)
        class_item_dict = matrix_dict.get(actual)
        if class_item_dict is None:
            continue
        class_item_dict.setdefault(pred, []).append(track)

    logger.info("Matrix classified..")
    matrix_general_dict = {"matrix": matrix_dict}
    logger.debug("The whole matrix dictionary:\n{}".format(matrix_general_dict))

    # Serializing and writing to the JSON file
    load_file_path = os.path.join(export_save_path, export_name)
    with open(load_file_path, "w") as outfile:
        outfile.write(json.dumps(matrix_general_dict, indent=4))
    logger.info("Best results matrix stored successfully.")

    return matrix_general_dict
def export_report(config, name, report, filename, train_class, exports_path):
    """
    Write a text report into the ``reports`` directory of the exports.

    The file ``<filename>.txt`` contains the date of execution, the report
    name and the report itself, separated by blank lines.

    Args:
        config: The project configuration (not used by this function).
        name: Title of the report.
        report: The report content; it is converted with ``str()``.
        filename: Name of the report file, without extension.
        train_class: Name of the class, used in the confirmation message.
        exports_path: Root directory of the exports.
    """
    reports_path = os.path.join(exports_path, "reports")
    # make sure the target directory exists before opening the file
    os.makedirs(reports_path, exist_ok=True)
    # take current datetime
    datetime_str_verbose = datetime.now().isoformat()
    print("Creating report file..")
    # the file is only written, and the context manager closes it
    with open(os.path.join(reports_path, "{}.txt".format(filename)), 'w') as fp:
        fp.write("Date of execution: {}".format(datetime_str_verbose))
        fp.write("\n\n")
        fp.write(str(name))
        fp.write("\n\n")
        fp.write(str(report))
    print(colored("{} file for class {} is created successfully.".format(name, train_class), "cyan"))
def train_class(config, gt_file, c_values, gamma_values, preprocessing_values, log_level):
    """
    Run the whole training of one class from its ground-truth file.

    The ground truth is loaded, the configuration is completed with the class
    name and the user-provided grid values and saved as the project yaml file,
    the features are built and the classification task manager is executed.

    Args:
        config: The project configuration dictionary; it is updated and
            dumped to the project file.
        gt_file: Path of the ground-truth file.
        c_values: C values that override the configuration, if given.
        gamma_values: gamma values that override the configuration, if given.
        preprocessing_values: preprocessing steps that override the
            configuration, if given.
        log_level: Logging level handed to ``setup_logger``.
    """
    exports_path = config["exports_path"]
    ground_truth_data = load_local_ground_truth(gt_file)
    # tracks shuffled and exported
    shuffled_tracks = export_gt_tracks(ground_truth_data, config.get("seed"))

    # class to train
    class_name = ground_truth_data["className"]
    config["class_name"] = class_name
    print("EXPORT CLASS NAME: {}".format(class_name))

    config = update_parameters(config=config,
                               c_values=c_values,
                               gamma_values=gamma_values,
                               preprocessing_values=preprocessing_values)

    logger = setup_logger(exports_path=exports_path,
                          name="train_model_{}".format(class_name),
                          mode="w",
                          level=log_level)

    logger.info("---- TRAINING FOR THE {} MODEL HAS JUST STARTED ----".format(class_name))
    logger.debug("Type of exported GT data exported: {}".format(type(shuffled_tracks)))

    # name the project file: fall back to "project_<class>" when none is configured
    configured_name = config["project_file"]
    if configured_name is not None:
        project_file_name = "{}.yaml".format(configured_name)
    else:
        project_file_name = "{}_{}.yaml".format("project", class_name)
    logger.info("Project yaml file name: {}".format(project_file_name))

    # save the project file
    with open(os.path.join(exports_path, project_file_name), "w") as project_file:
        yaml.dump(config, project_file)

    print("First N sample of shuffled tracks: \n{}".format(shuffled_tracks[:4]))

    # create the exports with the features DF, labels, and tracks together
    features, labels, tracks = create_df_tracks(config=config,
                                                tracks_list=shuffled_tracks,
                                                train_class=class_name,
                                                exports_path=exports_path,
                                                logger=logger)
    logger.debug("Types of exported files from GT:")
    logger.debug("Type of features: {}".format(type(features)))
    logger.debug("Type of labels: {}".format(type(labels)))
    logger.debug("Type of Tracks: {}".format(type(tracks)))

    task_manager = ClassificationTaskManager(config=config,
                                             train_class=class_name,
                                             X=features,
                                             y=labels,
                                             tracks=tracks,
                                             exports_path=exports_path,
                                             logger=logger)
    classification_time = task_manager.apply_processing()

    finished_message = "Classification ended successfully in {} minutes.".format(classification_time)
    print(colored(finished_message, "green"))
    logger.info(finished_message)
def setup_logger(exports_path, name, mode, level=logging.INFO):
    """
    Function to set up as many loggers as you want. It exports the logging results to a file
    in the relevant path that is determined by the configuration file.

    Handlers that a previous call attached to the logger with the same name
    are removed and closed first, so messages are not duplicated and the old
    log file handle is released.

    Args:
        exports_path: The path (str) the logging exports will be exported.
        name: The name (str) of the logger.
        mode: The mode (str) translated in write, append. Valid values ("w", "a")
        level: The level (int) of the logging. Defaults to logging.INFO.
            None is treated as logging.INFO.

    Returns:
        The logger object.
    """
    logs_path = create_directory(exports_path, "logs")

    # Create a custom logger
    logger = logging.getLogger(name)

    # Drop handlers left over from an earlier initialization *before* opening
    # the new log file, and close them: clearing the list alone would leave
    # the old file handle open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Create handlers
    c_handler = logging.StreamHandler()
    f_handler = logging.FileHandler(os.path.join(logs_path, "{}.log".format(name)), mode=mode)

    # Create formatters and add them to the handlers
    c_handler.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s'))
    f_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

    # Add handlers to the logger
    logger.addHandler(c_handler)
    logger.addHandler(f_handler)

    logger.setLevel(logging.INFO if level is None else level)

    return logger
def create_directory(exports_path, directory):
    """
    Make sure that ``directory`` exists inside ``exports_path``.

    Args:
        exports_path: The parent path.
        directory: The name of the directory to create below the parent path.

    Returns:
        The joined path; it is created (with its parents) when missing and
        simply returned when it already exists.
    """
    target_path = os.path.join(exports_path, directory)
    os.makedirs(target_path, exist_ok=True)
    return target_path


def change_weights_val(i):
    """
    Translate a boolean of the "balance_classes" configuration list to the
    value that the classifier's parameter grid expects:
        * True --> "balanced"
        * False --> None

    Args:
        i: The value inserted.

    Returns:
        "balanced" for True, None for False, and any other value unchanged.
    """
    # only genuine booleans are translated; everything else passes through
    if isinstance(i, bool):
        return "balanced" if i else None
    return i
def create_classification_project(ground_truth_file, dataset_dir, project_file=None, exports_path=None,
                                  c_values=None, gamma_values=None, preprocessing_values=None,
                                  seed=None, jobs=-1, verbose=1, logging="INFO"):
    """
    Create the project configuration from the template and train the class.

    Args:
        ground_truth_file: The path (str) to the groundtruth yaml file of the dataset. It is required.
        dataset_dir: The path to main datasets_dir containing the .json files.
        project_file: The name (str) of the project configuration yaml file that
            will be created. Default: None. If None, the tool will create
            automatically a project file name in form of "project_CLASS_NAME",
            where CLASS_NAME is the target class as referred to the groundtruth data.
        exports_path: The path (str) where the results of the classification project will be saved to.
            Default: None. If None, the current working directory is used.
        c_values: C values that replace the ones of the template. Default: None (keep the template).
        gamma_values: gamma values that replace the ones of the template. Default: None (keep the template).
        preprocessing_values: preprocessing steps that replace the ones of the template.
            Default: None (keep the template).
        seed: The seed (int) of the random shuffle generator. Default: None.
            If None, the current clock value (in whole seconds) is used.
        jobs: The cores (int) that will be exploited during the training phase.
            Default: -1. If -1, all the available cores will be used.
        verbose: The verbosity (int) of the printed messages where this function
            is available (for example in sklearn's GridSearch algorithm). Default: 1.
            The higher the number the higher the verbosity.
        logging: The level (str) of the logging prints. Default: "INFO".
            Available values: DEBUG, INFO, WARNING, ERROR, CRITICAL.

    Raises:
        TypeError: If the configuration template does not contain a mapping.
    """
    try:
        path_template = os.path.dirname(os.path.realpath(__file__))
        project_template = load_yaml(path_template, "configuration_template.yaml")
    except Exception as e:
        print('Unable to open project configuration template:', e)
        raise

    # load_yaml returns None when the file content is not a dictionary; fail
    # with a clear message instead of an item-assignment error further below
    if project_template is None:
        raise TypeError("The project configuration template does not contain a mapping.")

    print("-------------------------------------------------------")
    print()
    if seed is None:
        # the seed is documented as an integer, so the clock value is truncated
        seed = int(time.time())

    print("Seed argument: {}".format(seed))

    project_template["ground_truth_file"] = ground_truth_file
    project_template["dataset_dir"] = dataset_dir
    project_template["project_file"] = project_file
    project_template["logging_level"] = logging
    project_template["seed"] = seed
    project_template["parallel_jobs"] = jobs
    project_template["verbose"] = verbose

    # if empty, path is declared as the current working directory
    if exports_path is None:
        exports_path = os.getcwd()

    print("Exports path: {}".format(exports_path))
    project_template["exports_path"] = exports_path

    print()
    print()
    print("-------------------------------------------------------")

    print("Loading GroundTruth yaml file:", ground_truth_file)
    train_class(project_template, ground_truth_file, c_values, gamma_values, preprocessing_values, logging)
clock value, otherwise specify a number + +# PRE-PROCESSING +# List of parameters that have to be excluded before applying the transformation steps +excludedDescriptors: [ 'metadata.tags*' ] +# List of preprocessed datasets to build +processing: + # it is possible to not apply any processing, although this is + # of little value in real-life tests and evaluations + raw: [] + + basic: + - transfo: remove + params: { descriptorNames: &unusedDescs [ 'metadata.*', '*dmean*', '*dvar*', + '*.min', '*.max', '*cov', + 'tonal.thpcp', # because of division by zero + 'lowlevel.spectral_energyband_high.*', # 0 for low samplerate + 'lowlevel.silence_rate*' # funky behavior in general + ] } + - transfo: enumerate + params: { descriptorNames: &stringDescs [ # 'rhythm.perceptual_tempo', # removed from new extractor + 'tonal.chords_key', 'tonal.chords_scale', + 'tonal.key_key', 'tonal.key_scale' ] } + + lowlevel: + # note that the order of the transformations is important! + - transfo: remove + params: { descriptorNames: *unusedDescs } + - transfo: enumerate + params: { descriptorNames: *stringDescs } + - transfo: select + params: { descriptorNames: ['lowlevel*'] } + + nobands: + - transfo: remove + params: { descriptorNames: *unusedDescs } + - transfo: enumerate + params: { descriptorNames: *stringDescs } + - transfo: remove + params: { descriptorNames: [ 'barkbands*', '*energyband*', 'melbands*', 'erbbands*' ] } + + normalized: + - transfo: remove + params: { descriptorNames: *unusedDescs } + - transfo: enumerate + params: { descriptorNames: *stringDescs } + - transfo: normalize # MinMax Scale + + gaussianized: + - transfo: remove + params: { descriptorNames: *unusedDescs } + - transfo: enumerate + params: { descriptorNames: *stringDescs } + - transfo: normalize # MinMax Scale + - transfo: gaussianize # QuantileTransformer + params: { descriptorNames: ['lowlevel.*'] } + +# mfcc: +# # an MFCC only baseline +# - transfo: remove +# params: { descriptorNames: *unusedDescs } +# -
transfo: enumerate +# params: { descriptorNames: *stringDescs } +# - transfo: select +# params: { descriptorNames: ['lowlevel.mfcc*'] } + +## ML SETTINGS +# train kind: grid, svm, deep_learning, supervised_lb +train_kind: grid +k_fold_shuffle: False + +# GRID ML SETTINGS +# PCA number of best components +pca_n_components: .95 +parallel_jobs: # set to -1 if to exploit all processors. Set to null to exploit only 1 processor +verbose: # 0: no verbose, 1: simple information about the tasks completed, 2: full information of all the tasks + +# NEURAL NETWORK SETTINGS +# + +# List of classifiers to be trained +classifiers: + svm: + # first svm test combinations +# - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized', 'mfcc' ] + - preprocessing: [ 'basic', 'lowlevel', 'nobands', 'normalized', 'gaussianized'] + type: [ 'C-SVC' ] + kernel: [ 'poly', 'RBF' ] + C: [ -5, -3, -1, 1, 3, 5, 7, 9, 11 ] # will actually be 2**x + gamma: [ 3, 1, -1, -3, -5, -7, -9, -11 ] # will actually be 2**x + # if True, weight classes based on the number of elements + balance_classes: [False, True] + # descriptorNames: [ ['*.mean', '*.var'] ] + # more svm params combinations + # ... 
class Predict:
    """Predict the class of a single track with a project's best trained model.

    On construction the best-model description of the project is read from
    ``best_model_<class_name>.json`` inside the exports path. Calling
    ``preprocessing()`` then prepares the track features, loads the pickled
    classifier and returns the predictions.

    Args:
        config: Project configuration mapping. ``"class_name"`` and
            ``"exports_path"`` are read here; the whole mapping is also
            handed to ``TransformPredictions``.
        track_low_level: Low-level data of the track (nested dict).
            NOTE: ``rhythm.beats_position`` is deleted from it in place.
        log_level: Logging level handed to ``setup_logger``.
    """

    def __init__(self, config, track_low_level, log_level):
        self.config = config
        self.track_low_level = track_low_level
        self.log_level = log_level

        self.class_name = ""
        self.exports_path = ""
        self.best_model = ""
        self.track_feats = dict()

        self.load_best_model()
        # The real logger is only created inside preprocessing().
        self.logger = ""
        self.df_track = pd.DataFrame()
        self.list_track = []

    def load_best_model(self):
        """Read class name, exports path and the best-model JSON of the project.

        Raises:
            KeyError: if ``class_name`` or ``exports_path`` is missing in the config.
            OSError: if the best-model JSON file cannot be opened.
        """
        self.class_name = self.config["class_name"]
        self.exports_path = self.config["exports_path"]

        best_model_path = os.path.join(self.exports_path,
                                       "best_model_{}.json".format(self.class_name))
        with open(best_model_path) as json_file:
            self.best_model = json.load(json_file)

    def preprocessing(self):
        """Transform the track features and run the stored classifier on them.

        Returns:
            A list with one dict per predicted row. Each dict holds the
            predicted label under the class name, the highest probability
            under ``"score"`` and the per-class probabilities under
            ``"probabilities"``.
        """
        self.logger = setup_logger(
            exports_path=self.exports_path,
            name="predict_{}".format(self.class_name),
            mode="w",
            level=self.log_level
        )

        self.logger.info("Best model:")
        self.logger.info(self.best_model)

        self.logger.info("FLATTENING:")
        try:
            if 'beats_position' in self.track_low_level['rhythm']:
                del self.track_low_level['rhythm']['beats_position']
        except (KeyError, TypeError) as e:
            # The exception must go through a %s placeholder: passing it as a
            # bare extra argument makes the logging module fail to format the
            # record and the warning is lost.
            self.logger.warning("There is no 'rhythm' key in the low level data. Exception: %s", e)

        # data dictionary transformed to a fully flattened dictionary
        self.track_feats = dict(flatten_dict_full(self.track_low_level))
        list_track = [self.track_feats]
        self.logger.debug("DICT TO DATAFRAME:")
        self.df_track = pd.DataFrame(data=list_track, columns=list_track[0].keys())
        self.logger.debug("TYPE of track structure: {}".format(type(self.df_track)))

        self.logger.info("PROCESSING:")
        features_prepared = TransformPredictions(config=self.config,
                                                 df_feats=self.df_track,
                                                 process=self.best_model["preprocessing"],
                                                 train_class=self.class_name,
                                                 exports_path=self.exports_path,
                                                 logger=self.logger
                                                 ).post_processing()
        self.logger.debug("Features shape after preparation: {}".format(features_prepared.shape))

        # load the best model that is trained to the whole dataset
        # NOTE(review): joblib.load unpickles the file; only load model files
        # that were produced by a trusted training run.
        best_model_path = os.path.join(self.exports_path, "best_clf_model.pkl")
        clf_loaded = joblib.load(best_model_path)
        predicted = clf_loaded.predict(features_prepared)
        predicted_prob = clf_loaded.predict_proba(features_prepared)
        self.logger.info("Prediction: {}".format(predicted))
        self.logger.info("Classes: {}".format(clf_loaded.classes_))
        self.logger.info("Prediction probabilities: {}".format(predicted_prob))

        predict_list = []
        for pred, pred_probability in zip(predicted, predicted_prob):
            predict_list.append({
                self.class_name: pred,
                "score": max(pred_probability),
                "probabilities": dict(zip(clf_loaded.classes_, pred_probability)),
            })

        self.logger.info("Predictions for the track:")
        self.logger.info("{}".format(predict_list))
        self.logger.debug("Output (Return) predict_list")

        return predict_list


def prediction(exports_path, project_file, mbid, log_level="INFO"):
    """Fetch a track's low-level data from AcousticBrainz and classify it.

    Args:
        exports_path: Directory that holds the project configuration yaml.
        project_file: Project file name without the ``.yaml`` extension.
        mbid: Identifier inserted into the AcousticBrainz low-level API URL.
        log_level: Logging level passed on to ``Predict``.

    Returns:
        The prediction list produced by ``Predict.preprocessing()``
        (previously the result was computed and then discarded).

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    try:
        project_data = load_yaml(exports_path, "{}.yaml".format(project_file))
    except Exception as e:
        print('Unable to open project configuration file:', e)
        raise

    url_api = "https://acousticbrainz.org/api/v1/{}/low-level".format(mbid)
    response = requests.get(url=url_api)
    # Fail with a clear HTTP error instead of a KeyError on the error payload.
    response.raise_for_status()
    track_low_level_data = response.json()

    # Tags are optional: print the ones that exist rather than crashing.
    tags = track_low_level_data.get("metadata", {}).get("tags", {})
    for tag_name, label in (("artist", "Artist:"), ("album", "Album:"), ("title", "Title:")):
        values = tags.get(tag_name)
        if values and values[0]:
            print(label, values[0])

    prediction_track = Predict(config=project_data,
                               track_low_level=track_low_level_data,
                               log_level=log_level
                               )
    return prediction_track.preprocessing()
def load_local_ground_truth(gt_filename):
    """Load the ground truth yaml file.

    The ground truth data contains the tracks and the corresponding labels
    they belong to. The path to the related tracks' low-level data (features
    in JSON format) can be extracted from this file too.

    Args:
        gt_filename: Path of the ground truth yaml file.

    Returns:
        The parsed yaml content, or ``None`` when the file is not valid yaml
        (the parser error is printed in that case).
    """
    with open(gt_filename, "r") as stream:
        try:
            ground_truth_data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print("Error in loading the ground truth file.")
            print(exc)
            return None
    print("Ground truth file loaded.")
    return ground_truth_data


def export_gt_tracks(ground_truth_data, seed):
    """Return the ground truth tracks as a shuffled list of (track, label) tuples.

    The module-level random generator is seeded with ``seed`` before the
    shuffle, so the same seed gives the same order.

    Args:
        ground_truth_data: Mapping with a ``"groundTruth"`` dict of track -> label.
        seed: Seed for ``random.seed``.

    Returns:
        A list of tuples with the tracks and their corresponding labels.
    """
    tracks_list = list(ground_truth_data["groundTruth"].items())
    # "cyan" belongs to colored(), not to str.format(), which silently
    # ignored it and left the message uncoloured.
    print(colored("SEED is set to: {}".format(seed), "cyan"))
    random.seed(a=seed)
    random.shuffle(tracks_list)
    print("Listed tracks in GT file: {}".format(len(tracks_list)))
    return tracks_list


def create_df_tracks(config, tracks_list, train_class, exports_path, logger):
    """Match ground truth tracks with low-level JSON files and build the datasets.

    Every ``.json`` file below ``config["dataset_dir"]`` (resolved against the
    current working directory) is collected. A track is kept when its name is
    a substring of a collected path. The kept tracks are exported to
    ``tracks_<train_class>_shuffled.csv`` under ``tracks_csv_format``.

    Args:
        config: Project configuration; ``"dataset_dir"`` is read.
        tracks_list: List of (track, label) tuples.
        train_class: Name of the label column.
        exports_path: Directory under which the CSV folder is created.
        logger: Logger used for progress messages.

    Returns:
        ``(features DataFrame, labels array, track names array)``, or
        ``(None, None, None)`` when no JSON file is found or none of them
        matches a ground truth track.
    """
    import io  # local import: only needed to capture DataFrame.info() output

    logger.info("---- EXPORTING FEATURES - LABELS - TRACKS ----")
    dataset_dir = config.get("dataset_dir")
    print('DATASET-DIR', dataset_dir)
    dataset_root = os.path.join(os.getcwd(), dataset_dir)
    low_level_list = []
    for dirpath, _dirnames, filenames in os.walk(dataset_root):
        low_level_list += [os.path.join(dirpath, file) for file in filenames if file.endswith(".json")]
    if not low_level_list:
        logger.error("No low-level data found.")
        return None, None, None

    logger.info("Low-level features for the tracks found.")
    # Single pass over track/path pairs: both result lists are index-aligned
    # by construction (same iteration order as the two original comprehensions).
    matches = [(e, i) for e in tracks_list for i in low_level_list if e[0] in i]
    tracks_existing_list = [e for e, _ in matches]
    tracks_existing_path_list = [i for _, i in matches]
    logger.debug("tracks existed found: {}".format(len(tracks_existing_list)))
    logger.debug("tracks_path existed found: {}".format(len(tracks_existing_path_list)))
    if not tracks_existing_list:
        # random.randrange(0) below would raise ValueError on an empty list.
        logger.error("None of the ground truth tracks has a matching low-level file.")
        return None, None, None
    logger.debug("{}".format(tracks_existing_list[:4]))
    logger.debug("{}".format(tracks_existing_path_list[:4]))
    logger.debug("The founded tracks tracks listed successfully.")
    logger.debug("Generate random number within a given range of listed tracks:")
    # Random number between 0 and length of listed tracks
    random_num = random.randrange(len(tracks_existing_list))
    logger.debug("Check if the tracks are the same in the same random index in both lists")
    logger.debug("{}".format(tracks_existing_list[random_num]))
    logger.debug("{}".format(tracks_existing_path_list[random_num]))

    # create the dataframe with tracks that are both in low-level files and the GT file
    df_tracks = pd.DataFrame(data=tracks_existing_list, columns=["track", train_class])
    logger.debug("Shape of tracks DF created before cleaning: {}".format(df_tracks.shape))
    logger.debug("Check the shape of a temporary DF that includes if there are any NULL values:")
    null_rows = df_tracks.isnull().any(axis=1)
    logger.debug("{}".format(df_tracks[null_rows].shape))

    logger.debug("Drop rows with NULL values if they exist..")
    if null_rows.any():
        # Drop the same rows from the path list, otherwise the feature rows
        # no longer line up with the labels returned below.
        tracks_existing_path_list = [
            path for path, is_null in zip(tracks_existing_path_list, null_rows) if not is_null
        ]
        df_tracks.dropna(inplace=True)
        logger.debug("Check if there are NULL values after the cleaning process:")
        logger.debug("{}".format(df_tracks[df_tracks.isnull().any(axis=1)].shape))
        logger.debug("Re-index the tracks DF..")
        df_tracks = df_tracks.reset_index(drop=True)
    else:
        logger.info("There are no NULL values found.")

    # export shuffled tracks to CSV format
    tracks_path = create_directory(exports_path, "tracks_csv_format")
    df_tracks.to_csv(os.path.join(tracks_path, "tracks_{}_shuffled.csv".format(train_class)))
    logger.debug("DF INFO:")
    # DataFrame.info() writes to a buffer and returns None, so capture it
    # instead of logging the string "None".
    info_buffer = io.StringIO()
    df_tracks.info(buf=info_buffer)
    logger.debug("{}".format(info_buffer.getvalue()))
    logger.debug("COLUMNS CONTAIN OBJECTS: {}".format(
        df_tracks.select_dtypes(include=['object']).columns))

    df_feats = create_low_level_features_df(tracks_existing_path_list, logger)

    y = df_tracks[train_class].values
    logger.info("Features, Labels, and Tracks are exported successfully..")
    return df_feats, y, df_tracks["track"].values
def create_low_level_features_df(list_path_tracks, logger):
    """Create the low-level features DataFrame for a list of JSON files.

    Each file is loaded, ``rhythm.beats_position`` is removed when present,
    and the remaining data is flattened into one row. The columns are the
    flattened keys of the first track.

    Args:
        list_path_tracks: Paths of the low-level JSON files, one per track.
        logger: Logger used for progress and error messages.

    Returns:
        The low-level features (pandas DataFrame) of all the tracks, one row
        per path in input order; an empty DataFrame for an empty path list.

    Raises:
        Exception: any error raised while opening or parsing a file is logged
            and re-raised. Continuing would either hit an undefined variable
            or silently reuse the previous track's data for this row.
    """
    logger.info("---- CREATE LOW LEVEL DATAFRAME ----")

    list_feats_tracks = []

    for track_low_level_path in list_path_tracks:
        try:
            with open(track_low_level_path) as f:
                data_feats_item = json.load(f, strict=False)
        except Exception:
            logger.error("Exception occurred in loading file: %s", track_low_level_path, exc_info=True)
            raise
        # remove unnecessary features data
        try:
            if 'beats_position' in data_feats_item['rhythm']:
                del data_feats_item['rhythm']['beats_position']
        except KeyError:
            logger.error("There is no 'rhythm' key in the low level data.", exc_info=True)

        # data dictionary transformed to a fully flattened dictionary
        list_feats_tracks.append(dict(flatten_dict_full(data_feats_item)))

    if not list_feats_tracks:
        # Nothing to index for the column names; return an empty frame
        # instead of failing with IndexError.
        logger.warning("No low-level files given; returning an empty DataFrame.")
        return pd.DataFrame()

    df_feats_tracks = pd.DataFrame(list_feats_tracks, columns=list(list_feats_tracks[0].keys()))
    logger.debug("COLUMNS CONTAIN OBJECTS: \n{}".format(
        df_feats_tracks.select_dtypes(include=['object']).columns))
    logger.info("Exporting low-level data (DataFrame)..")
    return df_feats_tracks
# avoid the module's method call deprecation
try:
    import six
    collectionsAbc = six.moves.collections_abc
except (ImportError, AttributeError):
    import collections as collectionsAbc


class Transform:
    """Fit and store the preprocessing pipelines of one ``processing`` entry.

    Args:
        config: Project configuration; ``"excludedDescriptors"`` and
            ``"processing"`` are read.
        df_feats: DataFrame with the flattened low-level features.
        process: Name of the processing entry (``basic``, ``lowlevel``,
            ``mfcc``, ``nobands``, ``normalized`` or ``gaussianized``).
        train_class: Name of the target class (stored, not used here).
        exports_path: Directory whose ``models`` sub-folder receives the
            pickled pipelines.
        logger: Logger used for progress messages.
    """

    def __init__(self, config, df_feats, process, train_class, exports_path, logger):
        self.config = config
        self.df_feats = df_feats
        self.process = process
        self.train_class = train_class
        self.exports_path = exports_path
        self.logger = logger

        self.list_features = []
        self.feats_cat_list = []
        self.feats_num_list = []
        self.df_cat = pd.DataFrame()
        self.df_num = pd.DataFrame()

        self.feats_prepared = []

    def post_processing(self):
        """Select the features, fit the pipeline(s) of the process and pickle them.

        The first step of the process is applied when it is ``remove`` and the
        second when it is ``enumerate``; the remaining steps depend on the
        process name. For ``gaussianized`` ``self.df_feats`` is replaced by
        the normalized features.

        Returns:
            The transformed feature array, or the initial empty list when the
            process name is not one of the handled ones.
        """
        print(colored("PROCESS: {}".format(self.process), "cyan"))
        self.logger.debug("PROCESS: {}".format(self.process))
        steps = self.config["processing"][self.process]
        self.logger.debug("Process: {}".format(steps))

        self.list_features = list(self.df_feats.columns)

        models_path = os.path.join(self.exports_path, "models")

        # clean list
        print(colored("Cleaning..", "yellow"))
        self.logger.info("Cleaning..")
        cleaning_conf_list = list_descr_handler(self.config["excludedDescriptors"])
        feats_clean = set(feats_selector_list(self.df_feats.columns, cleaning_conf_list))
        self.list_features = [x for x in self.df_feats.columns if x not in feats_clean]
        self.logger.debug("List after cleaning some feats: {}".format(len(self.list_features)))

        # remove list
        print(colored("Removing unnecessary features..", "yellow"))
        self.logger.info("Removing unnecessary features..")
        if steps[0]["transfo"] == "remove":
            remove_list = list_descr_handler(steps[0]["params"]["descriptorNames"])
            feats_remove = set(feats_selector_list(self.df_feats.columns, remove_list))
            self.list_features = [x for x in self.list_features if x not in feats_remove]
            self.logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features)))

        # enumerate list
        print(colored("Split numerical / categorical features..", "yellow"))
        if steps[1]["transfo"] == "enumerate":
            enumerate_list = list_descr_handler(steps[1]["params"]["descriptorNames"])
            self.feats_cat_list = feats_selector_list(self.list_features, enumerate_list)
            self.logger.debug("Enumerating feats: {}".format(self.feats_cat_list))
            self.feats_num_list = [x for x in self.list_features if x not in self.feats_cat_list]
            self.logger.debug("List Num feats: {}".format(len(self.feats_num_list)))
            self.logger.debug("List Cat feats: {}".format(len(self.feats_cat_list)))

        pipeline_path = os.path.join(models_path, "full_pipeline_{}.pkl".format(self.process))

        if self.process == "basic":
            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
            self.feats_prepared = self._fit_and_save_union(self._numeric_steps(), pipeline_path)

        elif self.process in ("lowlevel", "mfcc"):
            # the third step selects the numerical features to keep
            sel_list = list_descr_handler(steps[2]["params"]["descriptorNames"])
            self.feats_num_list = feats_selector_list(self.feats_num_list, sel_list)
            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
            self.feats_prepared = self._fit_and_save_union(self._numeric_steps(), pipeline_path)

        elif self.process == "nobands":
            # the third step lists further numerical features to drop
            sel_list = list_descr_handler(steps[2]["params"]["descriptorNames"])
            feats_rem_list = feats_selector_list(self.df_feats.columns, sel_list)
            self.feats_num_list = [x for x in self.feats_num_list if x not in feats_rem_list]
            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
            self.feats_prepared = self._fit_and_save_union(self._numeric_steps(), pipeline_path)

        elif self.process == "normalized":
            self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
            self.feats_prepared = self._fit_and_save_union(self._numeric_steps(scale=True), pipeline_path)

        elif self.process == "gaussianized":
            self.feats_prepared = self._fit_gaussianized(steps, models_path)

        return self.feats_prepared

    def _numeric_steps(self, scale=False, selector_name="selector"):
        """Return the steps of the numerical pipeline, optionally min-max scaled."""
        num_steps = [(selector_name, DataFrameSelector(self.feats_num_list))]
        if scale:
            num_steps.append(("minmax_scaler", MinMaxScaler()))
        return num_steps

    def _fit_and_save_union(self, num_steps, pipeline_path):
        """Fit the numerical + one-hot categorical union on ``self.df_feats`` and pickle it."""
        cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(self.feats_cat_list)),
            ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])
        full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", Pipeline(num_steps)),
            ("cat_pipeline", cat_pipeline)
        ])
        prepared = full_pipeline.fit_transform(self.df_feats)

        # save pipeline (make sure the target folder exists first)
        os.makedirs(os.path.dirname(pipeline_path), exist_ok=True)
        joblib.dump(full_pipeline, pipeline_path)
        return prepared

    def _fit_gaussianized(self, steps, models_path):
        """Fit the normalize stage, then the quantile-transform stage, and pickle both."""
        gauss_list = list_descr_handler(steps[3]["params"]["descriptorNames"])
        feats_num_gauss_list = feats_selector_list(self.feats_num_list, gauss_list)
        feats_num_no_gauss_list = [x for x in self.feats_num_list if x not in feats_num_gauss_list]

        self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
        self.logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list)))
        self.logger.debug("List post-Num-No-Gauss feats: {}".format(len(feats_num_no_gauss_list)))

        # stage 1: min-max scaling of the numerical features + one-hot encoding
        normalized = self._fit_and_save_union(
            self._numeric_steps(scale=True, selector_name="selector_num"),
            os.path.join(models_path, "full_normalize_pipeline_{}.pkl".format(self.process)))
        self.logger.debug("Feats prepared normalized shape: {}".format(normalized.shape))

        # Back to a DataFrame: the leading columns get the numerical feature
        # names, the remaining (one-hot) columns keep their integer labels.
        self.df_feats = pd.DataFrame(data=normalized)
        columns = list(self.df_feats.columns)
        select_no_rename_list = columns[len(self.feats_num_list):]
        self.logger.debug("Selected no rename list: {}".format(select_no_rename_list))
        new_feats_columns = self.feats_num_list + select_no_rename_list
        self.df_feats.columns = new_feats_columns
        self.logger.debug("Normalized Features DF:")
        self.logger.debug("\n{}".format(self.df_feats))
        self.logger.debug("Shape: {}".format(self.df_feats.shape))

        feats_no_gauss_list = [x for x in new_feats_columns if x not in feats_num_gauss_list]

        # stage 2: quantile transform of the selected features only
        # NOTE(review): QuantileTransformer is used with its default
        # output_distribution - confirm that is the intended "gaussianize".
        num_gauss_pipeline = Pipeline([
            ("gauss_sel_num", DataFrameSelector(feats_num_gauss_list)),
            ("gauss_scaler", QuantileTransformer(n_quantiles=1000))
        ])
        num_no_gauss_pipeline = Pipeline([
            ("gauss_sel_num", DataFrameSelector(feats_no_gauss_list))
        ])
        full_gauss_pipeline = FeatureUnion(transformer_list=[
            ("num_gauss_pipeline", num_gauss_pipeline),
            ("num_no_gauss_pipeline", num_no_gauss_pipeline)
        ])

        prepared = full_gauss_pipeline.fit_transform(self.df_feats)

        # save pipeline
        joblib.dump(full_gauss_pipeline,
                    os.path.join(models_path, "full_gauss_pipeline_{}.pkl".format(self.process)))
        return prepared


# Create a class to select numerical or categorical columns
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns the values of the given DataFrame columns."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return the selector itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names].values``."""
        return X[self.attribute_names].values
# avoid the module's method call deprecation
try:
    import six
    collectionsAbc = six.moves.collections_abc
except (ImportError, AttributeError):
    import collections as collectionsAbc


class TransformPredictions:
    """Apply the stored preprocessing pipelines of a process to new features.

    Args:
        config: Project configuration; ``"excludedDescriptors"`` and
            ``"processing"`` are read.
        df_feats: DataFrame with the flattened low-level features.
        process: Name of the processing entry (``basic``, ``lowlevel``,
            ``mfcc``, ``nobands``, ``normalized`` or ``gaussianized``).
        train_class: Name of the target class (stored, not used here).
        exports_path: Directory whose ``models`` sub-folder holds the pickled
            pipelines.
        logger: Logger used for progress messages.
    """

    _KNOWN_PROCESSES = ("basic", "lowlevel", "mfcc", "nobands", "normalized", "gaussianized")

    def __init__(self, config, df_feats, process, train_class, exports_path, logger):
        self.config = config
        self.df_feats = df_feats
        self.process = process
        self.train_class = train_class
        self.exports_path = exports_path
        self.logger = logger
        self.list_features = []
        self.feats_cat_list = []
        self.feats_num_list = []

        self.feats_prepared = []

    def post_processing(self):
        """Rebuild the feature lists and transform the features with the saved pipelines.

        For ``gaussianized`` ``self.df_feats`` is replaced by the normalized
        features before the second pipeline is applied.

        Returns:
            The transformed feature array, or the initial empty list when the
            process name is not one of the handled ones.
        """
        print(colored("PROCESS: {}".format(self.process), "cyan"))

        self.logger.debug("Track Features - Low Level: {}".format(self.df_feats))
        self.logger.debug("Shape of DF: {}".format(self.df_feats.shape))

        self.list_features = list(self.df_feats.columns)
        steps = self.config["processing"][self.process]

        models_path = os.path.join(self.exports_path, "models")

        # clean list
        print(colored("Cleaning..", "yellow"))
        cleaning_conf_list = list_descr_handler(self.config["excludedDescriptors"])
        self.logger.debug("cleaning list: {}".format(cleaning_conf_list))
        feats_clean = set(feats_selector_list(self.df_feats.columns, cleaning_conf_list))
        self.list_features = [x for x in self.df_feats.columns if x not in feats_clean]
        self.logger.debug("List after cleaning some feats: {}".format(len(self.list_features)))

        # remove list
        print(colored("Removing unnecessary features..", "yellow"))
        if steps[0]["transfo"] == "remove":
            remove_list = list_descr_handler(steps[0]["params"]["descriptorNames"])
            feats_remove = set(feats_selector_list(self.df_feats.columns, remove_list))
            self.list_features = [x for x in self.list_features if x not in feats_remove]
            self.logger.debug("List after removing unnecessary feats: {}".format(len(self.list_features)))

        # enumerate list (the message used to be a copy of the "remove" one)
        print(colored("Split numerical / categorical features..", "yellow"))
        if steps[1]["transfo"] == "enumerate":
            enumerate_list = list_descr_handler(steps[1]["params"]["descriptorNames"])
            self.feats_cat_list = feats_selector_list(self.list_features, enumerate_list)
            self.logger.debug("Enumerating feats: {}".format(self.feats_cat_list))
            self.feats_num_list = [x for x in self.list_features if x not in self.feats_cat_list]
            self.logger.debug("List Num feats: {}".format(len(self.feats_num_list)))
            self.logger.debug("List Cat feats: {}".format(len(self.feats_cat_list)))

        if self.process not in self._KNOWN_PROCESSES:
            return self.feats_prepared

        print(colored("Process doing: {}".format(self.process), "green"))

        if self.process == "gaussianized":
            self.feats_prepared = self._transform_gaussianized(steps, models_path)
            return self.feats_prepared

        if self.process in ("lowlevel", "mfcc"):
            # the third step selects the numerical features to keep
            sel_list = list_descr_handler(steps[2]["params"]["descriptorNames"])
            self.feats_num_list = feats_selector_list(self.feats_num_list, sel_list)
        elif self.process == "nobands":
            # the third step lists further numerical features to drop
            sel_list = list_descr_handler(steps[2]["params"]["descriptorNames"])
            feats_rem_list = feats_selector_list(self.df_feats.columns, sel_list)
            self.feats_num_list = [x for x in self.feats_num_list if x not in feats_rem_list]
        self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))

        # load pipeline
        full_pipeline = self._load_pipeline(models_path, "full_pipeline_{}.pkl")
        self.feats_prepared = full_pipeline.transform(self.df_feats)

        return self.feats_prepared

    def _load_pipeline(self, models_path, name_template):
        """Load the pickled pipeline of the current process from ``models_path``."""
        # NOTE(review): joblib.load unpickles the file; only load pipelines
        # written by a trusted training run.
        return joblib.load(os.path.join(models_path, name_template.format(self.process)))

    def _transform_gaussianized(self, steps, models_path):
        """Apply the stored normalize pipeline, then the stored quantile pipeline."""
        gauss_list = list_descr_handler(steps[3]["params"]["descriptorNames"])
        feats_num_gauss_list = feats_selector_list(self.feats_num_list, gauss_list)

        self.logger.debug("List post-Num feats: {}".format(len(self.feats_num_list)))
        self.logger.debug("List post-Num-Gauss feats: {}".format(len(feats_num_gauss_list)))

        # normalize
        full_normalize_pipeline = self._load_pipeline(models_path, "full_normalize_pipeline_{}.pkl")
        normalized = full_normalize_pipeline.transform(self.df_feats)

        # Back to a DataFrame for the second stage: the leading columns get
        # the numerical feature names, the rest keep their integer labels.
        self.df_feats = pd.DataFrame(data=normalized)
        columns = list(self.df_feats.columns)
        select_no_rename_list = columns[len(self.feats_num_list):]
        self.logger.debug("Selected no rename list: {}".format(select_no_rename_list))
        self.df_feats.columns = self.feats_num_list + select_no_rename_list
        self.logger.debug("Normalized Features DF:")
        self.logger.debug("\n{}".format(self.df_feats))
        self.logger.debug("Shape: {}".format(self.df_feats.shape))

        # load the second-stage pipeline
        full_gauss_pipeline = self._load_pipeline(models_path, "full_gauss_pipeline_{}.pkl")
        return full_gauss_pipeline.transform(self.df_feats)


def flatten_dict_full(dictionary, sep="_"):
    """Flatten nested dicts and lists into a single-level ordered mapping.

    Args:
        dictionary: The nested structure to flatten.
        sep: Separator placed between the key parts. List elements contribute
            their index as key part.

    Returns:
        An ``OrderedDict`` mapping each joined key path to its leaf value.
        Empty dicts and lists produce no entry.
    """
    from collections import OrderedDict

    obj = OrderedDict()

    def recurse(t, parent_key=""):
        if isinstance(t, list):
            for index, element in enumerate(t):
                recurse(element, parent_key + sep + str(index) if parent_key else str(index))
        elif isinstance(t, dict):
            for k, v in t.items():
                recurse(v, parent_key + sep + k if parent_key else k)
        else:
            obj[parent_key] = t

    recurse(dictionary)

    return obj


def list_descr_handler(descr_list):
    """Convert descriptor wildcard names to the patterns used on flattened keys.

    Args:
        descr_list: Descriptor names such as ``'metadata.*'``, ``'*.min'`` or
            ``'lowlevel.mfcc*'``.

    Returns:
        A list with one converted string per input name: ``".*"`` becomes
        ``"_"`` for names ending in ``".*"``, ``"*."`` becomes ``"_"`` for
        names starting with ``"*."``; otherwise ``"*"`` is removed and
        ``"."`` becomes ``"_"``.
    """
    keys_list_handle = []
    for item in descr_list:
        if item.endswith(".*"):
            item = item.replace(".*", "_")
        elif item.startswith("*."):
            item = item.replace("*.", "_")
        else:
            item = item.replace("*", "")
            item = item.replace(".", "_")
        keys_list_handle.append(item)
    return keys_list_handle


def feats_selector_list(df_feats_columns, feats_select_list):
    """
    Args:
        df_feats_columns:
        feats_select_list:

    Returns:

    """
    columns_list = list(df_feats_columns)
    columns_select_list = []
    counter_feats = 0
for item in feats_select_list: + for sel_item in columns_list: + if re.search(item, sel_item): + columns_select_list.append(sel_item) + counter_feats += 1 + print("features selected: {}".format(counter_feats)) + return columns_select_list diff --git a/admin/sql/create_indexes.sql b/admin/sql/create_indexes.sql index 1d5d681ab..4d71b9dd5 100644 --- a/admin/sql/create_indexes.sql +++ b/admin/sql/create_indexes.sql @@ -26,4 +26,6 @@ CREATE UNIQUE INDEX lower_musicbrainz_id_ndx_user ON "user" (lower(musicbrainz_i CREATE INDEX collected_ndx_statistics ON statistics (collected); +CREATE INDEX training_tool_dataset_eval_jobs ON dataset_eval_jobs((options->>'training_tool')); + COMMIT; diff --git a/admin/updates/20200924-dataset-eval-job-tool-index.sql b/admin/updates/20200924-dataset-eval-job-tool-index.sql new file mode 100644 index 000000000..4667ed757 --- /dev/null +++ b/admin/updates/20200924-dataset-eval-job-tool-index.sql @@ -0,0 +1,3 @@ +BEGIN; +CREATE INDEX training_tool_dataset_eval_jobs ON dataset_eval_jobs((options->>'training_tool')); +COMMIT; \ No newline at end of file diff --git a/config.py.example b/config.py.example index 2a202d6a5..aaa358547 100644 --- a/config.py.example +++ b/config.py.example @@ -66,4 +66,7 @@ FEATURE_EVAL_FILTERING = True # Choose settings used for model training FEATURE_EVAL_MODEL_SELECTION = False +# Choose the ML tool used for model training (gaia/sklearn) +FEATURE_EVAL_TOOL_SELECTION = False + DEBUG_TB_INTERCEPT_REDIRECTS = False diff --git a/dataset_eval/artistfilter.py b/dataset_eval/artistfilter.py index a8162ac2c..c534edaf3 100644 --- a/dataset_eval/artistfilter.py +++ b/dataset_eval/artistfilter.py @@ -1,3 +1,4 @@ +from __future__ import print_function import collections import json import logging @@ -28,7 +29,7 @@ def print_datadict_summary(datadict): for r, cls in datadict.items(): counter[cls] += 1 for cls, count in counter.most_common(): - print "%s\t\t%s" % (cls, count) + print("%s\t\t%s" % (cls, count)) def 
normalise_datadict(datadict, cut_to): """Take a dictionary of groundtruth and cut all classes to diff --git a/dataset_eval/evaluate.py b/dataset_eval/evaluate.py index 31ebd6fe6..d08258d34 100644 --- a/dataset_eval/evaluate.py +++ b/dataset_eval/evaluate.py @@ -7,7 +7,6 @@ import tempfile import time -import gaia2.fastyaml as yaml from flask import current_app import db @@ -16,8 +15,21 @@ import db.dataset_eval import db.exceptions import utils.path +import yaml from dataset_eval import artistfilter -from dataset_eval import gaia_wrapper + +eval_tool_use = "gaia" +is_sklearn = os.getenv("MODEL_TRAINING_SKLEARN") +if is_sklearn == "1": + from acousticbrainz.models.sklearn.model.classification_project import create_classification_project + from acousticbrainz.models.sklearn.classification.matrix_creation import simplified_matrix_export + eval_tool_use = "sklearn" + +is_gaia = os.getenv("MODEL_TRAINING_GAIA") +if is_gaia == "1": + # import gaia2.fastyaml as yaml + from dataset_eval import gaia_wrapper + eval_tool_use = "gaia" SLEEP_DURATION = 30 # number of seconds to wait between runs @@ -25,9 +37,11 @@ def main(): logging.info("Starting dataset evaluator...") dataset_dir = current_app.config["DATASET_DIR"] + logging.info("Dataset dir path: {}".format(dataset_dir)) storage_dir = os.path.join(current_app.config["FILE_STORAGE_DIR"], "history") + logging.info("Storage dir path: {}".format(storage_dir)) while True: - pending_job = db.dataset_eval.get_next_pending_job() + pending_job = db.dataset_eval.get_next_pending_job(eval_tool_use) if pending_job: logging.info("Processing job %s..." 
% pending_job["id"]) evaluate_dataset(pending_job, dataset_dir, storage_dir) @@ -41,7 +55,10 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): eval_location = os.path.join(os.path.abspath(dataset_dir), eval_job["id"]) utils.path.create_path(eval_location) - temp_dir = tempfile.mkdtemp() + temp_dir = os.path.join(eval_location, 'temp') + utils.path.create_path(temp_dir) + + training_tool = eval_job["options"].get("training_tool", "gaia") try: snapshot = db.dataset.get_snapshot(eval_job["snapshot_id"]) @@ -49,36 +66,32 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): train, test = artistfilter.filter(eval_job["snapshot_id"], eval_job["options"]) db.dataset_eval.add_sets_to_job(eval_job["id"], train, test) - logging.info("Generating filelist.yaml and copying low-level data for evaluation...") - filelist_path = os.path.join(eval_location, "filelist.yaml") - filelist = dump_lowlevel_data(train.keys(), temp_dir) - with open(filelist_path, "w") as f: - yaml.dump(filelist, f) + if training_tool == "gaia": + logging.info("Generating filelist.yaml and copying low-level data for evaluation...") + filelist_path = os.path.join(eval_location, "filelist.yaml") + filelist = dump_lowlevel_data(train.keys(), temp_dir) + with open(filelist_path, "w") as f: + yaml.safe_dump(filelist, f) + elif training_tool == "sklearn": + dump_lowlevel_data_sklearn(train.keys(), dataset_dir) logging.info("Generating groundtruth.yaml...") groundtruth_path = os.path.join(eval_location, "groundtruth.yaml") with open(groundtruth_path, "w") as f: - yaml.dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) - - # Passing more user preferences to train the model. 
- logging.info("Training model...") - results = gaia_wrapper.train_model( - project_dir=eval_location, - groundtruth_file=groundtruth_path, - filelist_file=filelist_path, - c_values=eval_job["options"].get("c_values", []), - gamma_values=eval_job["options"].get("gamma_values", []), - preprocessing_values=eval_job["options"].get("preprocessing_values", []), - ) - logging.info("Saving results...") - save_history_file(storage_dir, results["history_path"], eval_job["id"]) - db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ - "project_path": eval_location, - "parameters": results["parameters"], - "accuracy": results["accuracy"], - "confusion_matrix": results["confusion_matrix"], - "history_path": results["history_path"], - })) + yaml.safe_dump(create_groundtruth_dict(snapshot["data"]["name"], train), f) + + if training_tool == "gaia": + logging.info("Training GAIA model...") + evaluate_gaia(eval_job["options"], eval_location, groundtruth_path, filelist_path, storage_dir, eval_job) + elif training_tool == "sklearn": + logging.info("Training SKLEARN model...") + evaluate_sklearn(options=eval_job["options"], + eval_location=eval_location, + ground_truth_file=groundtruth_path, + dataset_dir=dataset_dir, + storage_dir=storage_dir, + eval_job=eval_job) + db.dataset_eval.set_job_status(eval_job["id"], db.dataset_eval.STATUS_DONE) logging.info("Evaluation job %s has been completed." % eval_job["id"]) @@ -92,24 +105,101 @@ def evaluate_dataset(eval_job, dataset_dir, storage_dir): ) logging.info(e) - finally: - # Clean up the source files used to generate this model. - # We can recreate them from the database if we need them - # at a later stage. 
- shutil.rmtree(temp_dir) + +def evaluate_gaia(options, eval_location, groundtruth_path, filelist_path, storage_dir, eval_job): + results = gaia_wrapper.train_model( + project_dir=eval_location, + groundtruth_file=groundtruth_path, + filelist_file=filelist_path, + c_values=options.get("c_values", []), + gamma_values=options.get("gamma_values", []), + preprocessing_values=options.get("preprocessing_values", []) + ) + logging.info("Saving results...") + save_history_file(storage_dir, results["history_path"], eval_job["id"]) + db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ + "project_path": eval_location, + "parameters": results["parameters"], + "accuracy": results["accuracy"], + "confusion_matrix": results["confusion_matrix"], + "history_path": results["history_path"], + })) + + +def evaluate_sklearn(options, eval_location, ground_truth_file, dataset_dir, storage_dir, eval_job): + create_classification_project(ground_truth_file=ground_truth_file, + dataset_dir=dataset_dir, + project_file=eval_job["id"], + exports_path=eval_location, + c_values=options.get("c_values", []), + gamma_values=options.get("gamma_values", []), + preprocessing_values=options.get("preprocessing_values", []) + ) + + logging.info("Saving results...") + results = load_best_results_sklearn(exported_path=eval_location, + project_file=eval_job["id"]) + db.dataset_eval.set_job_result(eval_job["id"], json.dumps({ + "project_path": eval_location, + "parameters": results["parameters"], + "accuracy": results["accuracy"], + "confusion_matrix": results["confusion_matrix"], + "model": results["model"], + })) + + +def load_best_results_sklearn(exported_path, project_file): + project_conf_file_path = os.path.join(exported_path, "{}.yaml".format(project_file)) + logging.info("Config file path: {}".format(project_conf_file_path)) + with open(project_conf_file_path) as fp: + project_data = yaml.load(fp, Loader=yaml.FullLoader) + logging.info("Model: {}".format(project_data['class_name'])) + + # load 
the best model dictionary + best_model_path = os.path.join(exported_path, "best_model_{}.json".format(project_data['class_name'])) + logging.info("Best model path: {}".format(best_model_path)) + with open(best_model_path) as json_file: + data_best_model = json.load(json_file) + + # load the best model's instances and matrix dictionary + fold_matrix_path = os.path.join(exported_path, "folded_dataset_instances_cm.json") + logging.info("Best Instances and Matrix JSON path: {}".format(fold_matrix_path)) + with open(fold_matrix_path) as json_file_cm: + data_fold_matrix = json.load(json_file_cm) + + # load the best model's simplified matrix dictionary + # fold_simplified_matrix_path = os.path.join(exported_path, project_file, "folded_simplified_matrix.json") + # logging.info(f"Best models simplified matrix JSON path: {fold_simplified_matrix_path}") + # with open(fold_simplified_matrix_path) as json_file_simple_cm: + # data_fold_simplified_matrix = json.load(json_file_simple_cm) + + # export the matrix dictionary from the folded dataset + simplified_cm = simplified_matrix_export(best_result_file="folded_dataset_results_matrix.json", + logger=logging, + export_save_path=exported_path, + export_name="simplified_cm.json", + write_mode=False) + + return { + "parameters": data_best_model["params"], + # for consistency with gaia which reports accuracy on scale of 0 to 100 + "accuracy": round(data_best_model["score"] * 100, 2), + "confusion_matrix": simplified_cm, + "model": os.path.join(exported_path, "best_clf_model.pkl") # path to best model pickle file + } def create_groundtruth_dict(name, datadict): groundtruth = { "type": "unknown", # TODO: See if that needs to be modified. 
"version": 1.0, - "className": db.dataset._slugify(unicode(name)), + "className": db.dataset._slugify(name), "groundTruth": {}, } for r, cls in datadict.items(): - if isinstance(r, unicode): - r = r.encode("UTF-8") - groundtruth["groundTruth"][r] = cls.encode("UTF-8") + # if isinstance(r, unicode): + # r = r.encode("UTF-8") + groundtruth["groundTruth"][r] = cls return groundtruth @@ -118,12 +208,12 @@ def create_groundtruth(dataset): groundtruth = { "type": "unknown", # TODO: See if that needs to be modified. "version": 1.0, - "className": db.dataset._slugify(unicode(dataset["name"])), + "className": db.dataset._slugify(dataset["name"]), "groundTruth": {}, } for cls in dataset["classes"]: for recording_mbid in cls["recordings"]: - groundtruth["groundTruth"][recording_mbid] = cls["name"].encode("UTF-8") + groundtruth["groundTruth"][recording_mbid] = cls["name"] return groundtruth @@ -159,7 +249,41 @@ def lowlevel_data_to_yaml(data): if 'lossless' in data['metadata']['audio_properties']: del data['metadata']['audio_properties']['lossless'] - return yaml.dump(data) + return yaml.safe_dump(data) + + +def dump_lowlevel_data_sklearn(recordings, location): + """Dumps low-level data to JSON for all recordings into specified location. + + Args: + recordings: List of MBIDs of recordings. + location: Path to directory where low-level data will be saved. + + """ + utils.path.create_path(location) + filelist = {} + for recording in recordings: + logging.info("Recording: {}".format(recording)) + filelist[recording] = os.path.join(location, "%s.json" % recording) + logging.info("Recoding path: {}".format(filelist[recording])) + with open(filelist[recording], 'w') as outfile: + json.dump(lowlevel_data_cleaning(db.data.load_low_level(recording)), outfile) + logging.info("JSON data stored successfully.") + + +def lowlevel_data_cleaning(data): + """Prepares dictionary with low-level data about recording for processing. 
+ """ + # Removing descriptors, that will otherwise break gaia_fusion due to + # incompatibility of layouts (see Gaia implementation for more details). + if "tags" in data["metadata"]: + del data["metadata"]["tags"] + if "sample_rate" in data["metadata"]["audio_properties"]: + del data["metadata"]["audio_properties"]["sample_rate"] + if 'lossless' in data['metadata']['audio_properties']: + del data['metadata']['audio_properties']['lossless'] + # logging.info("Data: {}".format(data)) + return data def extract_recordings(dataset): diff --git a/db/dataset_eval.py b/db/dataset_eval.py index aed4e97d3..f8723deb8 100644 --- a/db/dataset_eval.py +++ b/db/dataset_eval.py @@ -46,7 +46,7 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_values=None, - preprocessing_values=None, filter_type=None): + preprocessing_values=None, filter_type=None, training_tool="gaia"): """Add dataset into evaluation queue. Args: @@ -67,6 +67,7 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_ filter_type: Optional filtering that will be applied to the dataset. See FILTER_* variables in this module for a list of existing filters. + training_tool (optional): The tool to use to train the model (gaia or sklearn) Raises: JobExistsException: if the dataset has already been submitted for evaluation @@ -90,7 +91,8 @@ def evaluate_dataset(dataset_id, normalize, eval_location, c_values=None, gamma_ # Validate dataset contents validate_dataset_contents(db.dataset.get(dataset_id)) return _create_job(connection, dataset_id, normalize, eval_location, - c_values, gamma_values, preprocessing_values, filter_type) + c_values, gamma_values, preprocessing_values, filter_type, + training_tool) def job_exists(dataset_id): @@ -164,7 +166,7 @@ def validate_dataset_contents(dataset): ) -def get_next_pending_job(): +def get_next_pending_job(training_tool="gaia"): """ Get the earliest submitted job which is still in the pending state. 
@@ -179,10 +181,11 @@ def get_next_pending_job(): ON dataset_snapshot.id = dataset_eval_jobs.snapshot_id WHERE status = :status AND eval_location = 'local' + AND options->>'training_tool' = :training_tool ORDER BY created ASC LIMIT 1 """ % EVAL_COLUMNS_COMMA_SEPARATED) - result = connection.execute(query, {"status": STATUS_PENDING}) + result = connection.execute(query, {"status": STATUS_PENDING, "training_tool": training_tool}) row = result.fetchone() return dict(row) if row else None @@ -330,7 +333,7 @@ def add_dataset_eval_set(connection, data): def _create_job(connection, dataset_id, normalize, eval_location, c_value, - gamma_value, preprocessing_values, filter_type): + gamma_value, preprocessing_values, filter_type, training_tool): if not isinstance(normalize, bool): raise ValueError("Argument 'normalize' must be a boolean.") if filter_type is not None: @@ -345,6 +348,7 @@ def _create_job(connection, dataset_id, normalize, eval_location, c_value, "c_values": c_value, "gamma_values": gamma_value, "preprocessing_values": preprocessing_values, + "training_tool": training_tool } snapshot_id = db.dataset.create_snapshot(dataset_id) diff --git a/db/test/test_dataset_eval.py b/db/test/test_dataset_eval.py index 794bc4f87..20eb6412a 100644 --- a/db/test/test_dataset_eval.py +++ b/db/test/test_dataset_eval.py @@ -87,7 +87,7 @@ def test_create_job_nonormalize(self): # No dataset normalization job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, False, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -98,7 +98,7 @@ def test_create_job_normalize(self): # dataset normalization job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, 
training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -109,7 +109,7 @@ def test_create_job_artistfilter(self): # Artist filtering as an option job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, False, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=dataset_eval.FILTER_ARTIST) + filter_type=dataset_eval.FILTER_ARTIST, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -120,7 +120,7 @@ def test_create_job_svm_params(self): # C, gamma, and preprocessing values job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=dataset_eval.FILTER_ARTIST) + filter_type=dataset_eval.FILTER_ARTIST, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertIsNotNone(job) @@ -134,27 +134,27 @@ def test_create_job_badfilter(self): with self.assertRaises(ValueError): dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type="test") + filter_type="test", training_tool="gaia") def test_create_job_badlocation(self): # an invalid eval_location with self.assertRaises(ValueError): dataset_eval._create_job(self.conn, self.test_dataset_id, True, "not_a_location", c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") def test_job_exists(self): self.assertFalse(dataset_eval.job_exists(self.test_dataset_id)) dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") self.assertTrue(dataset_eval.job_exists(self.test_dataset_id)) def test_get_job(self): 
job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") random_id = "f47ac10b-58cc-4372-a567-0e02b2c3d479" # just in case self.assertNotEqual(random_id, job_id) @@ -164,7 +164,7 @@ def test_get_job(self): def test_set_job_result(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") result = { u"accuracy": 1, @@ -182,7 +182,7 @@ def test_set_job_result(self): def test_set_job_status(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job = dataset_eval.get_job(job_id) self.assertEqual(job["status"], dataset_eval.STATUS_PENDING) @@ -196,12 +196,12 @@ def test_set_job_status(self): def test_get_next_pending_job(self): job1_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job1 = dataset_eval.get_job(job1_id) job2_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job2 = dataset_eval.get_job(job2_id) next_pending = dataset_eval.get_next_pending_job() @@ -218,12 +218,12 @@ def test_get_next_pending_job_remote(self): # If we have a remote pending job with the most recent timestamp, skip it job1_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, 
dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job1 = dataset_eval.get_job(job1_id) job2_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job2 = dataset_eval.get_job(job2_id) next_pending = dataset_eval.get_next_pending_job() @@ -235,7 +235,7 @@ def test_delete_job(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") snapshots = dataset.get_snapshots_for_dataset(self.test_dataset_id) self.assertEqual(len(snapshots), 1) self.assertIsNotNone(dataset_eval.get_job(job_id)) @@ -247,13 +247,13 @@ def test_delete_job(self): def test_eval_job_location(self): job1_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job1 = dataset_eval.get_job(job1_id) self.assertEqual(job1["eval_location"], dataset_eval.EVAL_REMOTE) job2_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job2 = dataset_eval.get_job(job2_id) self.assertEqual(job2["eval_location"], dataset_eval.EVAL_LOCAL) @@ -262,7 +262,7 @@ def test_get_remote_pending_jobs_for_user(self): job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, 
training_tool="gaia") job_details = db.dataset_eval.get_job(job_id) response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) @@ -277,7 +277,7 @@ def test_get_pending_jobs_for_user_local(self): """ Check that a local eval dataset for this user doesn't show """ job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_LOCAL, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") job_details = db.dataset_eval.get_job(job_id) response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) @@ -290,7 +290,7 @@ def test_get_pending_jobs_for_user_other_user(self): another_dataset_id = dataset.create_from_dict(self.test_data, author_id=another_user_id) job_id = dataset_eval._create_job(self.conn, another_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) self.assertEqual(response, []) @@ -299,7 +299,7 @@ def test_get_pending_jobs_for_user_done(self): """ Check that a remote eval job with a done status doesn't show """ job_id = dataset_eval._create_job(self.conn, self.test_dataset_id, True, dataset_eval.EVAL_REMOTE, c_value=[1, 2, 3], gamma_value=[4, 5, 6], preprocessing_values=["basic"], - filter_type=None) + filter_type=None, training_tool="gaia") db.dataset_eval.set_job_status(job_id, db.dataset_eval.STATUS_DONE) response = dataset_eval.get_remote_pending_jobs_for_user(self.test_user_id) diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index c900ff68c..ec4945f9d 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -55,6 +55,8 @@ services: context: .. 
dockerfile: Dockerfile target: acousticbrainz-dev + environment: + MODEL_TRAINING_GAIA: 1 command: python2 worker_manage.py dataset_evaluator volumes: - ../:/code @@ -62,3 +64,18 @@ services: - ../data/files:/data/files depends_on: - db + + dataset_evaluator_sklearn: + build: + context: .. + dockerfile: Dockerfile.py3 + target: acousticbrainz-sklearn + environment: + MODEL_TRAINING_SKLEARN: 1 + command: python3 worker_manage.py dataset_evaluator + volumes: + - ../:/code + - ../data/datasets:/data/datasets + - ../data/files:/data/files + depends_on: + - db diff --git a/requirements.txt b/requirements.txt index 98505d72d..1e6ffc2c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ Flask-Login==0.5.0 Flask-SQLAlchemy==2.4.1 Flask-Testing==0.8.0 Flask-WTF == 0.14.3 -futures==3.3.0 +futures == 3.3.0; python_version < '3.0' mock==3.0.5 musicbrainzngs==0.7.1 ndg-httpsclient==0.5.1 diff --git a/sklearn_manage.py b/sklearn_manage.py new file mode 100644 index 000000000..9593d2f13 --- /dev/null +++ b/sklearn_manage.py @@ -0,0 +1,77 @@ +import click + +from acousticbrainz.models.sklearn.model.classification_project import create_classification_project +from acousticbrainz.models.sklearn.model.predict import prediction + +cli = click.Group() + +@cli.command(name="classification_project") +@click.option("--ground-truth-file", "-g", + help="Path of the dataset's groundtruth file/s.", required=True) +@click.option("--low-level-dir", "-d", required=True, + help="Path of the main datasets dir containing .json file/s.") +@click.option("--project-file", "-f", + help="Name of the project configuration file (.yaml) will be stored. If " + "not specified it takes automatically the name .") +@click.option("--export-path", "-o", + help="Path where the project results will be stored. 
If empty, the results " + "will be saved in the main app directory.") +@click.option("--seed", "-s", type=int, default=None, + help="Seed is used to generate the random shuffled dataset applied " + "later to folding.") +@click.option("--jobs", "-j", default=-1, type=int, + help="Parallel jobs. Set to -1 to use all the available cores") +@click.option("--verbose", "-v", default=1, type=int, + help="Controls the verbosity: the higher, the more messages.") +@click.option("--logging", "-l", default="INFO", + type=click.Choice( + ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + case_sensitive=False + ), help="The logging level that will be printed") +def classification_project(ground_truth_file, low_level_dir, project_file, export_path, + seed, jobs, verbose, logging): + """ Generates a project configuration file given a filelist, a groundtruth file, + and the directories to store the datasets and the results files. The script has + a parameter to specify the project template to use. If it is not specified, it + will try to guess the appropriated one from the essentia version found on the + descriptor files. + """ + create_classification_project( + ground_truth_file=ground_truth_file, + dataset_dir=low_level_dir, + project_file=project_file, + exports_path=export_path, + seed=seed, + jobs=jobs, + verbose=verbose, + logging=logging + ) + + +@cli.command(name="predict") +@click.option("--project-file", "-f", required=True, + help="Name of the project configuration file (.yaml) that is to be loaded. " + "The .yaml at the end of the file is not necessary. Just put the name " + "of the file.") +@click.option("--export-path", "-o", + help="Path where the project results will be stored. 
If empty, the results " + "will be saved in the main app directory.") +@click.option("--track", "-t", required=True, + help="MBID of the the low-level data from the AcousticBrainz API.") +@click.option("--logging", "-l", default="INFO", + type=click.Choice( + ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + case_sensitive=False + ), help="The logging level that will be printed") +def predict(project_file, export_path, track, logging): + """ Prediction of a track. """ + prediction( + exports_path=export_path, + project_file=project_file, + mbid=track, + log_level=logging + ) + + +if __name__ == '__main__': + cli() diff --git a/webserver/__init__.py b/webserver/__init__.py index 131f80519..1163e499b 100644 --- a/webserver/__init__.py +++ b/webserver/__init__.py @@ -8,7 +8,7 @@ import os import time -import urlparse +from six.moves import urllib API_PREFIX = '/api/' @@ -109,7 +109,7 @@ def after_request_callbacks(response): init_error_handlers(app) # Static files - import static_manager + from webserver import static_manager # Template utilities app.jinja_env.add_extension('jinja2.ext.do') @@ -136,7 +136,7 @@ def after_request_callbacks(response): def prod_https_login_redirect(): """ Redirect to HTTPS in production except for the API endpoints """ - if urlparse.urlsplit(request.url).scheme == 'http' \ + if urllib.parse.urlsplit(request.url).scheme == 'http' \ and app.config['DEBUG'] == False \ and app.config['TESTING'] == False \ and request.blueprint not in ('api', 'api_v1_core', 'api_v1_datasets', 'api_v1_dataset_eval'): diff --git a/webserver/forms.py b/webserver/forms.py index 0bddfd8c1..07b7d46e8 100644 --- a/webserver/forms.py +++ b/webserver/forms.py @@ -10,6 +10,9 @@ DATASET_EVAL_LOCAL = "local" DATASET_EVAL_REMOTE = "remote" +DATASET_TOOL_EVALUATION_GAIA = "gaia" +DATASET_TOOL_EVALUATION_SKLEARN = "sklearn" + DATASET_PENDING = "pending" DATASET_RUNNING = "running" DATASET_DONE = "done" @@ -60,6 +63,11 @@ class DatasetEvaluationForm(FlaskForm): 
render_kw={"data-toggle": "collapse", "data-target": "#collapseSvmOptions"}) + training_tool = SelectField("Model training tool", choices=[ + (DATASET_TOOL_EVALUATION_GAIA, "gaia"), + (DATASET_TOOL_EVALUATION_SKLEARN, "sklearn")], + default=DATASET_TOOL_EVALUATION_GAIA) + # C parameter to SVM c_value = StringField('C Values', default=DATASET_C_VALUE, render_kw={"data-default": DATASET_C_VALUE}) diff --git a/webserver/static/scripts/datasets/eval-jobs-viewer.js b/webserver/static/scripts/datasets/eval-jobs-viewer.js index 3e89c52bd..2932ee61a 100644 --- a/webserver/static/scripts/datasets/eval-jobs-viewer.js +++ b/webserver/static/scripts/datasets/eval-jobs-viewer.js @@ -191,6 +191,7 @@ class JobList extends React.Component { id={cls.id} created={cls.created} status={cls.status} + training_tool={cls.options.training_tool ?? "gaia"} outdated={cls.outdated} showDelete={this.props.showDelete} onViewDetails={this.props.onViewDetails} @@ -206,6 +207,7 @@ class JobList extends React.Component { Job ID Status Creation time + Training Tool @@ -227,6 +229,7 @@ class JobRow extends React.Component { id: PropTypes.string.isRequired, created: PropTypes.string.isRequired, status: PropTypes.string.isRequired, + training_tool: PropTypes.string.isRequired, outdated: PropTypes.string.isRequired, showDelete: PropTypes.bool.isRequired, onViewDetails: PropTypes.func.isRequired, @@ -283,6 +286,7 @@ class JobRow extends React.Component { {this.props.created} + {this.props.training_tool} {controls} ); diff --git a/webserver/templates/datasets/evaluate.html b/webserver/templates/datasets/evaluate.html index 177c20b05..bcaab9cde 100644 --- a/webserver/templates/datasets/evaluate.html +++ b/webserver/templates/datasets/evaluate.html @@ -56,6 +56,12 @@

Evaluate dataset "{{ dataset['name'] }}"

+ {% if config.get('FEATURE_EVAL_TOOL_SELECTION') %} +
+ +
{{ form.training_tool(class="form-control", required="required") }}
+
+ {% endif %}
@@ -85,8 +91,8 @@

Evaluate dataset "{{ dataset['name'] }}"

{{ form.preprocessing_values(required="required") }}
+ {% endif %} -
diff --git a/webserver/views/datasets.py b/webserver/views/datasets.py index 931734219..a85ebebda 100644 --- a/webserver/views/datasets.py +++ b/webserver/views/datasets.py @@ -15,7 +15,7 @@ import csv import math import six -import StringIO +from six import StringIO from webserver.views.api.exceptions import APIUnauthorized # Below values are defined in 'classification_project_template.yaml' file. @@ -127,7 +127,7 @@ def _convert_dataset_to_csv_stringio(dataset): # - dataset description, class names, class descriptions # TODO: On upgrade to python 3, check that stringio accepts the correct data # (may have to change to bytesio if we encode this data) - fp = StringIO.StringIO() + fp = StringIO() writer = csv.writer(fp) # write dataset description only if it is set @@ -253,6 +253,7 @@ def evaluate(dataset_id): gamma_values=gamma_values, preprocessing_values=preprocessing_values, filter_type=form.filter_type.data, + training_tool=form.training_tool.data ) flash.info("Dataset %s has been added into evaluation queue." % ds["id"]) except db.dataset_eval.IncompleteDatasetException as e: