From 2af30a4a4638ddd4ef948c83f9ccfb4050c74834 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Wed, 14 Dec 2022 19:30:33 +0000 Subject: [PATCH] Tony experiments (#29) * remove contract * remove contract * set n_jobs in ROCKET variants * threaded rocket * switch to threaded DrCIF * switch to threaded DrCIF * switch to threaded DrCIF * add temp DrCIF to test Parallel options * add temp DrCIF to test Parallel options * revert * set train file flag correctly * iterate DrCIF over 30 Tiselac resamples * Switch to DrCIF with FaceDetection * DrCIF with FaceDetection * DrCIF with InsectWingbeatEq * comment on regression_experiments.py * adjust set_regressor.py * switch to HC2 EigenWorms * switch to HC2 EigenWorms * switch to HC2 EigenWorms * tweak scripts * switch output dir creation --- .../classification_experiment.sh | 9 ++-- ada_uea_experiments/clustering_experiments.sh | 2 +- .../distance_clustering_experiments.sh | 3 +- ada_uea_experiments/regression_experiment.sh | 7 +-- tsml_eval/debug.py | 34 ++++++++++++ .../experiments/classification_experiments.py | 44 +++++++++------- tsml_eval/experiments/debug.py | 6 +++ .../distance_clustering_experiments.py | 52 +++++++++++-------- .../experiments/regression_experiments.py | 4 +- tsml_eval/experiments/set_classifier.py | 17 +++--- 10 files changed, 121 insertions(+), 57 deletions(-) create mode 100644 tsml_eval/debug.py create mode 100644 tsml_eval/experiments/debug.py diff --git a/ada_uea_experiments/classification_experiment.sh b/ada_uea_experiments/classification_experiment.sh index 733b5a17..29b3fb0d 100644 --- a/ada_uea_experiments/classification_experiment.sh +++ b/ada_uea_experiments/classification_experiment.sh @@ -11,7 +11,7 @@ max_folds=30 start_fold=1 # To avoid dumping 1000s of jobs in the queue we have a higher level queue -max_num_submitted=100 +max_num_submitted=500 # Queue options are https://my.uea.ac.uk/divisions/it-and-computing-services/service-catalogue/research-it-services/hpc/ada-cluster/using-ada queue="compute-64-512" @@ -42,11 +42,12 @@ results_dir=$local_path"ClassificationResults/sktime/" out_dir=$local_path"ClassificationResults/output/" # The python script we are running -script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiments/classification_experiments.py" +script_file_path=$local_path"Code/tsml-eval/tsml_eval/experiments +/classification_experiments.py" # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended -env_name="est-eval" +env_name="eval" # Generating train folds is usually slower, set to false unless you need them generate_train_files="false" @@ -93,7 +94,9 @@ do fi done + if [ "${array_jobs}" != "" ]; then +mkdir -p ${out_dir}${classifier}/${dataset}/ # This creates the scrip to run the job based on the info above echo "#!/bin/bash diff --git a/ada_uea_experiments/clustering_experiments.sh b/ada_uea_experiments/clustering_experiments.sh index 01b271cc..19711fee 100644 --- a/ada_uea_experiments/clustering_experiments.sh +++ b/ada_uea_experiments/clustering_experiments.sh @@ -46,7 +46,7 @@ script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiment # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended -env_name="est-eval" 
+env_name="eval" # todo this is currently only in for file skipping, should always be generating train files. need to rework clustering experiments more generate_train_files="true" diff --git a/ada_uea_experiments/distance_clustering_experiments.sh b/ada_uea_experiments/distance_clustering_experiments.sh index 710cd91b..6732f586 100644 --- a/ada_uea_experiments/distance_clustering_experiments.sh +++ b/ada_uea_experiments/distance_clustering_experiments.sh @@ -46,11 +46,12 @@ script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiment # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended -env_name="est-eval" +env_name="eval" generate_train_files="false" clusterer="kmeans" averaging="mean" +normalise="" count=0 # dtw ddtw erp edr wdtw lcss twe msm dwdtw euclidean diff --git a/ada_uea_experiments/regression_experiment.sh b/ada_uea_experiments/regression_experiment.sh index c7edeba1..091da210 100644 --- a/ada_uea_experiments/regression_experiment.sh +++ b/ada_uea_experiments/regression_experiment.sh @@ -11,7 +11,7 @@ max_folds=30 start_fold=1 # To avoid dumping 1000s of jobs in the queue we have a higher level queue -max_num_submitted=100 +max_num_submitted=500 # Queue options are https://my.uea.ac.uk/divisions/it-and-computing-services/service-catalogue/research-it-services/hpc/ada-cluster/using-ada queue="compute-64-512" @@ -42,11 +42,12 @@ results_dir=$local_path"RegressionResults/sktime/" out_dir=$local_path"RegressionResults/output/" # The python script we are running -script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiments/regression_experiments.py" +script_file_path=$local_path"Code/tsml-eval/tsml_eval/experiments/regression_experiments +.py" # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended -env_name="est-eval" +env_name="eval" # Generating train folds is usually slower, set to false unless you need them generate_train_files="false" diff --git a/tsml_eval/debug.py b/tsml_eval/debug.py new file mode 100644 index 00000000..5d891283 --- /dev/null +++ b/tsml_eval/debug.py @@ -0,0 +1,34 @@ +"""Hacky area to test shit out""" +import time +from sktime.datasets import load_from_tsfile +from sktime.utils.sampling import stratified_resample +from sktime.distances import dtw_distance +import numpy as np +instance1 = np.array([[1,2,3,4], [4,3,2,1]]) +instance2 = np.array([[2,3,4,5], [5,4,3,2]]) +print(" shape is [n_dimensions, series_length] = ", instance1.shape) +print(" DTW_D is = ", dtw_distance(instance1, instance2)) + + +def time_data_load(): + dataset = ["InsectWingbeatEq"] + for file in dataset: + start = time.time() + x, y = load_from_tsfile(f"C:/Data/{file}/{file}_TRAIN.ts") + x2, y2 = load_from_tsfile(f"C:/Data/{file}/{file}_TEST.ts") + end = time.time() + print(f" Load pandas for problem {file} time taken = {end-start}") + start = time.time() + x, y, x2, y2 = stratified_resample(x, y, x2, y2, 1) + end = time.time() + print(f" resample time problem {file} time taken = {end-start}") +# start = time.time() +# x, y = load_from_tsfile(f"C:/Data/{file}/{file}_TRAIN.ts", +# return_data_type="numpy3d") +# x2, y2 = load_from_tsfile(f"C:/Data/{file}/{file}_TEST.ts", +# return_data_type="numpy3d") +# end 
= time.time() +# print(f" Load numpy for problem {file} time taken = {end-start}") + + +time_data_load() \ No newline at end of file diff --git a/tsml_eval/experiments/classification_experiments.py b/tsml_eval/experiments/classification_experiments.py index a595111d..f0429921 100644 --- a/tsml_eval/experiments/classification_experiments.py +++ b/tsml_eval/experiments/classification_experiments.py @@ -75,27 +75,33 @@ def run_experiment(args, overwrite=False): overwrite=overwrite, ) else: # Local run - data_dir = "../" - results_dir = "../" - cls_name = "DrCIF" - dataset = "ItalyPowerDemand" - resample = 0 + data_dir = "/home/ajb/Data/" + results_dir = "/home/ajb/Results Working Area/ReduxBakeoff/sktime/" + cls_name = "HC2" + n_jobs = 92 + contract_mins = 0 + dataset = "EigenWorms" + print(f" Local Run of {cls_name} on dataset {dataset} with threading jobs " + f"={ n_jobs} and " + f"contract time ={contract_mins}") train_fold = False predefined_resample = False - classifier = set_classifier(cls_name, resample, train_fold) - print(f"Local Run of {classifier.__class__.__name__}.") - - load_and_run_classification_experiment( - overwrite=False, - problem_path=data_dir, - results_path=results_dir, - cls_name=cls_name, - classifier=classifier, - dataset=dataset, - resample_id=resample, - build_train=train_fold, - predefined_resample=predefined_resample, - ) + for resample in range(0, 30): + classifier = set_classifier(cls_name, resample_id=resample, n_jobs=n_jobs, + contract=contract_mins, train_file=train_fold) + print(f"Local Run of {classifier.__class__.__name__} with {classifier.n_jobs} jobs") + + load_and_run_classification_experiment( + overwrite=False, + problem_path=data_dir, + results_path=results_dir, + cls_name=cls_name, + classifier=classifier, + dataset=dataset, + resample_id=resample, + build_train=train_fold, + predefined_resample=predefined_resample, + ) if __name__ == "__main__": diff --git a/tsml_eval/experiments/debug.py b/tsml_eval/experiments/debug.py new file mode 100644 index 00000000..a7b5ca4a --- /dev/null +++ b/tsml_eval/experiments/debug.py @@ -0,0 +1,6 @@ + +tf = bool("false") +tf2 = bool("False") +tf3 = bool("true") +tf4 = bool("True") +print(f" {tf} {tf2} {tf3} {tf4}") diff --git a/tsml_eval/experiments/distance_clustering_experiments.py b/tsml_eval/experiments/distance_clustering_experiments.py index 2e7020b1..7249316b 100644 --- a/tsml_eval/experiments/distance_clustering_experiments.py +++ b/tsml_eval/experiments/distance_clustering_experiments.py @@ -86,21 +86,33 @@ def _recreate_results(trainX, trainY): clusterer = "kmeans" chris_config = True # This is so chris doesn't have to change config each time tune = False - if sys.argv.__len__() > 1: # cluster run, this is fragile + normalise = True + if sys.argv.__len__() > 1: # cluster run, this is fragile, requires all args atm data_dir = sys.argv[1] results_dir = sys.argv[2] distance = sys.argv[3] dataset = sys.argv[4] - resample = int(sys.argv[5]) - 1 - tf = bool(sys.argv[6]) - clusterer = sys.argv[7] - averaging = sys.argv[8] + # ADA starts indexing its jobs at 1, so we need to subtract 1 + resample = int(args[5]) - 1 + clusterer = sys.argv[6] + if len(args) > 7: + train_fold = args[7].lower() == "true" + else: + train_fold = False + if len(args) > 8: + averaging = args[8] + else: + averaging = "mean" + if len(args) > 9: + normalise = args[9].lower() == "true" + else: + normalise = False if averaging == "dba": results_dir = results_dir + clusterer + "_dba" - if results_present(results_dir, clusterer, dataset, 
resample): print("Ignoring, results already present") + elif chris_config is True: path = "C:/Users/chris/Documents/Masters" data_dir = os.path.abspath(f"{path}/datasets/Multivariate_ts/") @@ -108,7 +120,7 @@ def _recreate_results(trainX, trainY): dataset = "Handwriting" resample = 2 averaging = "mean" - tf = True + train_fold = True distance = "dtw" else: # Local run @@ -118,7 +130,7 @@ def _recreate_results(trainX, trainY): results_dir = "./temp" resample = 0 averaging = "dba" - tf = True + train_fold = True distance = "dtw" if isinstance(dataset, str): @@ -137,26 +149,22 @@ def _recreate_results(trainX, trainY): # import sys from sklearn.preprocessing import StandardScaler - - s = StandardScaler() - train_X = s.fit_transform(train_X.T) - train_X = train_X.T - test_X = s.fit_transform(test_X.T) - test_X = test_X.T + if normalise: + s = StandardScaler() + train_X = s.fit_transform(train_X.T) + train_X = train_X.T + test_X = s.fit_transform(test_X.T) + test_X = test_X.T w = 1.0 if tune: w = tune_window(distance, train_X, len(set(train_Y))) name = clusterer + "-" + distance + "-tuned" else: name = clusterer + "-" + distance - # w = 1.0 - # if ( - # distance == "wdtw" - # or distance == "dwdtw" - # or distance == "dtw" - # or distance == "wdtw" - # ): - # w = 0.2 + w = 1.0 + if (distance == "wdtw" or distance == "dwdtw" or distance == "dtw" or distance == + "wdtw"): + w = 0.2 parameters = { "window": w, "epsilon": 0.05, diff --git a/tsml_eval/experiments/regression_experiments.py b/tsml_eval/experiments/regression_experiments.py index b71a86fb..c17539f6 100644 --- a/tsml_eval/experiments/regression_experiments.py +++ b/tsml_eval/experiments/regression_experiments.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- -"""Classifier Experiments: code to run experiments as an alternative to orchestration. +"""Regressor Experiments: code to run experiments and generate results file in +standard format. This file is configured for runs of the main method with command line arguments, or for single debugging runs. Results are written in a standard format. It is cloned from @@ -10,6 +11,7 @@ import os +# Remove if not running on cluster? os.environ["MKL_NUM_THREADS"] = "1" # must be done before numpy import!! os.environ["NUMEXPR_NUM_THREADS"] = "1" # must be done before numpy import!! os.environ["OMP_NUM_THREADS"] = "1" # must be done before numpy import!! 
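A note on the thread-cap lines added at the top of regression_experiments.py above: MKL_NUM_THREADS, NUMEXPR_NUM_THREADS and OMP_NUM_THREADS are only honoured if they are set before numpy is first imported, which is why they sit above the other imports and why the patch's own comments flag them as "must be done before numpy import" and as candidates for removal off the cluster. A minimal sketch of the same guard, assuming nothing beyond what the hunk itself shows:

    # Sketch: pin low-level BLAS/OpenMP threading to one thread per process.
    # The libraries numpy loads read these variables at import time, so they
    # must be set before the first `import numpy` anywhere in the process.
    import os

    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    import numpy as np  # deliberately imported after the env vars are set
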
diff --git a/tsml_eval/experiments/set_classifier.py b/tsml_eval/experiments/set_classifier.py index b541c018..f7413db0 100644 --- a/tsml_eval/experiments/set_classifier.py +++ b/tsml_eval/experiments/set_classifier.py @@ -202,8 +202,7 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0 return HIVECOTEV1(random_state=resample_id) elif name == "hc2" or name == "hivecotev2": from sktime.classification.hybrid import HIVECOTEV2 - - return HIVECOTEV2(random_state=resample_id) + return HIVECOTEV2(random_state=resample_id, n_jobs=n_jobs) # Interval based elif name == "rise-500": from sktime.classification.interval_based import RandomIntervalSpectralEnsemble @@ -251,7 +250,6 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0 return SupervisedTimeSeriesForest(random_state=resample_id, n_jobs=n_jobs) elif name == "drcif-500": from sktime.classification.interval_based import DrCIF - return DrCIF( random_state=resample_id, n_estimators=500, @@ -268,21 +266,24 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0 elif name == "rocket" or name == "rocketclassifier": from sktime.classification.kernel_based import RocketClassifier - return RocketClassifier(random_state=resample_id) + return RocketClassifier(random_state=resample_id, n_jobs=n_jobs) elif name == "mini-rocket": from sktime.classification.kernel_based import RocketClassifier - return RocketClassifier(random_state=resample_id, rocket_transform="minirocket") + return RocketClassifier(random_state=resample_id, + rocket_transform="minirocket", n_jobs=n_jobs) elif name == "multi-rocket": from sktime.classification.kernel_based import RocketClassifier return RocketClassifier( - random_state=resample_id, rocket_transform="multirocket" + random_state=resample_id, rocket_transform="multirocket", n_jobs=n_jobs, + ) elif name == "arsenal": from sktime.classification.kernel_based import Arsenal - return Arsenal(random_state=resample_id, save_transformed_data=train_file) + return Arsenal(random_state=resample_id, save_transformed_data=train_file, + n_jobs=n_jobs) elif name == "mini-arsenal": from sktime.classification.kernel_based import Arsenal @@ -290,6 +291,7 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0 random_state=resample_id, save_transformed_data=train_file, rocket_transform="minirocket", + n_jobs=n_jobs, ) elif name == "multi-arsenal": from sktime.classification.kernel_based import Arsenal @@ -298,6 +300,7 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0 random_state=resample_id, save_transformed_data=train_file, rocket_transform="multirocket", + n_jobs=n_jobs, ) elif name == "hydra": from tsml_eval.sktime_estimators.classification.hydra import HYDRA
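The set_classifier changes above thread n_jobs through to HIVECOTEV2 and the ROCKET/Arsenal variants, and the new local-run path in classification_experiments.py builds each resample's classifier through that function. A minimal sketch of the call pattern, using only the signature and names visible in this patch (the job count, dataset-free loop and the commented-out experiment call are illustrative, not values fixed by the patch):

    # Hypothetical local run mirroring the loop added in classification_experiments.py.
    # set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0)
    # is the signature shown in the hunks above; the import path is assumed from the
    # file location tsml_eval/experiments/set_classifier.py.
    from tsml_eval.experiments.set_classifier import set_classifier

    cls_name = "HC2"      # resolved above to HIVECOTEV2(random_state=..., n_jobs=...)
    n_jobs = 4            # threads per classifier (the patch's local run uses 92)
    contract_mins = 0     # 0 = no contract
    for resample in range(30):
        classifier = set_classifier(
            cls_name,
            resample_id=resample,
            n_jobs=n_jobs,
            contract=contract_mins,
            train_file=False,
        )
        # load_and_run_classification_experiment(..., classifier=classifier,
        #     resample_id=resample, build_train=False, ...) as in the patch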