Tony experiments (#29)

* remove contract * remove contract * set n_jobs in ROCKET variants * threaded rocket * switch to threaded DrCIF * switch to threaded DrCIF * switch to threaded DrCIF * add temp DrCIF to test Parallel options * add temp DrCIF to test Parallel options * revert * set train file flag correctly * iterate DrCIF over 30 Tiselac resamples * Switch to DrCIF with FaceDetection * DrCIF with FaceDetection * DrCIF with InsectWingbeatEq * comment on regression_experiments.py * adjust set_regressor.py * switch to HC2 EigenWorms * switch to HC2 EigenWorms * switch to HC2 EigenWorms * tweak scripts * switch output dir creation
time-series-machine-learning · Dec 14, 2022 · 2af30a4 · 2af30a4
1 parent 16f9bf3
commit 2af30a4
Show file tree

Hide file tree

Showing 10 changed files with 121 additions and 57 deletions.
diff --git a/ada_uea_experiments/classification_experiment.sh b/ada_uea_experiments/classification_experiment.sh
@@ -11,7 +11,7 @@ max_folds=30
 start_fold=1
 
 # To avoid dumping 1000s of jobs in the queue we have a higher level queue
-max_num_submitted=100
+max_num_submitted=500
 
 # Queue options are https://my.uea.ac.uk/divisions/it-and-computing-services/service-catalogue/research-it-services/hpc/ada-cluster/using-ada
 queue="compute-64-512"
@@ -42,11 +42,12 @@ results_dir=$local_path"ClassificationResults/sktime/"
 out_dir=$local_path"ClassificationResults/output/"
 
 # The python script we are running
-script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiments/classification_experiments.py"
+script_file_path=$local_path"Code/tsml-eval/tsml_eval/experiments\
+/classification_experiments.py"
 
 # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ
 # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended
-env_name="est-eval"
+env_name="eval"
 
 # Generating train folds is usually slower, set to false unless you need them
 generate_train_files="false"
@@ -93,7 +94,9 @@ do
     fi
 done
 
+
 if [ "${array_jobs}" != "" ]; then
+mkdir -p "${out_dir}${classifier}/${dataset}/"
 
 # This creates the scrip to run the job based on the info above
 echo "#!/bin/bash

diff --git a/ada_uea_experiments/clustering_experiments.sh b/ada_uea_experiments/clustering_experiments.sh
@@ -46,7 +46,7 @@ script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiment
 
 # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ
 # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended
-env_name="est-eval"
+env_name="eval"
 
 # todo this is currently only in for file skipping, should always be generating train files. need to rework clustering experiments more
 generate_train_files="true"

diff --git a/ada_uea_experiments/distance_clustering_experiments.sh b/ada_uea_experiments/distance_clustering_experiments.sh
@@ -46,11 +46,12 @@ script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiment
 
 # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ
 # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended
-env_name="est-eval"
+env_name="eval"
 
 generate_train_files="false"
 clusterer="kmeans"
 averaging="mean"
+normalise=""
 
 count=0
 # dtw ddtw erp edr wdtw lcss twe msm dwdtw euclidean

diff --git a/ada_uea_experiments/regression_experiment.sh b/ada_uea_experiments/regression_experiment.sh
@@ -11,7 +11,7 @@ max_folds=30
 start_fold=1
 
 # To avoid dumping 1000s of jobs in the queue we have a higher level queue
-max_num_submitted=100
+max_num_submitted=500
 
 # Queue options are https://my.uea.ac.uk/divisions/it-and-computing-services/service-catalogue/research-it-services/hpc/ada-cluster/using-ada
 queue="compute-64-512"
@@ -42,11 +42,12 @@ results_dir=$local_path"RegressionResults/sktime/"
 out_dir=$local_path"RegressionResults/output/"
 
 # The python script we are running
-script_file_path=$local_path"Code/tsml-estimator-evaluation/tsml_eval/experiments/regression_experiments.py"
+script_file_path=$local_path"Code/tsml-eval/tsml_eval/experiments/regression_experiments\
+.py"
 
 # Environment name, change accordingly, for set up, see https://hackmd.io/ds5IEK3oQAquD4c6AP2xzQ
 # Separate environments for GPU (default python/anaconda/2020.11/3.8) and CPU (default python/anaconda/2019.10/3.7) are recommended
-env_name="est-eval"
+env_name="eval"
 
 # Generating train folds is usually slower, set to false unless you need them
 generate_train_files="false"

diff --git a/tsml_eval/debug.py b/tsml_eval/debug.py
@@ -0,0 +1,34 @@
+"""Scratch script: DTW distance on two toy series, then data load/resample timing."""
+import time
+from sktime.datasets import load_from_tsfile
+from sktime.utils.sampling import stratified_resample
+from sktime.distances import dtw_distance
+import numpy as np
+instance1 = np.array([[1,2,3,4], [4,3,2,1]])
+instance2 = np.array([[2,3,4,5], [5,4,3,2]])
+print(" shape is [n_dimensions, series_length] = ", instance1.shape)
+print(" DTW_D is = ", dtw_distance(instance1, instance2))
+
+
+def time_data_load():
+    dataset = ["InsectWingbeatEq"]  # problem names to time, read from C:/Data/<name>/
+    for file in dataset:
+        start = time.time()
+        x, y = load_from_tsfile(f"C:/Data/{file}/{file}_TRAIN.ts")
+        x2, y2 = load_from_tsfile(f"C:/Data/{file}/{file}_TEST.ts")
+        end = time.time()
+        print(f" Load pandas for problem {file} time taken = {end-start}")
+        start = time.time()
+        x, y, x2, y2 = stratified_resample(x, y, x2, y2, 1)  # NOTE(review): 1 is presumably the resample seed - confirm
+        end = time.time()
+        print(f" resample time problem  {file} time taken = {end-start}")
+#        start = time.time()
+#        x, y = load_from_tsfile(f"C:/Data/{file}/{file}_TRAIN.ts",
+#                                return_data_type="numpy3d")
+#        x2, y2 = load_from_tsfile(f"C:/Data/{file}/{file}_TEST.ts",
+#                                return_data_type="numpy3d")
+#        end = time.time()
+#        print(f" Load numpy for problem  {file} time taken = {end-start}")
+
+
+time_data_load()
diff --git a/tsml_eval/experiments/classification_experiments.py b/tsml_eval/experiments/classification_experiments.py
@@ -75,27 +75,33 @@ def run_experiment(args, overwrite=False):
                 overwrite=overwrite,
             )
     else:  # Local run
-        data_dir = "../"
-        results_dir = "../"
-        cls_name = "DrCIF"
-        dataset = "ItalyPowerDemand"
-        resample = 0
+        data_dir = "/home/ajb/Data/"
+        results_dir = "/home/ajb/Results Working Area/ReduxBakeoff/sktime/"
+        cls_name = "HC2"
+        n_jobs = 92
+        contract_mins = 0
+        dataset = "EigenWorms"
+        print(f" Local Run of {cls_name} on dataset {dataset} with threading jobs "
+              f"={ n_jobs} and "
+              f"contract time ={contract_mins}")
         train_fold = False
         predefined_resample = False
-        classifier = set_classifier(cls_name, resample, train_fold)
-        print(f"Local Run of {classifier.__class__.__name__}.")
-
-        load_and_run_classification_experiment(
-            overwrite=False,
-            problem_path=data_dir,
-            results_path=results_dir,
-            cls_name=cls_name,
-            classifier=classifier,
-            dataset=dataset,
-            resample_id=resample,
-            build_train=train_fold,
-            predefined_resample=predefined_resample,
-        )
+        for resample in range(0, 30):
+            classifier = set_classifier(cls_name, resample_id=resample, n_jobs=n_jobs,
+                                        contract=contract_mins, train_file=train_fold)
+            print(f"Local Run of {classifier.__class__.__name__} with {classifier.n_jobs} jobs")
+
+            load_and_run_classification_experiment(
+                overwrite=False,
+                problem_path=data_dir,
+                results_path=results_dir,
+                cls_name=cls_name,
+                classifier=classifier,
+                dataset=dataset,
+                resample_id=resample,
+                build_train=train_fold,
+                predefined_resample=predefined_resample,
+            )
 
 
 if __name__ == "__main__":

diff --git a/tsml_eval/experiments/debug.py b/tsml_eval/experiments/debug.py
@@ -0,0 +1,6 @@
+# bool() of any non-empty string is True, so all four values printed below are True.
+tf = bool("false")
+tf2 = bool("False")
+tf3 = bool("true")
+tf4 = bool("True")
+print(f" {tf}  {tf2}  {tf3}  {tf4}")
diff --git a/tsml_eval/experiments/distance_clustering_experiments.py b/tsml_eval/experiments/distance_clustering_experiments.py
@@ -86,29 +86,41 @@ def _recreate_results(trainX, trainY):
     clusterer = "kmeans"
     chris_config = True  # This is so chris doesn't have to change config each time
     tune = False
-    if sys.argv.__len__() > 1:  # cluster run, this is fragile
+    normalise = True
+    if sys.argv.__len__() > 1:  # cluster run, this is fragile, requires all args atm
         data_dir = sys.argv[1]
         results_dir = sys.argv[2]
         distance = sys.argv[3]
         dataset = sys.argv[4]
-        resample = int(sys.argv[5]) - 1
-        tf = bool(sys.argv[6])
-        clusterer = sys.argv[7]
-        averaging = sys.argv[8]
+        # ADA starts indexing its jobs at 1, so we need to subtract 1
+        resample = int(sys.argv[5]) - 1
+        clusterer = sys.argv[6]
+        if len(sys.argv) > 7:
+            train_fold = sys.argv[7].lower() == "true"
+        else:
+            train_fold = False
+        if len(sys.argv) > 8:
+            averaging = sys.argv[8]
+        else:
+            averaging = "mean"
+        if len(sys.argv) > 9:
+            normalise = sys.argv[9].lower() == "true"
+        else:
+            normalise = False
         if averaging == "dba":
             results_dir = results_dir + clusterer + "_dba"
-
         if results_present(results_dir, clusterer, dataset, resample):
             print("Ignoring, results already present")
 
+
     elif chris_config is True:
         path = "C:/Users/chris/Documents/Masters"
         data_dir = os.path.abspath(f"{path}/datasets/Multivariate_ts/")
         results_dir = os.path.abspath(f"{path}/results/")
         dataset = "Handwriting"
         resample = 2
         averaging = "mean"
-        tf = True
+        train_fold = True
         distance = "dtw"
 
     else:  # Local run
@@ -118,7 +130,7 @@ def _recreate_results(trainX, trainY):
         results_dir = "./temp"
         resample = 0
         averaging = "dba"
-        tf = True
+        train_fold = True
         distance = "dtw"
 
     if isinstance(dataset, str):
@@ -137,26 +149,22 @@ def _recreate_results(trainX, trainY):
     #    import sys
 
     from sklearn.preprocessing import StandardScaler
-
-    s = StandardScaler()
-    train_X = s.fit_transform(train_X.T)
-    train_X = train_X.T
-    test_X = s.fit_transform(test_X.T)
-    test_X = test_X.T
+    if normalise:
+        s = StandardScaler()
+        train_X = s.fit_transform(train_X.T)
+        train_X = train_X.T
+        test_X = s.fit_transform(test_X.T)
+        test_X = test_X.T
     w = 1.0
     if tune:
         w = tune_window(distance, train_X, len(set(train_Y)))
         name = clusterer + "-" + distance + "-tuned"
     else:
         name = clusterer + "-" + distance
-    #     w = 1.0
-    #     if (
-    #         distance == "wdtw"
-    #         or distance == "dwdtw"
-    #         or distance == "dtw"
-    #         or distance == "wdtw"
-    #     ):
-    #         w = 0.2
+    # w is already 1.0 (or the tuned window) from above, so it must not be reset here.
+    # Untuned runs of these distances use a fixed window of 0.2 instead.
+    if not tune and distance in ("wdtw", "dwdtw", "dtw"):
+        w = 0.2
     parameters = {
         "window": w,
         "epsilon": 0.05,

diff --git a/tsml_eval/experiments/regression_experiments.py b/tsml_eval/experiments/regression_experiments.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
-"""Classifier Experiments: code to run experiments as an alternative to orchestration.
+"""Regressor Experiments: code to run experiments and generate results file in
+standard format.
 
 This file is configured for runs of the main method with command line arguments, or for
 single debugging runs. Results are written in a standard format. It is cloned from
@@ -10,6 +11,7 @@
 
 import os
 
+# Remove if not running on cluster?
 os.environ["MKL_NUM_THREADS"] = "1"  # must be done before numpy import!!
 os.environ["NUMEXPR_NUM_THREADS"] = "1"  # must be done before numpy import!!
 os.environ["OMP_NUM_THREADS"] = "1"  # must be done before numpy import!!

diff --git a/tsml_eval/experiments/set_classifier.py b/tsml_eval/experiments/set_classifier.py
@@ -202,8 +202,7 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0
         return HIVECOTEV1(random_state=resample_id)
     elif name == "hc2" or name == "hivecotev2":
         from sktime.classification.hybrid import HIVECOTEV2
-
-        return HIVECOTEV2(random_state=resample_id)
+        return HIVECOTEV2(random_state=resample_id, n_jobs=n_jobs)
     # Interval based
     elif name == "rise-500":
         from sktime.classification.interval_based import RandomIntervalSpectralEnsemble
@@ -251,7 +250,6 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0
         return SupervisedTimeSeriesForest(random_state=resample_id, n_jobs=n_jobs)
     elif name == "drcif-500":
         from sktime.classification.interval_based import DrCIF
-
         return DrCIF(
             random_state=resample_id,
             n_estimators=500,
@@ -268,28 +266,32 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0
     elif name == "rocket" or name == "rocketclassifier":
         from sktime.classification.kernel_based import RocketClassifier
 
-        return RocketClassifier(random_state=resample_id)
+        return RocketClassifier(random_state=resample_id, n_jobs=n_jobs)
     elif name == "mini-rocket":
         from sktime.classification.kernel_based import RocketClassifier
 
-        return RocketClassifier(random_state=resample_id, rocket_transform="minirocket")
+        return RocketClassifier(random_state=resample_id,
+                                rocket_transform="minirocket", n_jobs=n_jobs)
     elif name == "multi-rocket":
         from sktime.classification.kernel_based import RocketClassifier
 
         return RocketClassifier(
-            random_state=resample_id, rocket_transform="multirocket"
+            random_state=resample_id, rocket_transform="multirocket", n_jobs=n_jobs,
+
         )
     elif name == "arsenal":
         from sktime.classification.kernel_based import Arsenal
 
-        return Arsenal(random_state=resample_id, save_transformed_data=train_file)
+        return Arsenal(random_state=resample_id, save_transformed_data=train_file,
+                       n_jobs=n_jobs)
     elif name == "mini-arsenal":
         from sktime.classification.kernel_based import Arsenal
 
         return Arsenal(
             random_state=resample_id,
             save_transformed_data=train_file,
             rocket_transform="minirocket",
+            n_jobs=n_jobs,
         )
     elif name == "multi-arsenal":
         from sktime.classification.kernel_based import Arsenal
@@ -298,6 +300,7 @@ def set_classifier(cls, resample_id=None, train_file=False, n_jobs=1, contract=0
             random_state=resample_id,
             save_transformed_data=train_file,
             rocket_transform="multirocket",
+            n_jobs=n_jobs,
         )
     elif name == "hydra":
         from tsml_eval.sktime_estimators.classification.hydra import HYDRA