From 0ff920961a313188e90839fff9ebfa898c64ab66 Mon Sep 17 00:00:00 2001 From: Max Mauermann Date: Mon, 15 Jan 2024 17:39:38 +0100 Subject: [PATCH 1/6] retraining with multiple labels is possible. multiple labels can be specified by using their names separated by commas in the folder names --- gui.py | 9 ++++++++- train.py | 43 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/gui.py b/gui.py index 0a243b2a..abb8d95a 100644 --- a/gui.py +++ b/gui.py @@ -396,7 +396,14 @@ def select_subdirectories(): if dir_name: subdirs = utils.list_subdirectories(dir_name[0]) - return dir_name[0], [[d] for d in subdirs] + labels = [] + for folder in subdirs: + labels_in_folder = folder.split(',') + for label in labels_in_folder: + if not label in labels and not label.startswith('-') and not label in cfg.NON_EVENT_CLASSES: + labels.append(label) + + return dir_name[0], [[label] for label in sorted(labels)] return None, None diff --git a/train.py b/train.py index f400c1e9..5aa80eab 100644 --- a/train.py +++ b/train.py @@ -38,42 +38,59 @@ def _loadTrainingData(cache_mode="none", cache_file=""): print(f"\t...cache file not found: {cache_file}", flush=True) # Get list of subfolders as labels - labels = list(sorted(utils.list_subdirectories(cfg.TRAIN_DATA_PATH))) + folders = list(sorted(utils.list_subdirectories(cfg.TRAIN_DATA_PATH))) + + # Read all individual labels from the folder names + labels = [] + + for folder in folders: + labels_in_folder = folder.split(',') + for label in labels_in_folder: + if not label in labels: + labels.append(label) + + # Sort labels + labels = list(sorted(labels)) # Get valid labels valid_labels = [l for l in labels if not l.lower() in cfg.NON_EVENT_CLASSES and not l.startswith("-")] cfg.BINARY_CLASSIFICATION = len(valid_labels) == 1 + # Validate the classes for binary classification if cfg.BINARY_CLASSIFICATION: - if len([l for l in labels if l.startswith("-")]) > 0: + if len([l for l in folders if 
l.startswith("-")]) > 0: raise Exception("negative labels cant be used with binary classification") - if len([l for l in labels if l in cfg.NON_EVENT_CLASSES]) == 0: + if len([l for l in folders if l in cfg.NON_EVENT_CLASSES]) == 0: raise Exception("non-event samples are required for binary classification") # Load training data x_train = [] y_train = [] - for label in labels: + for folder in folders: - # Current label - print(f"\t- {label}", flush=True) + # Current folder + print(f"\t- {folder}", flush=True) # Get label vector label_vector = np.zeros((len(valid_labels),), dtype="float32") - if not label.lower() in cfg.NON_EVENT_CLASSES and not label.startswith("-"): - label_vector[valid_labels.index(label)] = 1 - elif label.startswith("-") and label[1:] in valid_labels: # Negative labels need to be contained in the valid labels - label_vector[valid_labels.index(label[1:])] = -1 + + folder_labels = folder.split(',') + + for label in folder_labels: + if not label.lower() in cfg.NON_EVENT_CLASSES and not label.startswith("-"): + label_vector[valid_labels.index(label)] = 1 + elif label.startswith("-") and label[1:] in valid_labels: # Negative labels need to be contained in the valid labels + label_vector[valid_labels.index(label[1:])] = -1 # Get list of files # Filter files that start with '.' because macOS seems to them for temp files. 
files = filter( os.path.isfile, ( - os.path.join(cfg.TRAIN_DATA_PATH, label, f) - for f in sorted(os.listdir(os.path.join(cfg.TRAIN_DATA_PATH, label))) + os.path.join(cfg.TRAIN_DATA_PATH, folder, f) + for f in sorted(os.listdir(os.path.join(cfg.TRAIN_DATA_PATH, folder))) if not f.startswith(".") and f.rsplit(".", 1)[-1].lower() in cfg.ALLOWED_FILETYPES ), ) @@ -87,7 +104,7 @@ def _loadTrainingData(cache_mode="none", cache_file=""): # if anything happens print the error and ignore the file except Exception as e: - # Current label + # Print Error print(f"\t Error when loading file {f}", flush=True) continue From 594abdd6a4f82d78b648b56cbce10c62da9e2fd3 Mon Sep 17 00:00:00 2001 From: Max Mauermann Date: Tue, 16 Jan 2024 15:52:13 +0100 Subject: [PATCH 2/6] multilabel is set in config, only repeat-upsampling allowed for multilabel setting --- config.py | 7 ++++++- model.py | 2 +- train.py | 24 +++++++++++++++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/config.py b/config.py index c963a4d6..c911b3f6 100644 --- a/config.py +++ b/config.py @@ -166,9 +166,14 @@ # Mutliple executions will be averaged, so the evaluation is more consistent AUTOTUNE_EXECUTIONS_PER_TRIAL: int = 1 -# If a binary classification model is trained, this value will be detected automatically in the training script +# If a binary classification model is trained. +# This value will be detected automatically in the training script, if only one class and a non-event class is used. BINARY_CLASSIFICATION: bool = False +# If a model for a multi-label setting is trained. +# This value will automatically be set, if subfolders in the input directory are named with multiple classes separated by commas. 
+MULTI_LABEL: bool = False + ##################### # Misc runtime vars # ##################### diff --git a/model.py b/model.py index ccb4f35a..b31cb272 100644 --- a/model.py +++ b/model.py @@ -252,7 +252,7 @@ def on_epoch_end(self, epoch, logs=None): classifier.compile( optimizer=keras.optimizers.Adam(learning_rate=lr_schedule), loss=custom_loss, - metrics=[keras.metrics.AUC(curve="PR", multi_label=False, name="AUPRC"), keras.metrics.AUC(curve="ROC", multi_label=False, name="AUROC")], + metrics=[keras.metrics.AUC(curve="PR", multi_label=cfg.MULTI_LABEL, name="AUPRC"), keras.metrics.AUC(curve="ROC", multi_label=cfg.MULTI_LABEL, name="AUROC")], ) # Train model diff --git a/train.py b/train.py index 5aa80eab..12730d0a 100644 --- a/train.py +++ b/train.py @@ -40,6 +40,7 @@ def _loadTrainingData(cache_mode="none", cache_file=""): # Get list of subfolders as labels folders = list(sorted(utils.list_subdirectories(cfg.TRAIN_DATA_PATH))) + # Read all individual labels from the folder names labels = [] @@ -55,14 +56,26 @@ def _loadTrainingData(cache_mode="none", cache_file=""): # Get valid labels valid_labels = [l for l in labels if not l.lower() in cfg.NON_EVENT_CLASSES and not l.startswith("-")] + # Check if binary classification cfg.BINARY_CLASSIFICATION = len(valid_labels) == 1 # Validate the classes for binary classification if cfg.BINARY_CLASSIFICATION: if len([l for l in folders if l.startswith("-")]) > 0: - raise Exception("negative labels cant be used with binary classification") + raise Exception("Negative labels cant be used with binary classification") if len([l for l in folders if l in cfg.NON_EVENT_CLASSES]) == 0: - raise Exception("non-event samples are required for binary classification") + raise Exception("Non-event samples are required for binary classification") + + # Check if multi label + cfg.MULTI_LABEL = len(valid_labels) > 1 and any(',' in f for f in folders) + + # Check if multi-label and binary classficication + if cfg.BINARY_CLASSIFICATION and 
cfg.MULTI_LABEL: + raise Exception("Error: Binary classfication and multi-label not possible at the same time") + + # Only allow repeat upsampling for multi-label setting + if cfg.MULTI_LABEL and cfg.UPSAMPLING_RATIO > 0 and cfg.UPSAMPLING_MODE != 'repeat': + raise Exception("Only repeat-upsampling ist available for multi-label") # Load training data x_train = [] @@ -178,6 +191,11 @@ def run_trial(self, trial, *args, **kwargs): dropout=hp.Choice("dropout", [0.0, 0.25, 0.33, 0.5, 0.75, 0.9], default=cfg.TRAIN_DROPOUT)) print("...Done.", flush=True) + # Only allow repeat upsampling in multi-label setting + upsampling_choices = ['repeat', 'mean', 'linear'] #SMOTE is too slow + if cfg.MULTI_LABEL: + upsampling_choices = ['repeat'] + # Train model print("Training model...", flush=True) classifier, history = model.trainLinearClassifier( @@ -189,7 +207,7 @@ def run_trial(self, trial, *args, **kwargs): learning_rate=hp.Choice("learning_rate", [0.1, 0.01, 0.005, 0.002, 0.001, 0.0005, 0.0002, 0.0001], default=cfg.TRAIN_LEARNING_RATE), val_split=cfg.TRAIN_VAL_SPLIT, upsampling_ratio=hp.Choice("upsampling_ratio",[0.0, 0.25, 0.33, 0.5, 0.75, 1.0], default=cfg.UPSAMPLING_RATIO), - upsampling_mode=hp.Choice("upsampling_mode", ['repeat', 'mean', 'linear'], default=cfg.UPSAMPLING_MODE), #SMOTE is too slow + upsampling_mode=hp.Choice("upsampling_mode", upsampling_choices, default=cfg.UPSAMPLING_MODE), train_with_mixup=hp.Boolean("mixup", default=cfg.TRAIN_WITH_MIXUP), train_with_label_smoothing=hp.Boolean("label_smoothing", default=cfg.TRAIN_WITH_LABEL_SMOOTHING), ) From eccbbe6c8e630e92b71c063ed764aff97d72013b Mon Sep 17 00:00:00 2001 From: Max Mauermann Date: Tue, 16 Jan 2024 18:07:23 +0100 Subject: [PATCH 3/6] reworked datasplit for multi label, multilabel/binary config gets saved in cache file --- model.py | 6 ++++- train.py | 2 +- utils.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/model.py 
b/model.py index b31cb272..939e1589 100644 --- a/model.py +++ b/model.py @@ -214,7 +214,11 @@ def on_epoch_end(self, epoch, logs=None): y_train = y_train[idx] # Random val split - x_train, y_train, x_val, y_val = utils.random_split(x_train, y_train, val_split) + if not cfg.MULTI_LABEL: + x_train, y_train, x_val, y_val = utils.random_split(x_train, y_train, val_split) + else: + x_train, y_train, x_val, y_val = utils.random_multilabel_split(x_train, y_train, val_split) + print( f"Training on {x_train.shape[0]} samples, validating on {x_val.shape[0]} samples.", flush=True, diff --git a/train.py b/train.py index 12730d0a..3755bb6d 100644 --- a/train.py +++ b/train.py @@ -32,7 +32,7 @@ def _loadTrainingData(cache_mode="none", cache_file=""): if cache_mode == "load": if os.path.isfile(cache_file): print(f"\t...loading from cache: {cache_file}", flush=True) - x_train, y_train, labels = utils.loadFromCache(cache_file) + x_train, y_train, labels, cfg.BINARY_CLASSIFICATION, cfg.MULTI_LABEL = utils.loadFromCache(cache_file) return x_train, y_train, labels else: print(f"\t...cache file not found: {cache_file}", flush=True) diff --git a/utils.py b/utils.py index 8eb4e289..6b1e9034 100644 --- a/utils.py +++ b/utils.py @@ -56,6 +56,73 @@ def list_subdirectories(path: str): """ return filter(lambda el: os.path.isdir(os.path.join(path, el)), os.listdir(path)) +def random_multilabel_split(x, y, val_ratio=0.2): + """Splits the data into training and validation data. + + Makes sure that each combination of classes is represented in both sets. + + Args: + x: Samples. + y: One-hot labels. + val_ratio: The ratio of validation data. + + Returns: + A tuple of (x_train, y_train, x_val, y_val). 
+ + """ + + # Set numpy random seed + np.random.seed(cfg.RANDOM_SEED) + + # Find all combinations of labels + class_combinations = np.unique(y, axis=0) + + # Initialize training and validation data + x_train, y_train, x_val, y_val = [], [], [], [] + + # Split the data for each combination of labels + for class_combination in class_combinations: + # find all indices + indices = np.where((y == class_combination).all(axis=1))[0] + + # When negative sample use only for training + if -1 in class_combination: + x_train.append(x[indices]) + y_train.append(y[indices]) + # Otherwise split according to the validation split + else: + # Get number of samples for each set + num_samples = len(indices) + num_samples_train = max(1, int(num_samples * (1 - val_ratio))) + num_samples_val = max(0, num_samples - num_samples_train) + # Randomly choose samples for training and validation + np.random.shuffle(indices) + train_indices = indices[:num_samples_train] + val_indices = indices[num_samples_train:num_samples_train + num_samples_val] + # Append samples to training and validation data + x_train.append(x[train_indices]) + y_train.append(y[train_indices]) + x_val.append(x[val_indices]) + y_val.append(y[val_indices]) + + # Concatenate data + x_train = np.concatenate(x_train) + y_train = np.concatenate(y_train) + x_val = np.concatenate(x_val) + y_val = np.concatenate(y_val) + + # Shuffle data + indices = np.arange(len(x_train)) + np.random.shuffle(indices) + x_train = x_train[indices] + y_train = y_train[indices] + + indices = np.arange(len(x_val)) + np.random.shuffle(indices) + x_val = x_val[indices] + y_val = y_val[indices] + + return x_train, y_train, x_val, y_val def random_split(x, y, val_ratio=0.2): """Splits the data into training and validation data. 
@@ -331,7 +398,7 @@ def saveToCache(cache_file: str, x_train: np.ndarray, y_train: np.ndarray, label os.makedirs(os.path.dirname(cache_file), exist_ok=True) # Save to cache - np.savez_compressed(cache_file, x_train=x_train, y_train=y_train, labels=labels) + np.savez_compressed(cache_file, x_train=x_train, y_train=y_train, labels=labels, binary_classification=cfg.BINARY_CLASSIFICATION, multi_label=cfg.MULTI_LABEL) def loadFromCache(cache_file: str): @@ -351,8 +418,10 @@ def loadFromCache(cache_file: str): x_train = cache["x_train"] y_train = cache["y_train"] labels = cache["labels"] + binary_classification = bool(cache["binary_classification"]) + multi_label = bool(cache["multi_label"]) - return x_train, y_train, labels + return x_train, y_train, labels, binary_classification, multi_label def clearErrorLog(): From 1e0a65f463d270c496c17cc5463667416d50eef9 Mon Sep 17 00:00:00 2001 From: Max Mauermann Date: Mon, 22 Jan 2024 12:04:03 +0100 Subject: [PATCH 4/6] set from_logits=True for metrics and specify number of labels --- model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/model.py b/model.py index 939e1589..124f4d8f 100644 --- a/model.py +++ b/model.py @@ -161,7 +161,6 @@ def buildLinearClassifier(num_labels, input_size, hidden_units=0, dropout=0.0): return model - def trainLinearClassifier( classifier, x_train, @@ -256,7 +255,10 @@ def on_epoch_end(self, epoch, logs=None): classifier.compile( optimizer=keras.optimizers.Adam(learning_rate=lr_schedule), loss=custom_loss, - metrics=[keras.metrics.AUC(curve="PR", multi_label=cfg.MULTI_LABEL, name="AUPRC"), keras.metrics.AUC(curve="ROC", multi_label=cfg.MULTI_LABEL, name="AUROC")], + metrics=[ + keras.metrics.AUC(curve="PR", multi_label=cfg.MULTI_LABEL, name="AUPRC", num_labels=y_train.shape[1], from_logits=True), + keras.metrics.AUC(curve="ROC", multi_label=cfg.MULTI_LABEL, name="AUROC", num_labels=y_train.shape[1], from_logits=True) + ] ) # Train model @@ -266,7 +268,7 @@ def 
on_epoch_end(self, epoch, logs=None): epochs=epochs, batch_size=batch_size, validation_data=(x_val, y_val), - callbacks=callbacks, + callbacks=callbacks ) return classifier, history From d7d805f0374f66950886a99be38293adbf20f2a5 Mon Sep 17 00:00:00 2001 From: Max Mauermann Date: Tue, 30 Jan 2024 16:56:55 +0100 Subject: [PATCH 5/6] updated readme for multi label and negative samples --- README.adoc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.adoc b/README.adoc index c6edc91c..2a818643 100644 --- a/README.adoc +++ b/README.adoc @@ -734,6 +734,10 @@ python3 analyze.py --classifier checkpoints/custom/Custom_Classifier.tflite + NOTE: Setting a custom classifier will also set the new labels file. Due to these custom labels, the location filter and locale will be disabled. ++ +. You can include negative samples for classes by prefixing the folder names with a '-' (e.g., `-Poecile atricapillus_Black-capped Chickadee`). Do this with samples that definitely do not contain the species. Negative samples will only be used for training and not for validation. Also keep in mind that negative samples will only be used when a corresponding folder with positive samples exists. ++ +. To train with multi-label data, separate the class labels with commas in the folder names, without spaces around the commas (e.g., `Poecile atricapillus_Black-capped Chickadee,Cardinalis cardinalis_Northern Cardinal`). This can also be combined with negative samples as described above. The validation split will be performed per combination of classes, so you might want to ensure sufficient data for each combination of classes. When using multi-label data, the upsampling mode will be limited to 'repeat'. 
== Funding From e41fdf688ca29a9193de59a745850c6ff7d7d3f0 Mon Sep 17 00:00:00 2001 From: Max Mauermann Date: Wed, 31 Jan 2024 17:09:53 +0100 Subject: [PATCH 6/6] older caches supported and metric still works when not using multilabel --- model.py | 4 ++-- utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/model.py b/model.py index 124f4d8f..d108294c 100644 --- a/model.py +++ b/model.py @@ -256,8 +256,8 @@ def on_epoch_end(self, epoch, logs=None): optimizer=keras.optimizers.Adam(learning_rate=lr_schedule), loss=custom_loss, metrics=[ - keras.metrics.AUC(curve="PR", multi_label=cfg.MULTI_LABEL, name="AUPRC", num_labels=y_train.shape[1], from_logits=True), - keras.metrics.AUC(curve="ROC", multi_label=cfg.MULTI_LABEL, name="AUROC", num_labels=y_train.shape[1], from_logits=True) + keras.metrics.AUC(curve="PR", multi_label=cfg.MULTI_LABEL, name="AUPRC", num_labels=y_train.shape[1] if cfg.MULTI_LABEL else None, from_logits=True), + keras.metrics.AUC(curve="ROC", multi_label=cfg.MULTI_LABEL, name="AUROC", num_labels=y_train.shape[1] if cfg.MULTI_LABEL else None, from_logits=True) ] ) diff --git a/utils.py b/utils.py index 6b1e9034..7ed5c229 100644 --- a/utils.py +++ b/utils.py @@ -418,8 +418,8 @@ def loadFromCache(cache_file: str): x_train = cache["x_train"] y_train = cache["y_train"] labels = cache["labels"] - binary_classification = bool(cache["binary_classification"]) - multi_label = bool(cache["multi_label"]) + binary_classification = bool(cache["binary_classification"]) if "binary_classification" in cache.keys() else False + multi_label = bool(cache["multi_label"]) if "multi_label" in cache.keys() else False return x_train, y_train, labels, binary_classification, multi_label