Merge pull request #241 from kahst/multi-label-training

Multi label training
kahst · Jan 31, 2024 · 056e749 · 056e749
2 parents 2c2c5fa + e41fdf6
commit 056e749
Show file tree

Hide file tree

Showing 6 changed files with 151 additions and 25 deletions.
diff --git a/README.adoc b/README.adoc
@@ -742,6 +742,10 @@ python3 analyze.py --classifier checkpoints/custom/Custom_Classifier.tflite
 +
 NOTE: Setting a custom classifier will also set the new labels file.
 Due to these custom labels, the location filter and locale will be disabled.
++
+. You can include negative samples for classes by prefixing the folder names with a '-' (e.g., `-Poecile atricapillus_Black-capped Chickadee`). Do this with samples that definitely do not contain the species. Negative samples will only be used for training and not for validation. Also keep in mind that negative samples will only be used when a corresponding folder with positive samples exists.
++
+. To train with multi-label data, separate the class labels with commas in the folder names (e.g., `Poecile atricapillus_Black-capped Chickadee, Cardinalis cardinalis_Northern Cardinal`). This can also be combined with negative samples as described above. The validation split will be performed per combination of classes, so you might want to ensure sufficient data for each combination of classes. When using multi-label data, the upsampling mode will be limited to 'repeat'.
 
 == Funding
 

diff --git a/config.py b/config.py
@@ -166,9 +166,14 @@
 # Multiple executions will be averaged, so the evaluation is more consistent
 AUTOTUNE_EXECUTIONS_PER_TRIAL: int = 1
 
-# If a binary classification model is trained, this value will be detected automatically in the training script
+# If a binary classification model is trained.
+# This value will be detected automatically in the training script, if only one class and a non-event class is used.
 BINARY_CLASSIFICATION: bool = False
 
+# If a model for a multi-label setting is trained.
+# This value will automatically be set, if subfolders in the input directory are named with multiple classes separated by commas.
+MULTI_LABEL: bool = False
+
 #####################
 # Misc runtime vars #
 #####################

diff --git a/gui.py b/gui.py
@@ -396,7 +396,14 @@ def select_subdirectories():
     if dir_name:
         subdirs = utils.list_subdirectories(dir_name[0])
 
-        return dir_name[0], [[d] for d in subdirs]
+        labels = []
+        for folder in subdirs:
+            labels_in_folder = folder.split(',')
+            for label in labels_in_folder:
+                if not label in labels and not label.startswith('-') and not label in cfg.NON_EVENT_CLASSES:
+                    labels.append(label)
+
+        return dir_name[0], [[label] for label in sorted(labels)]
 
     return None, None
 

diff --git a/model.py b/model.py
@@ -161,7 +161,6 @@ def buildLinearClassifier(num_labels, input_size, hidden_units=0, dropout=0.0):
 
     return model
 
-
 def trainLinearClassifier(
     classifier,
     x_train,
@@ -214,7 +213,11 @@ def on_epoch_end(self, epoch, logs=None):
     y_train = y_train[idx]
 
     # Random val split
-    x_train, y_train, x_val, y_val = utils.random_split(x_train, y_train, val_split)
+    if not cfg.MULTI_LABEL:
+        x_train, y_train, x_val, y_val = utils.random_split(x_train, y_train, val_split)
+    else:
+        x_train, y_train, x_val, y_val = utils.random_multilabel_split(x_train, y_train, val_split)
+
     print(
         f"Training on {x_train.shape[0]} samples, validating on {x_val.shape[0]} samples.",
         flush=True,
@@ -252,7 +255,10 @@ def on_epoch_end(self, epoch, logs=None):
     classifier.compile(
         optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
         loss=custom_loss,
-        metrics=[keras.metrics.AUC(curve="PR", multi_label=False, name="AUPRC"), keras.metrics.AUC(curve="ROC", multi_label=False, name="AUROC")],
+        metrics=[
+            keras.metrics.AUC(curve="PR", multi_label=cfg.MULTI_LABEL, name="AUPRC", num_labels=y_train.shape[1] if cfg.MULTI_LABEL else None, from_logits=True),
+            keras.metrics.AUC(curve="ROC", multi_label=cfg.MULTI_LABEL, name="AUROC", num_labels=y_train.shape[1] if cfg.MULTI_LABEL else None, from_logits=True)
+        ]
     )
 
     # Train model
@@ -262,7 +268,7 @@ def on_epoch_end(self, epoch, logs=None):
         epochs=epochs,
         batch_size=batch_size,
         validation_data=(x_val, y_val),
-        callbacks=callbacks,
+        callbacks=callbacks
     )
 
     return classifier, history

diff --git a/train.py b/train.py
@@ -32,48 +32,78 @@ def _loadTrainingData(cache_mode="none", cache_file=""):
     if cache_mode == "load":
         if os.path.isfile(cache_file):
             print(f"\t...loading from cache: {cache_file}", flush=True)
-            x_train, y_train, labels = utils.loadFromCache(cache_file)
+            x_train, y_train, labels, cfg.BINARY_CLASSIFICATION, cfg.MULTI_LABEL = utils.loadFromCache(cache_file)
             return x_train, y_train, labels
         else:
             print(f"\t...cache file not found: {cache_file}", flush=True)
 
     # Get list of subfolders as labels
-    labels = list(sorted(utils.list_subdirectories(cfg.TRAIN_DATA_PATH)))
+    folders = list(sorted(utils.list_subdirectories(cfg.TRAIN_DATA_PATH)))
+
+
+    # Read all individual labels from the folder names
+    labels = []
+
+    for folder in folders:
+        labels_in_folder = folder.split(',')
+        for label in labels_in_folder:
+            if not label in labels:
+                labels.append(label)
+
+    # Sort labels
+    labels = list(sorted(labels))
 
     # Get valid labels
     valid_labels = [l for l in labels if not l.lower() in cfg.NON_EVENT_CLASSES and not l.startswith("-")] 
 
+    # Check if binary classification
     cfg.BINARY_CLASSIFICATION = len(valid_labels) == 1
 
+    # Validate the classes for binary classification
     if cfg.BINARY_CLASSIFICATION:
-        if len([l for l in labels if l.startswith("-")]) > 0:
-            raise Exception("negative labels cant be used with binary classification")
-        if len([l for l in labels if l in cfg.NON_EVENT_CLASSES]) == 0:
-            raise Exception("non-event samples are required for binary classification")
+        if len([l for l in folders if l.startswith("-")]) > 0:
+            raise Exception("Negative labels cant be used with binary classification")
+        if len([l for l in folders if l in cfg.NON_EVENT_CLASSES]) == 0:
+            raise Exception("Non-event samples are required for binary classification")
+
+    # Check if multi label
+    cfg.MULTI_LABEL = len(valid_labels) > 1 and any(',' in f for f in folders)
+
+    # Check if multi-label and binary classification
+    if cfg.BINARY_CLASSIFICATION and cfg.MULTI_LABEL:
+        raise Exception("Error: Binary classfication and multi-label not possible at the same time")
+
+    # Only allow repeat upsampling for multi-label setting
+    if cfg.MULTI_LABEL and cfg.UPSAMPLING_RATIO > 0 and cfg.UPSAMPLING_MODE != 'repeat':
+        raise Exception("Only repeat-upsampling ist available for multi-label")
 
     # Load training data
     x_train = []
     y_train = []
 
-    for label in labels:
+    for folder in folders:
 
-        # Current label
-        print(f"\t- {label}", flush=True)
+        # Current folder
+        print(f"\t- {folder}", flush=True)
 
         # Get label vector
         label_vector = np.zeros((len(valid_labels),), dtype="float32")
-        if not label.lower() in cfg.NON_EVENT_CLASSES and not label.startswith("-"):
-            label_vector[valid_labels.index(label)] = 1
-        elif label.startswith("-") and label[1:] in valid_labels: # Negative labels need to be contained in the valid labels
-            label_vector[valid_labels.index(label[1:])] = -1
+
+        folder_labels = folder.split(',')
+
+        for label in folder_labels:
+            if not label.lower() in cfg.NON_EVENT_CLASSES and not label.startswith("-"):
+                label_vector[valid_labels.index(label)] = 1
+            elif label.startswith("-") and label[1:] in valid_labels: # Negative labels need to be contained in the valid labels
+                label_vector[valid_labels.index(label[1:])] = -1
 
         # Get list of files
         # Filter files that start with '.' because macOS seems to use them for temp files.
         files = filter(
             os.path.isfile,
             (
-                os.path.join(cfg.TRAIN_DATA_PATH, label, f)
-                for f in sorted(os.listdir(os.path.join(cfg.TRAIN_DATA_PATH, label)))
+                os.path.join(cfg.TRAIN_DATA_PATH, folder, f)
+                for f in sorted(os.listdir(os.path.join(cfg.TRAIN_DATA_PATH, folder)))
                 if not f.startswith(".") and f.rsplit(".", 1)[-1].lower() in cfg.ALLOWED_FILETYPES
             ),
         )
@@ -87,7 +117,7 @@ def _loadTrainingData(cache_mode="none", cache_file=""):
 
             # if anything happens print the error and ignore the file
             except Exception as e:
-                # Current label
+                # Print Error
                 print(f"\t Error when loading file {f}", flush=True)
                 continue
 
@@ -161,6 +191,11 @@ def run_trial(self, trial, *args, **kwargs):
                                                         dropout=hp.Choice("dropout", [0.0, 0.25, 0.33, 0.5, 0.75, 0.9], default=cfg.TRAIN_DROPOUT))
                 print("...Done.", flush=True)
 
+                # Only allow repeat upsampling in multi-label setting
+                upsampling_choices = ['repeat', 'mean', 'linear'] #SMOTE is too slow
+                if cfg.MULTI_LABEL:
+                    upsampling_choices = ['repeat']
+
                 # Train model
                 print("Training model...", flush=True)
                 classifier, history = model.trainLinearClassifier(
@@ -172,7 +207,7 @@ def run_trial(self, trial, *args, **kwargs):
                     learning_rate=hp.Choice("learning_rate", [0.1, 0.01, 0.005, 0.002, 0.001, 0.0005, 0.0002, 0.0001], default=cfg.TRAIN_LEARNING_RATE),
                     val_split=cfg.TRAIN_VAL_SPLIT,
                     upsampling_ratio=hp.Choice("upsampling_ratio",[0.0, 0.25, 0.33, 0.5, 0.75, 1.0], default=cfg.UPSAMPLING_RATIO),
-                    upsampling_mode=hp.Choice("upsampling_mode", ['repeat', 'mean', 'linear'], default=cfg.UPSAMPLING_MODE), #SMOTE is too slow
+                    upsampling_mode=hp.Choice("upsampling_mode", upsampling_choices, default=cfg.UPSAMPLING_MODE), 
                     train_with_mixup=hp.Boolean("mixup", default=cfg.TRAIN_WITH_MIXUP),
                     train_with_label_smoothing=hp.Boolean("label_smoothing", default=cfg.TRAIN_WITH_LABEL_SMOOTHING),
                 )

diff --git a/utils.py b/utils.py
@@ -56,6 +56,73 @@ def list_subdirectories(path: str):
     """
     return filter(lambda el: os.path.isdir(os.path.join(path, el)), os.listdir(path))
 
+def random_multilabel_split(x, y, val_ratio=0.2):
+    """Splits multi-label data into training and validation data.
+
+    Samples are grouped by identical label vector (each unique row of `y`)
+    and every group is split on its own. Groups whose label vector contains
+    -1 are used for training only. Every other group keeps at least one
+    sample for training, so a group with a single sample adds nothing to the
+    validation data.
+
+    Note: This re-seeds NumPy's global random generator with cfg.RANDOM_SEED.
+
+    Args:
+        x: Samples, indexed along the first axis in step with `y`.
+        y: Label vectors, one row per sample.
+        val_ratio: The ratio of validation data within each group.
+
+    Returns:
+        A tuple of (x_train, y_train, x_val, y_val).
+
+    """
+
+    # Seed NumPy's global random generator with the configured seed
+    np.random.seed(cfg.RANDOM_SEED)
+
+    # Each unique row of y is one combination of labels
+    class_combinations = np.unique(y, axis=0)
+
+    # Per-combination chunks of training and validation data, concatenated below
+    x_train, y_train, x_val, y_val = [], [], [], []
+
+    # Split the data for each combination of labels
+    for class_combination in class_combinations:
+        # Indices of all samples whose label vector equals this combination
+        indices = np.where((y == class_combination).all(axis=1))[0]
+
+        # A -1 entry marks negative samples; they are used for training only
+        if -1 in class_combination:
+            x_train.append(x[indices])
+            y_train.append(y[indices])
+        # Otherwise split according to the validation split
+        else:
+            # Keep at least one sample for training; the remainder
+            # (never less than zero) goes to validation
+            num_samples = len(indices)
+            num_samples_train = max(1, int(num_samples * (1 - val_ratio)))
+            num_samples_val = max(0, num_samples - num_samples_train)
+            # Shuffle the indices in place, then cut them into the two sets
+            np.random.shuffle(indices)
+            train_indices = indices[:num_samples_train]
+            val_indices = indices[num_samples_train:num_samples_train + num_samples_val]
+            # Append samples to training and validation data
+            x_train.append(x[train_indices])
+            y_train.append(y[train_indices])
+            x_val.append(x[val_indices])
+            y_val.append(y[val_indices])
+
+    # Join the per-combination chunks into single arrays
+    x_train = np.concatenate(x_train)
+    y_train = np.concatenate(y_train)
+    x_val = np.concatenate(x_val)
+    y_val = np.concatenate(y_val)
+
+    # Shuffle both sets so samples are no longer ordered by label combination;
+    # x and y share one permutation so they stay aligned
+    indices = np.arange(len(x_train))
+    np.random.shuffle(indices)
+    x_train = x_train[indices]
+    y_train = y_train[indices]
+
+    indices = np.arange(len(x_val))
+    np.random.shuffle(indices)
+    x_val = x_val[indices]
+    y_val = y_val[indices]
+
+    return x_train, y_train, x_val, y_val       
 
 def random_split(x, y, val_ratio=0.2):
     """Splits the data into training and validation data.
@@ -331,7 +398,7 @@ def saveToCache(cache_file: str, x_train: np.ndarray, y_train: np.ndarray, label
     os.makedirs(os.path.dirname(cache_file), exist_ok=True)
 
     # Save to cache
-    np.savez_compressed(cache_file, x_train=x_train, y_train=y_train, labels=labels)
+    np.savez_compressed(cache_file, x_train=x_train, y_train=y_train, labels=labels, binary_classification=cfg.BINARY_CLASSIFICATION, multi_label=cfg.MULTI_LABEL)
 
 
 def loadFromCache(cache_file: str):
@@ -351,8 +418,10 @@ def loadFromCache(cache_file: str):
     x_train = cache["x_train"]
     y_train = cache["y_train"]
     labels = cache["labels"]
+    binary_classification = bool(cache["binary_classification"]) if "binary_classification" in cache.keys() else False
+    multi_label = bool(cache["multi_label"]) if "multi_label" in cache.keys() else False
 
-    return x_train, y_train, labels
+    return x_train, y_train, labels, binary_classification, multi_label
 
 
 def clearErrorLog():