Skip to content

Commit

Permalink
Merge pull request #241 from kahst/multi-label-training
Browse files Browse the repository at this point in the history
Multi label training
  • Loading branch information
max-mauermann authored Jan 31, 2024
2 parents 2c2c5fa + e41fdf6 commit 056e749
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 25 deletions.
4 changes: 4 additions & 0 deletions README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,10 @@ python3 analyze.py --classifier checkpoints/custom/Custom_Classifier.tflite
+
NOTE: Setting a custom classifier will also set the new labels file.
Due to these custom labels, the location filter and locale will be disabled.
+
. You can include negative samples for classes by prefixing the folder names with a '-' (e.g., `-Poecile atricapillus_Black-capped Chickadee`). Do this with samples that definitely do not contain the species. Negative samples will only be used for training and not for validation. Also keep in mind that negative samples will only be used when a corresponding folder with positive samples exists.
+
. To train with multi-label data, separate the class labels with commas in the folder names (e.g., `Poecile atricapillus_Black-capped Chickadee, Cardinalis cardinalis_Northern Cardinal`). This can also be combined with negative samples as described above. The validation split will be performed per combination of classes, so you might want to ensure sufficient data for each combination of classes. When using multi-label data, the upsampling mode will be limited to 'repeat'.

== Funding

Expand Down
7 changes: 6 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,14 @@
# Multiple executions will be averaged, so the evaluation is more consistent
AUTOTUNE_EXECUTIONS_PER_TRIAL: int = 1

# If a binary classification model is trained, this value will be detected automatically in the training script
# If a binary classification model is trained.
# This value will be detected automatically in the training script, if only one class and a non-event class is used.
BINARY_CLASSIFICATION: bool = False

# If a model for a multi-label setting is trained.
# This value will automatically be set if subfolders in the input directory are named with multiple classes separated by commas.
MULTI_LABEL: bool = False

#####################
# Misc runtime vars #
#####################
Expand Down
9 changes: 8 additions & 1 deletion gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,14 @@ def select_subdirectories():
if dir_name:
subdirs = utils.list_subdirectories(dir_name[0])

return dir_name[0], [[d] for d in subdirs]
labels = []
for folder in subdirs:
labels_in_folder = folder.split(',')
for label in labels_in_folder:
if not label in labels and not label.startswith('-') and not label in cfg.NON_EVENT_CLASSES:
labels.append(label)

return dir_name[0], [[label] for label in sorted(labels)]

return None, None

Expand Down
14 changes: 10 additions & 4 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,6 @@ def buildLinearClassifier(num_labels, input_size, hidden_units=0, dropout=0.0):

return model


def trainLinearClassifier(
classifier,
x_train,
Expand Down Expand Up @@ -214,7 +213,11 @@ def on_epoch_end(self, epoch, logs=None):
y_train = y_train[idx]

# Random val split
x_train, y_train, x_val, y_val = utils.random_split(x_train, y_train, val_split)
if not cfg.MULTI_LABEL:
x_train, y_train, x_val, y_val = utils.random_split(x_train, y_train, val_split)
else:
x_train, y_train, x_val, y_val = utils.random_multilabel_split(x_train, y_train, val_split)

print(
f"Training on {x_train.shape[0]} samples, validating on {x_val.shape[0]} samples.",
flush=True,
Expand Down Expand Up @@ -252,7 +255,10 @@ def on_epoch_end(self, epoch, logs=None):
classifier.compile(
optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
loss=custom_loss,
metrics=[keras.metrics.AUC(curve="PR", multi_label=False, name="AUPRC"), keras.metrics.AUC(curve="ROC", multi_label=False, name="AUROC")],
metrics=[
keras.metrics.AUC(curve="PR", multi_label=cfg.MULTI_LABEL, name="AUPRC", num_labels=y_train.shape[1] if cfg.MULTI_LABEL else None, from_logits=True),
keras.metrics.AUC(curve="ROC", multi_label=cfg.MULTI_LABEL, name="AUROC", num_labels=y_train.shape[1] if cfg.MULTI_LABEL else None, from_logits=True)
]
)

# Train model
Expand All @@ -262,7 +268,7 @@ def on_epoch_end(self, epoch, logs=None):
epochs=epochs,
batch_size=batch_size,
validation_data=(x_val, y_val),
callbacks=callbacks,
callbacks=callbacks
)

return classifier, history
Expand Down
69 changes: 52 additions & 17 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,48 +32,78 @@ def _loadTrainingData(cache_mode="none", cache_file=""):
if cache_mode == "load":
if os.path.isfile(cache_file):
print(f"\t...loading from cache: {cache_file}", flush=True)
x_train, y_train, labels = utils.loadFromCache(cache_file)
x_train, y_train, labels, cfg.BINARY_CLASSIFICATION, cfg.MULTI_LABEL = utils.loadFromCache(cache_file)
return x_train, y_train, labels
else:
print(f"\t...cache file not found: {cache_file}", flush=True)

# Get list of subfolders as labels
labels = list(sorted(utils.list_subdirectories(cfg.TRAIN_DATA_PATH)))
folders = list(sorted(utils.list_subdirectories(cfg.TRAIN_DATA_PATH)))


# Read all individual labels from the folder names
labels = []

for folder in folders:
labels_in_folder = folder.split(',')
for label in labels_in_folder:
if not label in labels:
labels.append(label)

# Sort labels
labels = list(sorted(labels))

# Get valid labels
valid_labels = [l for l in labels if not l.lower() in cfg.NON_EVENT_CLASSES and not l.startswith("-")]

# Check if binary classification
cfg.BINARY_CLASSIFICATION = len(valid_labels) == 1

# Validate the classes for binary classification
if cfg.BINARY_CLASSIFICATION:
if len([l for l in labels if l.startswith("-")]) > 0:
raise Exception("negative labels cant be used with binary classification")
if len([l for l in labels if l in cfg.NON_EVENT_CLASSES]) == 0:
raise Exception("non-event samples are required for binary classification")
if len([l for l in folders if l.startswith("-")]) > 0:
raise Exception("Negative labels cant be used with binary classification")
if len([l for l in folders if l in cfg.NON_EVENT_CLASSES]) == 0:
raise Exception("Non-event samples are required for binary classification")

# Check if multi label
cfg.MULTI_LABEL = len(valid_labels) > 1 and any(',' in f for f in folders)

# Check if multi-label and binary classification
if cfg.BINARY_CLASSIFICATION and cfg.MULTI_LABEL:
raise Exception("Error: Binary classfication and multi-label not possible at the same time")

# Only allow repeat upsampling for multi-label setting
if cfg.MULTI_LABEL and cfg.UPSAMPLING_RATIO > 0 and cfg.UPSAMPLING_MODE != 'repeat':
raise Exception("Only repeat-upsampling ist available for multi-label")

# Load training data
x_train = []
y_train = []

for label in labels:
for folder in folders:

# Current label
print(f"\t- {label}", flush=True)
# Current folder
print(f"\t- {folder}", flush=True)

# Get label vector
label_vector = np.zeros((len(valid_labels),), dtype="float32")
if not label.lower() in cfg.NON_EVENT_CLASSES and not label.startswith("-"):
label_vector[valid_labels.index(label)] = 1
elif label.startswith("-") and label[1:] in valid_labels: # Negative labels need to be contained in the valid labels
label_vector[valid_labels.index(label[1:])] = -1

folder_labels = folder.split(',')

for label in folder_labels:
if not label.lower() in cfg.NON_EVENT_CLASSES and not label.startswith("-"):
label_vector[valid_labels.index(label)] = 1
elif label.startswith("-") and label[1:] in valid_labels: # Negative labels need to be contained in the valid labels
label_vector[valid_labels.index(label[1:])] = -1

# Get list of files
# Filter files that start with '.' because macOS seems to use them for temp files.
files = filter(
os.path.isfile,
(
os.path.join(cfg.TRAIN_DATA_PATH, label, f)
for f in sorted(os.listdir(os.path.join(cfg.TRAIN_DATA_PATH, label)))
os.path.join(cfg.TRAIN_DATA_PATH, folder, f)
for f in sorted(os.listdir(os.path.join(cfg.TRAIN_DATA_PATH, folder)))
if not f.startswith(".") and f.rsplit(".", 1)[-1].lower() in cfg.ALLOWED_FILETYPES
),
)
Expand All @@ -87,7 +117,7 @@ def _loadTrainingData(cache_mode="none", cache_file=""):

# if anything happens print the error and ignore the file
except Exception as e:
# Current label
# Print Error
print(f"\t Error when loading file {f}", flush=True)
continue

Expand Down Expand Up @@ -161,6 +191,11 @@ def run_trial(self, trial, *args, **kwargs):
dropout=hp.Choice("dropout", [0.0, 0.25, 0.33, 0.5, 0.75, 0.9], default=cfg.TRAIN_DROPOUT))
print("...Done.", flush=True)

# Only allow repeat upsampling in multi-label setting
upsampling_choices = ['repeat', 'mean', 'linear'] #SMOTE is too slow
if cfg.MULTI_LABEL:
upsampling_choices = ['repeat']

# Train model
print("Training model...", flush=True)
classifier, history = model.trainLinearClassifier(
Expand All @@ -172,7 +207,7 @@ def run_trial(self, trial, *args, **kwargs):
learning_rate=hp.Choice("learning_rate", [0.1, 0.01, 0.005, 0.002, 0.001, 0.0005, 0.0002, 0.0001], default=cfg.TRAIN_LEARNING_RATE),
val_split=cfg.TRAIN_VAL_SPLIT,
upsampling_ratio=hp.Choice("upsampling_ratio",[0.0, 0.25, 0.33, 0.5, 0.75, 1.0], default=cfg.UPSAMPLING_RATIO),
upsampling_mode=hp.Choice("upsampling_mode", ['repeat', 'mean', 'linear'], default=cfg.UPSAMPLING_MODE), #SMOTE is too slow
upsampling_mode=hp.Choice("upsampling_mode", upsampling_choices, default=cfg.UPSAMPLING_MODE),
train_with_mixup=hp.Boolean("mixup", default=cfg.TRAIN_WITH_MIXUP),
train_with_label_smoothing=hp.Boolean("label_smoothing", default=cfg.TRAIN_WITH_LABEL_SMOOTHING),
)
Expand Down
73 changes: 71 additions & 2 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,73 @@ def list_subdirectories(path: str):
"""
return filter(lambda el: os.path.isdir(os.path.join(path, el)), os.listdir(path))

def random_multilabel_split(x, y, val_ratio=0.2):
    """Splits the data into training and validation data for multi-label training.

    The split is performed separately for every distinct label vector (i.e.
    every combination of classes) found in `y`. Combinations that contain a
    negative label (-1) are used for training only. Every other combination
    keeps at least one sample for training, so a combination with only a
    single sample will not be represented in the validation data.

    Note that this function re-seeds numpy's global random state with
    `cfg.RANDOM_SEED`, which makes the split reproducible.

    Args:
        x: Samples as a numpy array, indexed along the first axis.
        y: Label vectors as a 2D numpy array with one row per sample.
            Rows may contain -1 to mark negative samples.
        val_ratio: The ratio of validation data.

    Returns:
        A tuple of (x_train, y_train, x_val, y_val). A set is returned as an
        empty slice of the input when no sample was assigned to it (e.g. the
        validation data if every combination contains a negative label).
    """

    # Set numpy random seed
    np.random.seed(cfg.RANDOM_SEED)

    # Find all combinations of labels
    class_combinations = np.unique(y, axis=0)

    # Initialize training and validation data
    x_train, y_train, x_val, y_val = [], [], [], []

    # Split the data for each combination of labels
    for class_combination in class_combinations:
        # Find the indices of all samples with exactly this label vector
        indices = np.where((y == class_combination).all(axis=1))[0]

        # Negative samples are used for training only
        if -1 in class_combination:
            x_train.append(x[indices])
            y_train.append(y[indices])
            continue

        # Keep at least one sample of every combination for training
        num_samples_train = max(1, int(len(indices) * (1 - val_ratio)))

        # Randomly choose samples for training and validation;
        # everything that is not used for training goes to validation
        np.random.shuffle(indices)
        train_indices = indices[:num_samples_train]
        val_indices = indices[num_samples_train:]

        # Append samples to training and validation data
        x_train.append(x[train_indices])
        y_train.append(y[train_indices])
        x_val.append(x[val_indices])
        y_val.append(y[val_indices])

    # Concatenate data. np.concatenate raises a ValueError for an empty list,
    # so fall back to an empty slice that keeps the dtype and trailing shape.
    x_train = np.concatenate(x_train) if x_train else x[:0]
    y_train = np.concatenate(y_train) if y_train else y[:0]
    x_val = np.concatenate(x_val) if x_val else x[:0]
    y_val = np.concatenate(y_val) if y_val else y[:0]

    # Shuffle data so the samples are no longer grouped by combination
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    y_train = y_train[indices]

    indices = np.arange(len(x_val))
    np.random.shuffle(indices)
    x_val = x_val[indices]
    y_val = y_val[indices]

    return x_train, y_train, x_val, y_val

def random_split(x, y, val_ratio=0.2):
"""Splits the data into training and validation data.
Expand Down Expand Up @@ -331,7 +398,7 @@ def saveToCache(cache_file: str, x_train: np.ndarray, y_train: np.ndarray, label
os.makedirs(os.path.dirname(cache_file), exist_ok=True)

# Save to cache
np.savez_compressed(cache_file, x_train=x_train, y_train=y_train, labels=labels)
np.savez_compressed(cache_file, x_train=x_train, y_train=y_train, labels=labels, binary_classification=cfg.BINARY_CLASSIFICATION, multi_label=cfg.MULTI_LABEL)


def loadFromCache(cache_file: str):
Expand All @@ -351,8 +418,10 @@ def loadFromCache(cache_file: str):
x_train = cache["x_train"]
y_train = cache["y_train"]
labels = cache["labels"]
binary_classification = bool(cache["binary_classification"]) if "binary_classification" in cache.keys() else False
multi_label = bool(cache["multi_label"]) if "multi_label" in cache.keys() else False

return x_train, y_train, labels
return x_train, y_train, labels, binary_classification, multi_label


def clearErrorLog():
Expand Down

0 comments on commit 056e749

Please sign in to comment.