From 907113538ffd872bdafddd88ec80ae60266623a1 Mon Sep 17 00:00:00 2001
From: John Schreck
Date: Thu, 11 Jul 2024 14:26:33 -0600
Subject: [PATCH] Updated handling of metrics calculations during predict/validation

- Use valid_loader (not train_loader) when grabbing the validation frame.
- Compute training_var per output column instead of a single scalar variance.
- Keep `total` uncertainty as a variance (ale + epi) in the trainer and take
  the sqrt once, inside the skill-score computation.
- Route epoch validation through predict() so binned metrics are computed
  over all predictions at once rather than batch-by-batch.
- Rename the regularizer weight from `factor` to `coef` and change
  EvidentialRegressionLoss.__call__ to take the four NIG parameters directly.
---
 applications/predict_regressor_torch.py |  2 +-
 applications/torch_dataset/dataset.py   |  2 +-
 applications/train_regressor_torch.py   | 16 ++++++---
 mlguess/regression_metrics.py           | 31 ++++++++--------
 mlguess/torch/models.py                 |  6 ++--
 mlguess/torch/regression_losses.py      | 15 ++++----
 mlguess/torch/trainer.py                | 47 +++++++++++++------------
 7 files changed, 62 insertions(+), 57 deletions(-)

diff --git a/applications/predict_regressor_torch.py b/applications/predict_regressor_torch.py
index 47632a8..01c8301 100644
--- a/applications/predict_regressor_torch.py
+++ b/applications/predict_regressor_torch.py
@@ -187,7 +187,7 @@ def main(rank, world_size, conf, trial=False):
         if split == "train":
             df = train_loader.dataset.train_data
         elif split == "valid":
-            df = train_loader.dataset.valid_data
+            df = valid_loader.dataset.valid_data
         elif split == "test":
             df = train_loader.dataset.test_data
 
diff --git a/applications/torch_dataset/dataset.py b/applications/torch_dataset/dataset.py
index 45639ea..32f858c 100644
--- a/applications/torch_dataset/dataset.py
+++ b/applications/torch_dataset/dataset.py
@@ -43,7 +43,7 @@ def __init__(self, conf, split='train'):
 
         # Compute var on the total training data set
         self.training_var = [
-            np.var(self.y_scaler.transform(train_data[output_cols]))
+            np.var(self.y_scaler.transform(train_data[output_cols])[:, i])
            for i in range(self.train_data[output_cols].shape[-1])
         ]
 
diff --git a/applications/train_regressor_torch.py b/applications/train_regressor_torch.py
index e70a49e..11db252 100644
--- a/applications/train_regressor_torch.py
+++ b/applications/train_regressor_torch.py
@@ -26,7 +26,7 @@
     TorchFSDPCheckpointIO
 )
 from mlguess.torch.trainer import Trainer
-from mlguess.torch.regression_losses import LipschitzMSELoss
+from mlguess.torch.regression_losses import LipschitzMSELoss, EvidentialRegressionLoss
 from mlguess.torch.models import seed_everything, DNN
 from mlguess.regression_metrics import regression_metrics
 
@@ -69,7 +69,7 @@ def load_dataset_and_sampler(conf, world_size, rank, is_train, seed=42):
         rank=rank,
         seed=seed,
         shuffle=is_train,
-        drop_last=True
+        drop_last=(not is_train)
     )
     flag = 'training' if is_train else 'validation'
     logging.info(f"Loaded a {flag} torch dataset, and a distributed sampler")
@@ -187,9 +187,9 @@ def main(rank, world_size, conf, trial=False):
         batch_size=valid_batch_size,
         shuffle=False,
         sampler=valid_sampler,
-        pin_memory=False,
+        pin_memory=True,
         num_workers=valid_thread_workers,
-        drop_last=True
+        drop_last=False
     )
 
     # model
@@ -213,6 +213,8 @@ def main(rank, world_size, conf, trial=False):
     model, optimizer, scheduler, scaler = load_model_states_and_optimizer(conf, model, device)
 
     # Train and validation losses
+    # train_criterion = EvidentialRegressionLoss(coef=10.84134458514458)
+    # valid_criterion = EvidentialRegressionLoss(coef=10.84134458514458)
     train_criterion = LipschitzMSELoss(**conf["train_loss"])
     valid_criterion = LipschitzMSELoss(**conf["valid_loss"])
 
@@ -248,6 +250,10 @@ def __init__(self, config, metric="val_loss", device="cpu"):
 
     def train(self, trial, conf):
 
+        conf['trainer']['train_batch_size'] = conf['data']['batch_size']
+        conf['trainer']['valid_batch_size'] = conf['data']['batch_size']
+        conf['valid_loss']['coef'] = conf['train_loss']['coef']
+
         try:
             return main(0, 1, conf, trial=trial)
 
@@ -257,7 +263,7 @@ def train(self, trial, conf):
                 f"Pruning trial {trial.number} due to CUDA memory overflow: {str(E)}."
             )
             raise optuna.TrialPruned()
-        elif "non-singleton" in str(E):
+        elif "non-singleton" in str(E) or "nan" in str(E):
             logging.warning(
                 f"Pruning trial {trial.number} due to shape mismatch: {str(E)}."
             )
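Note on the dataset.py change above: np.var over a 2-D array collapses to a
single scalar, so every output column previously received the same variance.
A minimal sketch (synthetic `scaled` array, made-up shapes, not from the
repo) of the difference:

    import numpy as np

    # Stand-in for y_scaler.transform(train_data[output_cols])
    scaled = np.array([[0.0, 10.0],
                       [1.0, 30.0],
                       [2.0, 50.0]])

    np.var(scaled)                                           # one scalar, ~343.92
    [np.var(scaled[:, i]) for i in range(scaled.shape[-1])]  # ~[0.67, 266.67]

The per-column values are what training_var now stores and what
predict_uncertainty later multiplies back into the aleatoric and epistemic
estimates, one variance per output.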
diff --git a/mlguess/regression_metrics.py b/mlguess/regression_metrics.py
index 29bf101..9271497 100644
--- a/mlguess/regression_metrics.py
+++ b/mlguess/regression_metrics.py
@@ -67,42 +67,41 @@ def regression_metrics(y_true, y_pred, total=None, split="val"):
     # metrics[f"{split}_crps_ss"] = r2_score(result['bin'], result['crps'], sample_weight=result["count"])
     # metrics[f"{split}_rmse_ss"] = r2_score(result['bin'], result['rmse'], sample_weight=result["count"])
 
-    rmse_ss = rmse_crps_skill_scores(y_true, y_pred, total, filter_top_percentile=5)
+    rmse_ss = rmse_crps_skill_scores(y_true, y_pred, total, filter_top_percentile=0)
     metrics[f"{split}_r2_rmse_sigma"] = rmse_ss["r2_rmse"]
     metrics[f"{split}_r2_crps_sigma"] = rmse_ss["r2_crps"]
 
     return metrics
 
 
-def rmse_crps_skill_scores(y_true, y_pred, total, filter_top_percentile=0):
-    # Initialize dictionaries to store r2_rmse and r2_crps for each column
+def rmse_crps_skill_scores(y, mu, total, filter_top_percentile=0):
+    # Compute R2 skill scores of the binned RMSE and CRPS for each output column
     r2_rmse_dict = {}
     r2_crps_dict = {}
 
-    # Get the number of columns from y_pred
-    num_cols = y_pred.shape[1]
-
-    # Loop over the columns
-    for col in range(num_cols):
+    # Loop over the output columns
+    num_cols = y.shape[-1]
+    for col in range(y.shape[-1]):
         result = calculate_skill_score(
-            y_true[:, col],  # Use y_true for the true values
-            y_pred[:, col],  # Use y_pred for the predicted values
-            total[:, col],
+            y[:, col],
+            mu[:, col],
+            np.sqrt(total)[:, col],
             num_bins=100,
             log=True,
             filter_top_percentile=filter_top_percentile
         )
+
         r2_rmse = r2_score(result['bin'], result['rmse'])
         r2_crps = r2_score(result['bin'], result['crps'])
 
         r2_rmse_dict[col] = r2_rmse
         r2_crps_dict[col] = r2_crps
 
-        if np.isnan(r2_rmse):
-            r2_rmse = -10
+        # if np.isnan(r2_rmse):
+        #     r2_rmse = -10
 
-        # Check if r2_crps is NaN and replace it with -10
-        if np.isnan(r2_crps):
-            r2_crps = -10
+        # # Check if r2_crps is NaN and replace it with -10
+        # if np.isnan(r2_crps):
+        #     r2_crps = -10
 
     # Calculate the average of r2_rmse and r2_crps
     avg_r2_rmse = sum(r2_rmse_dict.values()) / num_cols
 
diff --git a/mlguess/torch/models.py b/mlguess/torch/models.py
index 4abb2ba..86a379b 100644
--- a/mlguess/torch/models.py
+++ b/mlguess/torch/models.py
@@ -168,9 +168,9 @@ def predict_uncertainty(self, input, y_scaler=None):
         if y_scaler:
             mu = y_scaler.inverse_transform(mu)
 
-            for i in range(mu.shape[-1]):
-                aleatoric[:, i] *= self.training_var[i]
-                epistemic[:, i] *= self.training_var[i]
+        for i in range(mu.shape[-1]):
+            aleatoric[:, i] *= self.training_var[i]
+            epistemic[:, i] *= self.training_var[i]
 
         return mu, aleatoric, epistemic
 
diff --git a/mlguess/torch/regression_losses.py b/mlguess/torch/regression_losses.py
index bb52ef2..1ad1c76 100644
--- a/mlguess/torch/regression_losses.py
+++ b/mlguess/torch/regression_losses.py
@@ -41,9 +41,8 @@ def normal_inverse_gamma_reg(self, y, gamma, v, alpha, beta):
         evi = 2 * v + alpha
         return error * evi
 
-    def __call__(self, y, pred):
+    def __call__(self, gamma, v, alpha, beta, y):
         """Calculate the Evidential Regression Loss"""
-        gamma, v, alpha, beta = pred
         loss_nll = self.normal_inverse_gamma_nll(y, gamma, v, alpha, beta)
         loss_reg = self.normal_inverse_gamma_reg(y, gamma, v, alpha, beta)
         return loss_nll.mean() + self.coef * loss_reg.mean()
 
@@ -206,9 +205,9 @@ class EvidenceRegularizer(torch.nn.modules.loss._Loss):
     Reference: https://www.mit.edu/~amini/pubs/pdf/deep-evidential-regression.pdf
     Source: https://github.com/deargen/MT-ENet/tree/468822188f52e517b1ee8e386eea607b2b7d8829
     """
-    def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', factor=0.1):
+    def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', coef=0.1):
         super(EvidenceRegularizer, self).__init__(size_average, reduce, reduction)
-        self.factor = factor
+        self.coef = coef
 
     def forward(self, gamma: torch.Tensor, nu: torch.Tensor, alpha: torch.Tensor,
                 target: torch.Tensor) -> torch.Tensor:
@@ -224,7 +223,7 @@ def forward(self, gamma: torch.Tensor, nu: torch.Tensor, alpha: torch.Tensor,
 
         Loss = |y - gamma|*(2*nu + alpha) * factor
         """
-        loss_value = torch.abs(target - gamma)*(2*nu + alpha) * self.factor
+        loss_value = torch.abs(target - gamma)*(2*nu + alpha) * self.coef
         if self.reduction == 'mean':
             return loss_value.mean()
         elif self.reduction == 'sum':
@@ -234,13 +233,13 @@ def forward(self, gamma: torch.Tensor, nu: torch.Tensor, alpha: torch.Tensor,
 
 
 class LipschitzMSELoss(torch.nn.Module):
-    def __init__(self, tol=1e-8, factor=0.1, reduction='mean'):
+    def __init__(self, tol=1e-8, coef=0.1, reduction='mean'):
         super(LipschitzMSELoss, self).__init__()
         self.tol = tol
-        self.factor = factor
+        self.coef = coef
         self.reduction = reduction
         self.evidential_marginal_likelihood = EvidentialMarginalLikelihood(reduction=reduction)
-        self.evidence_regularizer = EvidenceRegularizer(factor=factor, reduction=reduction)
+        self.evidence_regularizer = EvidenceRegularizer(coef=coef, reduction=reduction)
 
     def forward(self, gamma, nu, alpha, beta, target):
         loss = self.evidential_marginal_likelihood(gamma, nu, alpha, beta, target)
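Note on the regression_losses.py changes above: __call__ now takes the four
normal-inverse-gamma parameters positionally, matching how
LipschitzMSELoss.forward is called, instead of unpacking a packed `pred`
tuple. A minimal sketch of the new calling convention (random tensors with
made-up shapes; in practice the constraints v > 0, alpha > 1, beta > 0 come
from the model's output activations):

    import torch
    from mlguess.torch.regression_losses import EvidentialRegressionLoss

    criterion = EvidentialRegressionLoss(coef=10.84134458514458)

    batch, outputs = 8, 2
    gamma = torch.randn(batch, outputs, requires_grad=True)  # predicted mean
    nu    = torch.rand(batch, outputs) + 0.1                 # v > 0
    alpha = torch.rand(batch, outputs) + 1.1                 # alpha > 1
    beta  = torch.rand(batch, outputs) + 0.1                 # beta > 0
    y     = torch.randn(batch, outputs)

    loss = criterion(gamma, nu, alpha, beta, y)  # new order: (gamma, v, alpha, beta, y)
    loss.backward()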
diff --git a/mlguess/torch/trainer.py b/mlguess/torch/trainer.py
index 3cd3e46..fac7f4a 100644
--- a/mlguess/torch/trainer.py
+++ b/mlguess/torch/trainer.py
@@ -81,7 +81,6 @@ def train_one_epoch(
         commit_loss = 0.0
 
         with autocast(enabled=amp):
-            x = x.to(self.device)
             y_pred = self.model(x)
             gamma, nu, alpha, beta = y_pred
 
@@ -91,7 +90,7 @@
         # Metrics
         y_pred = (_.cpu().detach() for _ in y_pred)
         mu, ale, epi = self.model.predict_uncertainty(y_pred, y_scaler=transform)
-        total = np.sqrt(ale + epi)
+        total = ale + epi
         if transform:
             y = transform.inverse_transform(y.cpu())
         metrics_dict = metrics(y, mu, total, split="train")
 
@@ -192,7 +191,7 @@
         # Metrics
         y_pred = (_.cpu() for _ in y_pred)
         mu, ale, epi = self.model.predict_uncertainty(y_pred, y_scaler=transform)
-        total = np.sqrt(ale + epi)
+        total = ale + epi
         if transform:
             y = transform.inverse_transform(y.cpu())
         metrics_dict = metrics(y, mu, total, split="valid")
 
@@ -235,20 +234,14 @@ def validate(
     def predict(self, conf, test_loader, criterion, metrics, transform=None, split=None):
 
         self.model.eval()
-        valid_batches_per_epoch = conf['trainer']['valid_batches_per_epoch']
         distributed = True if conf["trainer"]["mode"] in ["fsdp", "ddp"] else False
 
         results_dict = defaultdict(list)
         mu_list, ale_list, epi_list, y_list = [], [], [], []
 
-        # Set up a custom tqdm
-        valid_batches_per_epoch = (
-            valid_batches_per_epoch if 0 < valid_batches_per_epoch < len(test_loader) else len(test_loader)
-        )
-
         batch_group_generator = tqdm.tqdm(
             enumerate(test_loader),
-            total=valid_batches_per_epoch,
+            total=len(test_loader),
             leave=True,
             disable=True if self.rank > 0 else False
         )
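Note on the `total = ale + epi` changes in this file: `total` is now kept as
a variance throughout the trainer, and the single sqrt happens inside
rmse_crps_skill_scores (the np.sqrt(total) change in regression_metrics.py),
so the same quantity reaches the skill-score binning while `total` has one
consistent meaning everywhere. A small numeric sketch with made-up variances:

    import numpy as np

    ale = np.array([[0.04], [0.09]])   # aleatoric variance (made-up values)
    epi = np.array([[0.05], [0.07]])   # epistemic variance (made-up values)

    total = ale + epi                  # what the trainer now passes to metrics
    sigma = np.sqrt(total)             # what the skill-score binning uses
    # sigma == [[0.3], [0.4]]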
@@ -273,21 +266,18 @@ def predict(self, conf, test_loader, criterion, metrics, transform=None, split=N
                 batch_loss = torch.Tensor([loss.item()]).cuda(self.device)
                 if distributed:
                     torch.distributed.barrier()
-                results_dict["loss"].append(batch_loss[0].item())
+                results_dict[f"{split}_loss"].append(batch_loss[0].item())
 
                 # Print to tqdm
-                to_print = f"{split} loss: {np.mean(results_dict['loss']):.6f}"
+                to_print = f'{split} loss: {np.mean(results_dict[f"{split}_loss"]):.6f}'
                 if self.rank == 0:
                     batch_group_generator.set_description(to_print)
 
-                if i >= valid_batches_per_epoch and i > 0:
-                    break
-
         # Concatenate arrays
         mu = np.concatenate(mu_list, axis=0)
         ale = np.concatenate(ale_list, axis=0)
         epi = np.concatenate(epi_list, axis=0)
-        total = np.sqrt(ale + epi)
+        total = ale + epi
         y = np.concatenate(y_list, axis=0)
 
         if transform:
@@ -300,7 +290,7 @@ def predict(self, conf, test_loader, criterion, metrics, transform=None, split=N
             if distributed:
                 dist.all_reduce(value, dist.ReduceOp.AVG, async_op=False)
             results_dict[name].append(value[0].item())
-        results_dict["loss"] = np.mean(results_dict["loss"])
+        results_dict[f"{split}_loss"].append(np.mean(results_dict[f"{split}_loss"]))
 
         # Shutdown the progbar
         batch_group_generator.close()
 
@@ -393,14 +383,25 @@ def fit(
 
         else:
 
-            valid_results = self.validate(
-                epoch,
+            valid_results = self.predict(
                 conf,
                 valid_loader,
                 valid_criterion,
                 metrics,
-                transform
-            )
+                transform,
+                split="valid"
+            )["metrics"]
+
+            # The version below computes metrics batch-by-batch, which may
+            # bias metrics that are computed through binning.
+            # valid_results = self.validate(
+            #     epoch,
+            #     conf,
+            #     valid_loader,
+            #     valid_criterion,
+            #     metrics,
+            #     transform
+            # )
 
             #################
             #
@@ -510,8 +511,8 @@ def fit(
         gc.collect()
 
         # Report result to the trial
-        if trial:
-            trial.report(results_dict[training_metric][-1], step=epoch)
+        # if trial:
+        #     trial.report(results_dict[training_metric][-1], step=epoch)
 
         # Stop training if we have not improved after X epochs (stopping patience)
         best_epoch = [
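Note on the fit() change above: validation now goes through predict() with
split="valid", so metrics that depend on binning all predictions at once
(the r2_rmse_sigma / r2_crps_sigma skill scores) are computed over the full
validation set instead of being averaged from per-batch estimates. A hedged
usage sketch (the trainer, conf, loader, criterion, and the `y_scaler`
handle are assumed to exist as they do inside fit()):

    valid_results = trainer.predict(
        conf,
        valid_loader,
        valid_criterion,
        regression_metrics,
        transform=y_scaler,   # hypothetical name for the target scaler
        split="valid",
    )["metrics"]

    # Keys follow the split prefix set by regression_metrics, e.g.:
    # valid_results["valid_loss"], valid_results["valid_r2_crps_sigma"]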