From d5d38963e8d5f535c55d927f6599bd5fb50ca667 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 19:01:19 +0800 Subject: [PATCH 1/8] doc --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 8423ea8..1b59c3e 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,11 @@ Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Sema ## News +### 2021.6.3 + +**Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is updated to PyTorch 1.6. +Thanks to the codes & testing by [**@jinhuan-hit**](https://github.com/jinhuan-hit), and discussions from [**@lorenmt**](https://github.com/lorenmt), [**@TiankaiHang**](https://github.com/TiankaiHang). + ### 2021.2.10 A slight backbone architecture difference in the segmentation task has just been identified and described in Acknowledgement. From 6bc50737d331510e018317244ea4a6437c776d79 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 19:01:39 +0800 Subject: [PATCH 2/8] upgrade for segmentation main --- segmentation/main.py | 65 ++++++++++++++++++++------------ segmentation/utils/common.py | 25 ++++++------ segmentation/utils/transforms.py | 2 +- 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/segmentation/main.py b/segmentation/main.py index b14c7fd..713a654 100644 --- a/segmentation/main.py +++ b/segmentation/main.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, Compose, RandomScale from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -169,6 +170,9 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'disagree': -1, 'current_win': -1, 'avg_weights': 1.0, 'loss': 0.0} while epoch < num_epochs: @@ -202,22 +206,24 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, probs, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels) + if with_sup: + loss, stats = criterion(outputs, probs, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as 
scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() # Logging @@ -238,7 +244,8 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # A bug in Apex? https://github.com/NVIDIA/apex/issues/706 test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -282,16 +289,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -385,9 +393,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator() + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -423,8 +433,6 @@ def after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -432,6 +440,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -439,7 +448,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(gamma1=args.gamma1, gamma2=args.gamma2, ignore_index=255) @@ -455,6 +464,7 @@ def after_loading(): 
mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -471,10 +481,12 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() ratio = generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[2], - label_ratio=args.label_ratio, num_classes=num_classes) + label_ratio=args.label_ratio, num_classes=num_classes, + is_mixed_precision=args.mixed_precision) print(ratio) print('Pseudo labeling time: %.2fs' % (time.time() - time_now)) else: @@ -484,6 +496,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/utils/common.py b/segmentation/utils/common.py index 0744084..36d6146 100644 --- a/segmentation/utils/common.py +++ b/segmentation/utils/common.py @@ -2,9 +2,9 @@ import matplotlib.pyplot as plt import numpy as np from tqdm import tqdm -from apex import amp import time from utils.functional import crop +from torch.cuda.amp import autocast, GradScaler # Base directories base_voc = '../../voc_seg_deeplab/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012' @@ -119,7 +119,7 @@ def save_checkpoint(net, optimizer, lr_scheduler, is_mixed_precision, filename=' 'model': net.state_dict(), 'optimizer': optimizer.state_dict() if optimizer is not None else None, 'lr_scheduler': lr_scheduler.state_dict() if lr_scheduler is not None else None, - 'amp': amp.state_dict() if is_mixed_precision else None + # 'amp': amp.state_dict() if is_mixed_precision else None } torch.save(checkpoint, filename) @@ -132,11 +132,12 @@ def load_checkpoint(net, optimizer, lr_scheduler, is_mixed_precision, filename): optimizer.load_state_dict(checkpoint['optimizer']) if lr_scheduler is not None: lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - if is_mixed_precision and checkpoint['amp'] is not None: - amp.load_state_dict(checkpoint['amp']) + # if is_mixed_precision and checkpoint['amp'] is not None: + # amp.load_state_dict(checkpoint['amp']) -def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_thresholds=None): +def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_thresholds=None, + is_mixed_precision=False): # Generate pseudo labels and save to disk (negligible time compared with training) # Not very sure if there are any cache inconsistency issues (technically this should be fine) net.eval() @@ -151,8 +152,9 @@ def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_th with torch.no_grad(): for images, file_name_lists, heights, widths in 
tqdm(loader): images = images.to(device) - outputs = net(images)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) + with autocast(is_mixed_precision): + outputs = net(images)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) # Generate pseudo labels (d1 x d2 x 2) for i in range(0, len(file_name_lists)): @@ -180,7 +182,7 @@ def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_th # Reimplemented (all converted to tensor ops) based on yzou2/CRST def generate_class_balanced_pseudo_labels(net, device, loader, label_ratio, num_classes, input_size, - down_sample_rate=16, buffer_size=100): + down_sample_rate=16, buffer_size=100, is_mixed_precision=False): # Max memory usage surge ratio has an upper limit of 2x (caused by array concatenation). # Keep a fixed GPU buffer size to achieve a good enough speed-memory trade-off, # since casting to cpu is very slow. @@ -201,8 +203,9 @@ def generate_class_balanced_pseudo_labels(net, device, loader, label_ratio, num_ with torch.no_grad(): for images, _, heights, widths in tqdm(loader): images = images.to(device) - outputs = net(images)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) + with autocast(is_mixed_precision): + outputs = net(images)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) # Generate pseudo labels (d1 x d2) and reassemble for i in range(0, len(heights)): @@ -246,4 +249,4 @@ def generate_class_balanced_pseudo_labels(net, device, loader, label_ratio, num_ print(kc) return generate_pseudo_labels(net=net, device=device, loader=loader, cbst_thresholds=torch.tensor(kc), - input_size=input_size, num_classes=num_classes) + input_size=input_size, num_classes=num_classes, is_mixed_precision=is_mixed_precision) diff --git a/segmentation/utils/transforms.py b/segmentation/utils/transforms.py index d6ec7bb..48ea7b1 100644 --- a/segmentation/utils/transforms.py +++ b/segmentation/utils/transforms.py @@ -159,7 +159,7 @@ def label_to_tensor(pic): # 3 dimensional arrays or normal segmentation masks if isinstance(pic, np.ndarray): return torch.as_tensor(pic.transpose((2, 0, 1)), dtype=torch.float32) else: - return torch.as_tensor(np.asarray(pic), dtype=torch.int64) + return torch.as_tensor(np.asarray(pic).copy(), dtype=torch.int64) def _pil_to_tensor(self, pic): # Convert a PIL Image to tensor(a direct copy) From 223a10b6e27d98d86fd260701327265ee74f306e Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 19:12:23 +0800 Subject: [PATCH 3/8] update pip --- README.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1b59c3e..1e0462b 100644 --- a/README.md +++ b/README.md @@ -38,22 +38,16 @@ Also, thanks to [**@lorenmt**](https://github.com/lorenmt), a data augmentation You'll need a CUDA 10, Python3 environment (best on Linux) with PyTorch 1.2.0, TorchVision 0.4.0 and Apex to run the code in this repo. -### 1. Setup the exact version of Apex & PyTorch & TorchVision for mixed precision training: +### 1. 
Setup PyTorch & TorchVision: ``` -pip install https://download.pytorch.org/whl/cu100/torch-1.2.0-cp36-cp36m-manylinux1_x86_64.whl && pip install https://download.pytorch.org/whl/cu100/torchvision-0.4.0-cp36-cp36m-manylinux1_x86_64.whl -git clone https://github.com/NVIDIA/apex -cd apex -pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +pip install torch==1.6.0 torchvision==0.7.0 ``` -!There seems to be an issue of apex installations from the official repo sometimes. If you encounter errors, we suggest you use our stored older apex [codes](https://drive.google.com/open?id=1x8enpvdTTZ3RChf17XvcLdSYulUPg3sR). - -**PyTorch 1.6** now includes automatic mixed precision at apex level "O1". We probably will update this repo accordingly in the future. ### 2. Install other python packages you may require: ``` -pip install future matplotlib tensorboard tqdm +pip install packages accelerate future matplotlib tensorboard tqdm ``` ### 3. Download the code and prepare the scripts: From 01815ea927122ee0104cac462adb563c52c90466 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 20:48:30 +0800 Subject: [PATCH 4/8] split batches --- segmentation/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segmentation/main.py b/segmentation/main.py index 713a654..67aff45 100644 --- a/segmentation/main.py +++ b/segmentation/main.py @@ -396,7 +396,7 @@ def after_loading(): # device = torch.device('cpu') # if torch.cuda.is_available(): # device = torch.device('cuda:0') - accelerator = Accelerator() + accelerator = Accelerator(split_batches=True) device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean From 218633e2a878698b9afcba554bfb281b76bd7f78 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:38:31 +0800 Subject: [PATCH 5/8] modify other segmentation codes --- segmentation/main.py | 4 +-- segmentation/main_flip.py | 68 ++++++++++++++++++++++-------------- segmentation/main_naive.py | 65 ++++++++++++++++++++-------------- segmentation/main_online.py | 64 ++++++++++++++++++++------------- segmentation/utils/common.py | 2 +- 5 files changed, 122 insertions(+), 81 deletions(-) diff --git a/segmentation/main.py b/segmentation/main.py index 67aff45..56b11e7 100644 --- a/segmentation/main.py +++ b/segmentation/main.py @@ -241,7 +241,7 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, output_size=input_sizes[2], @@ -335,7 +335,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', diff --git a/segmentation/main_flip.py b/segmentation/main_flip.py index 61bf1cb..fb21b65 100644 --- a/segmentation/main_flip.py +++ b/segmentation/main_flip.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, Compose, RandomScale from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import FlipLoss as DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -169,6 +170,9 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'disagree': -1, 'current_win': -1, 'avg_weights': 1.0, 'loss': 0.0} while epoch < num_epochs: @@ -202,22 +206,23 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, probs, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels) + if with_sup: + loss, stats = criterion(outputs, probs, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() lr_scheduler.step() # Logging @@ -235,10 +240,11 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -282,16 +288,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -327,7 +334,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', @@ -385,9 +392,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -423,8 +432,6 @@ def after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -432,6 +439,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -439,7 +447,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], 
is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(gamma1=args.gamma1, gamma2=args.gamma2, ignore_index=255) @@ -455,6 +463,7 @@ def after_loading(): mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -471,10 +480,12 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() ratio = generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[2], - label_ratio=args.label_ratio, num_classes=num_classes) + label_ratio=args.label_ratio, num_classes=num_classes, + is_mixed_precision=args.mixed_precision) print(ratio) print('Pseudo labeling time: %.2fs' % (time.time() - time_now)) else: @@ -484,6 +495,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/main_naive.py b/segmentation/main_naive.py index e194cc5..88c0cde 100644 --- a/segmentation/main_naive.py +++ b/segmentation/main_naive.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, Compose, RandomScale from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import DynamicNaiveLoss as DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -169,6 +170,9 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'disagree': -1, 'current_win': -1, 'avg_weights': 1.0, 'loss': 0.0} while epoch < num_epochs: @@ -202,22 +206,23 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = 
torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, probs, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels) + if with_sup: + loss, stats = criterion(outputs, probs, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() lr_scheduler.step() # Logging @@ -235,10 +240,11 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -282,16 +288,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -327,7 +334,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', @@ -385,9 +392,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -423,8 +432,6 @@ def 
after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -432,6 +439,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -439,7 +447,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(ignore_index=255) @@ -455,6 +463,7 @@ def after_loading(): mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -471,6 +480,7 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() ratio = generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[2], @@ -484,6 +494,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/main_online.py b/segmentation/main_online.py index b6ccefc..d611598 100644 --- a/segmentation/main_online.py +++ b/segmentation/main_online.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, RandomScale, Compose from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import OnlineLoss as DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -173,6 +174,9 @@ def train(writer, loader_c, 
loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'loss': 0.0} while epoch < num_epochs: @@ -209,21 +213,23 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne labels_sup = labels_sup.to(device) # labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels_sup[inputs_c.shape[0]:].flatten(), outputs[inputs_c.shape[0]:].argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels_sup[inputs_c.shape[0]:].flatten(), outputs[inputs_c.shape[0]:].argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, labels_sup, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels_sup) + if with_sup: + loss, stats = criterion(outputs, labels_sup, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels_sup) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() + accelerator.backward(loss) + optimizer.step() optimizer.step() lr_scheduler.step() @@ -242,10 +248,11 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -289,16 +296,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -334,7 +342,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', @@ -392,9 +400,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -430,8 +440,6 @@ def after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -439,6 +447,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -446,7 +455,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], 
is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(ignore_index=255) @@ -462,6 +471,7 @@ def after_loading(): mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -478,6 +488,7 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[0], @@ -490,6 +501,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/utils/common.py b/segmentation/utils/common.py index 36d6146..2266ac4 100644 --- a/segmentation/utils/common.py +++ b/segmentation/utils/common.py @@ -4,7 +4,7 @@ from tqdm import tqdm import time from utils.functional import crop -from torch.cuda.amp import autocast, GradScaler +from torch.cuda.amp import autocast # Base directories base_voc = '../../voc_seg_deeplab/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012' From 68565d80716cd9a8e0c03acc1462b3092fbef33a Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:39:10 +0800 Subject: [PATCH 6/8] upgrade classification codes for multi-GPU --- classification/main_dmt.py | 63 +++++++++++++++++++++------------- classification/main_fs.py | 46 +++++++++++++++---------- classification/utils/common.py | 13 ++++--- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/classification/main_dmt.py b/classification/main_dmt.py index bf67b04..7a554b5 100644 --- a/classification/main_dmt.py +++ b/classification/main_dmt.py @@ -7,7 +7,6 @@ import numpy as np from tqdm import tqdm from torch.utils.tensorboard import SummaryWriter -from apex import amp from models.wideresnet import wrn_28_2 from utils.common import num_classes_cifar10, mean_cifar10, std_cifar10, input_sizes_cifar10, base_cifar10, \ load_checkpoint, save_checkpoint, EMA, rank_label_confidence @@ -18,6 +17,8 @@ from utils.randomrandaugment import RandomRandAugment from utils.cutout import Cutout from utils.autoaugment import CIFAR10Policy +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def get_transforms(auto_augment, input_sizes, m, mean, n, std): @@ -48,8 +49,9 @@ def get_transforms(auto_augment, input_sizes, m, mean, n, std): return test_transforms, train_transforms -def generate_pseudo_labels(net, device, loader, label_ratio, num_images, filename): - k = rank_label_confidence(net=net, device=device, loader=loader, ratio=label_ratio, num_images=num_images) +def generate_pseudo_labels(net, device, loader, label_ratio, num_images, filename, 
is_mixed_precision): + k = rank_label_confidence(net=net, device=device, loader=loader, ratio=label_ratio, num_images=num_images, + is_mixed_precision=is_mixed_precision) print(k) # 1 forward pass (build pickle file) selected_files = None @@ -59,9 +61,10 @@ def generate_pseudo_labels(net, device, loader, label_ratio, num_images, filenam for images, original_file in tqdm(loader): # Inference images = images.to(device) - outputs = net(images) - temp = torch.nn.functional.softmax(input=outputs, dim=-1) # ! softmax - pseudo_probabilities = temp.max(dim=-1).values + with autocast(is_mixed_precision): + outputs = net(images) + temp = torch.nn.functional.softmax(input=outputs, dim=-1) # ! softmax + pseudo_probabilities = temp.max(dim=-1).values # Select temp_predictions = temp[pseudo_probabilities > k].cpu().numpy() @@ -110,7 +113,7 @@ def init(mean, std, input_sizes, base, num_workers, prefix, val_set, train, batc return labeled_loader, unlabeled_loader, pseudo_labeled_loader, val_loader, unlabeled_set.__len__() -def test(loader, device, net, fine_grain=False): +def test(loader, device, net, fine_grain=False, is_mixed_precision=False): # Evaluate net.eval() test_correct = 0 @@ -119,7 +122,8 @@ def test(loader, device, net, fine_grain=False): with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image) + with autocast(is_mixed_precision): + output = net(image) test_all += target.shape[0] if fine_grain: predictions = output.softmax(1) @@ -154,6 +158,9 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri if val_num_steps is None: val_num_steps = min_len + if is_mixed_precision: + scaler = GradScaler() + net.train() # Use EMA to report final performance instead of select best checkpoint with valtiny @@ -203,7 +210,8 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri split_index=inputs_pseudo.shape[0], labeled_weight=labeled_weight) inputs, dynamic_weights, labels_a, labels_b, lam = mixup_data(x=inputs, w=dynamic_weights, y=labels, alpha=alpha, keep_max=True) - outputs = net(inputs) + with autocast(is_mixed_precision): + outputs = net(inputs) if alpha != -1: # Pseudo training accuracy & interesting loss @@ -218,12 +226,12 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri gamma1=gamma1, gamma2=gamma2) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() criterion.step() if lr_scheduler is not None: lr_scheduler.step() @@ -252,8 +260,9 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(pseudo_labeled_loader) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 - test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain) + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC + test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain, + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test accuracy', test_acc, current_step_num) @@ -284,7 +293,7 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='cifar10', @@ -350,9 +359,11 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri # torch.backends.cudnn.benchmark = False # Might hurt performance if args.exp_name != 'auto': exp_name = args.exp_name - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.valtiny: val_set = 'valtiny_seed1' else: @@ -373,8 +384,6 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri params_to_optimize = net.parameters() optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) # optimizer = torch.optim.Adam(params_to_optimize, lr=args.lr, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') if args.continue_from is not None: load_checkpoint(net=net, optimizer=None, lr_scheduler=None, @@ -385,13 +394,18 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri dataset=args.dataset, n=args.n, m=args.m, auto_augment=args.aa, input_sizes=input_sizes, std=std, num_workers=args.num_workers, batch_size_pseudo=args.batch_size_pseudo, train=False if args.labeling else True) + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) + # Pseudo labeling if args.labeling: time_now = time.time() sub_base = CIFAR10.base_folder filename = os.path.join(base, sub_base, args.train_set + '_pseudo') generate_pseudo_labels(net=net, device=device, loader=unlabeled_loader, filename=filename, - label_ratio=args.label_ratio, num_images=num_images) + label_ratio=args.label_ratio, num_images=num_images, + is_mixed_precision=args.mixed_precision) print('Pseudo labeling time: %.2fs' % (time.time() - time_now)) else: # Mutual-training @@ -402,7 +416,8 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri T_max=args.epochs * len(pseudo_labeled_loader)) writer = SummaryWriter('logs/' + exp_name) - best_acc = test(loader=val_loader, device=device, net=net, fine_grain=args.fine_grain) + best_acc = test(loader=val_loader, device=device, net=net, fine_grain=args.fine_grain, + is_mixed_precision=args.mixed_precision) save_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision) print('Original acc: ' + str(best_acc)) diff --git a/classification/main_fs.py b/classification/main_fs.py index 
fa03bea..6613bcf 100644 --- a/classification/main_fs.py +++ b/classification/main_fs.py @@ -7,7 +7,6 @@ from tqdm import tqdm from torch.utils.tensorboard import SummaryWriter from torchvision.transforms import Compose, RandomCrop, RandomHorizontalFlip, Normalize, ToTensor -from apex import amp from utils.randomrandaugment import RandomRandAugment from models.wideresnet import wrn_28_2 from utils.common import num_classes_cifar10, mean_cifar10, std_cifar10, input_sizes_cifar10, base_cifar10, \ @@ -16,6 +15,8 @@ from utils.autoaugment import CIFAR10Policy from utils.datasets import CIFAR10 from utils.mixup import mixup_criterion, mixup_data +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size, state, mean, std, input_sizes, base, num_workers, train_set, val_set, rand_augment=True, @@ -88,6 +89,9 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l if val_num_steps is None: val_num_steps = len(train_loader) + if is_mixed_precision: + scaler = GradScaler() + net.train() # Use EMA to report final performance instead of select best checkpoint with valtiny @@ -112,7 +116,8 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l if alpha is not None: inputs, labels_a, labels_b, lam = mixup_data(x=inputs, y=labels, alpha=alpha) - outputs = net(inputs) + with autocast(is_mixed_precision): + outputs = net(inputs) if alpha is not None: # Pseudo training accuracy & interesting loss @@ -125,12 +130,12 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l loss = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() if lr_scheduler is not None: lr_scheduler.step() @@ -150,8 +155,9 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(train_loader) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 - test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain) + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC + test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain, + is_mixed_precision=is_mixed_precision) writer.add_scalar('test accuracy', test_acc, current_step_num) @@ -180,7 +186,7 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l return best_acc -def test(loader, device, net, fine_grain=False): +def test(loader, device, net, fine_grain=False, is_mixed_precision=False): # Evaluate net.eval() test_correct = 0 @@ -189,7 +195,8 @@ def test(loader, device, net, fine_grain=False): with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image) + with autocast(is_mixed_precision): + output = net(image) test_all += target.shape[0] if fine_grain: predictions = output.softmax(1) @@ -213,7 +220,7 @@ def test(loader, device, net, fine_grain=False): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='cifar10', @@ -269,9 +276,11 @@ def test(loader, device, net, fine_grain=False): # torch.backends.cudnn.benchmark = False # Might hurt performance if args.exp_name != 'auto': exp_name = args.exp_name - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.valtiny: val_set = 'valtiny_seed1' else: @@ -293,16 +302,16 @@ def test(loader, device, net, fine_grain=False): params_to_optimize = net.parameters() optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) # optimizer = torch.optim.Adam(params_to_optimize, lr=args.lr, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Testing if args.state == 2: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size=args.batch_size, state=2, mean=mean, std=std, train_set=None, val_set=val_set, input_sizes=input_sizes, base=base, num_workers=args.num_workers, dataset=args.dataset) load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) - x = test(loader=test_loader, device=device, net=net, fine_grain=args.fine_grain) + x = test(loader=test_loader, device=device, net=net, fine_grain=args.fine_grain, + is_mixed_precision=args.mixed_precision) with open(args.log + '.txt', 'a') as f: f.write('test: ' + str(x) + '\n') @@ -315,6 +324,7 @@ def test(loader, device, net, fine_grain=False): train_set=args.train_set, val_set=val_set, n=args.n, m=args.m, rand_augment=args.ra, input_sizes=input_sizes, num_workers=args.num_workers, dataset=args.dataset) + net, optimizer, train_loader = accelerator.prepare(net, optimizer, train_loader) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=args.epochs * len(train_loader)) # lr_scheduler = None diff --git 
a/classification/utils/common.py b/classification/utils/common.py index 9e901a2..c431f5b 100644 --- a/classification/utils/common.py +++ b/classification/utils/common.py @@ -1,9 +1,11 @@ import torch import numpy as np import matplotlib.pyplot as plt -from apex import amp from tqdm import tqdm import collections +from accelerate import Accelerator +from torch.cuda.amp import autocast + mean_cifar10 = [0.49137, 0.48235, 0.44667] std_cifar10 = [0.24706, 0.24353, 0.26157] @@ -49,7 +51,7 @@ def load_checkpoint(net, optimizer, lr_scheduler, is_mixed_precision, filename): # Count for threshold (k) to select top confident labels -def rank_label_confidence(net, device, loader, ratio, num_images): +def rank_label_confidence(net, device, loader, ratio, num_images, is_mixed_precision): net.eval() if ratio >= 1: k = 0 @@ -61,9 +63,10 @@ def rank_label_confidence(net, device, loader, ratio, num_images): for images, _ in tqdm(loader): # Inference images = images.to(device) - outputs = net(images) - temp = torch.nn.functional.softmax(input=outputs, dim=1) # ! softmax - pseudo_probabilities = temp.max(dim=1).values + with autocast(is_mixed_precision): + outputs = net(images) + temp = torch.nn.functional.softmax(input=outputs, dim=1) # ! softmax + pseudo_probabilities = temp.max(dim=1).values temp_len = pseudo_probabilities.shape[0] # Count From d38e19284908b43f5de64892c8ac803064c1fb0a Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:49:54 +0800 Subject: [PATCH 7/8] doc --- CLASSIFICATION.md | 14 +++++++++++--- README.md | 9 ++++++--- SEGMENTATION.md | 12 ++++++++++-- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/CLASSIFICATION.md b/CLASSIFICATION.md index 8c1b92f..e6d2da0 100644 --- a/CLASSIFICATION.md +++ b/CLASSIFICATION.md @@ -34,7 +34,15 @@ The CIFAR-10 dataset can be downloaded and splitted to 5 random splits and valid ## Run the code -We provide examples in scripts and commands. Final results can be found at log.txt after training. +For multi-GPU/TPU/Distributed machine users, first run: + +``` +accelerate config +``` + +More details can be found at [Accelerate](https://github.com/huggingface/accelerate). Note that the mixed precision config cannot be used, you should still use `--mixed-precision` for that. + +We provide examples in scripts and commands. Final results can be found at `log.txt` after training. For example, with 1000 labels, to compare CL and DMT in a controlled experiment with same baseline model to start training: @@ -43,6 +51,6 @@ For example, with 1000 labels, to compare CL and DMT in a controlled experiment ./ss-dmt-full-1.sh ``` -Of course you'll need to run 5 times average to determine performance by changing the *seed* parameter (we used 1,2,3,4,5) in shell scripts. +You'll need to run 5 times average to determine performance by changing the `seed` parameter (we used 1,2,3,4,5) in shell scripts. -For small validation set, use *--valtiny*; for fine-grained testing, use *--fine-grain*. +For small validation set, use `--valtiny`; for fine-grained testing, use `--fine-grain`. diff --git a/README.md b/README.md index 1e0462b..bec9d9b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Sema ## News -### 2021.6.3 +### 2021.6.7 **Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is updated to PyTorch 1.6. 
Thanks to the codes & testing by [**@jinhuan-hit**](https://github.com/jinhuan-hit), and discussions from [**@lorenmt**](https://github.com/lorenmt), [**@TiankaiHang**](https://github.com/TiankaiHang). @@ -36,7 +36,7 @@ Also, thanks to [**@lorenmt**](https://github.com/lorenmt), a data augmentation ## Setup -You'll need a CUDA 10, Python3 environment (best on Linux) with PyTorch 1.2.0, TorchVision 0.4.0 and Apex to run the code in this repo. +First, you'll need a CUDA 10, Python3 environment (best on Linux). ### 1. Setup PyTorch & TorchVision: @@ -47,7 +47,7 @@ pip install torch==1.6.0 torchvision==0.7.0 ### 2. Install other python packages you may require: ``` -pip install packages accelerate future matplotlib tensorboard tqdm +pip install packaging accelerate future matplotlib tensorboard tqdm ``` ### 3. Download the code and prepare the scripts: @@ -66,9 +66,11 @@ Get started with [SEGMENTATION.md](SEGMENTATION.md) for semantic segmentation. Get started with [CLASSIFICATION.md](CLASSIFICATION.md) for image classification. ## Understand the code + We refer interested readers to this repository's [wiki](https://github.com/voldemortX/DST-CBC/wiki). *It is not updated for DMT yet.* ## Notes + It's best to use a **Turing** or **Volta** architecture GPU when running our code, since they have tensor cores and the computation speed is much faster with mixed precision. For instance, RTX 2080 Ti (which is what we used) or Tesla V100, RTX 20/30 series. Our implementation is fast and memory efficient. A whole run (train 2 models by DMT on PASCAL VOC 2012) takes about 8 hours on a single RTX 2080 Ti using up to 6GB graphic memory, including on-the-fly evaluations and training baselines. The Cityscapes experiments are even faster. @@ -98,3 +100,4 @@ The CBC part of the older version DST-CBC is adapted from [CRST](https://github. The overall implementation is based on [TorchVision](https://github.com/pytorch/vision) and [PyTorch](https://github.com/pytorch/pytorch). +The people who've helped to make the method & code better: [**lorenmt**](https://github.com/lorenmt), [**jinhuan-hit**](https://github.com/jinhuan-hit), [**TiankaiHang**](https://github.com/TiankaiHang), etc. diff --git a/SEGMENTATION.md b/SEGMENTATION.md index 748f25a..c33a7d0 100644 --- a/SEGMENTATION.md +++ b/SEGMENTATION.md @@ -97,6 +97,14 @@ ImageNet pre-trained weights will be automatically downloaded when running code. ## Run the code +For multi-GPU/TPU/Distributed machine users, first run: + +``` +accelerate config +``` + +More details can be found at [Accelerate](https://github.com/huggingface/accelerate). Note that the mixed precision config cannot be used, you should still use `--mixed-precision` for that. + We provide examples in scripts and commands. Final results can be found at log.txt after training. For example, run DMT with different pre-trained weights: @@ -120,8 +128,8 @@ python pascal_sbd_split.py ``` -Of course you'll need to run 3 times average to determine performance by changing the *sid* parameter (we used 0,1,2) in shell scripts. +Of course you'll need to run 3 times average to determine performance by changing the `sid` parameter (we used 0,1,2) in shell scripts. We also provide scripts for ablations, be sure to run *abl_baseline.sh* first. -For small validation set, use *--valtiny*. +For small validation set, use `--valtiny`. 
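For readers following the diffs above: every training script (`main.py`, `main_flip.py`, `main_naive.py`, `main_online.py`, `main_dmt.py`, `main_fs.py`) receives the same mechanical change, from `apex.amp` to native `torch.cuda.amp` plus Accelerate. Below is a simplified, self-contained sketch of that pattern with a toy model and random data; it mirrors the structure of the patched loops but is not the repository's actual training code.

```python
# Sketch only: a toy model and random data stand in for the repo's networks and loaders.
import torch
from accelerate import Accelerator
from torch.cuda.amp import autocast, GradScaler

is_mixed_precision = torch.cuda.is_available()  # stands in for the --mixed-precision flag
accelerator = Accelerator(split_batches=True)
device = accelerator.device

net = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()
dataset = torch.utils.data.TensorDataset(torch.randn(32, 8), torch.randint(0, 2, (32,)))
loader = torch.utils.data.DataLoader(dataset, batch_size=8)

# Accelerate wraps the model/optimizer/loader for single- or multi-GPU (or CPU) runs
net, optimizer, loader = accelerator.prepare(net, optimizer, loader)
scaler = GradScaler(enabled=is_mixed_precision)

for inputs, labels in loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()
    with autocast(enabled=is_mixed_precision):    # replaces apex amp.initialize(..., 'O1')
        loss = criterion(net(inputs), labels)
    if is_mixed_precision:
        accelerator.backward(scaler.scale(loss))  # backward on the scaled loss
        scaler.step(optimizer)                    # unscales gradients, then steps
        scaler.update()
    else:
        accelerator.backward(loss)
        optimizer.step()
```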
From 9594c90aad6fa6a85ae2b13c2ff3a519bd8cfcfc Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:57:09 +0800 Subject: [PATCH 8/8] notes --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index bec9d9b..2789b00 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ This repository contains the code for our paper [DMT: Dynamic Mutual Training fo Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Semantic Segmentation via Dynamic Self-Training and Class-Balanced Curriculum*, if you want the old code, you can check out the [dst-cbc](https://github.com/voldemortX/DST-CBC/tree/dst-cbc) branch. +Also, users of older PyTorch versions (<1.6.0), or anyone who needs the **exact** environment that produced the paper's results, can refer to 53853f6. +
@@ -12,7 +14,7 @@ Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Sema ### 2021.6.7 -**Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is updated to PyTorch 1.6. +**Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is upgraded to PyTorch 1.6. Thanks to the codes & testing by [**@jinhuan-hit**](https://github.com/jinhuan-hit), and discussions from [**@lorenmt**](https://github.com/lorenmt), [**@TiankaiHang**](https://github.com/TiankaiHang). ### 2021.2.10
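Patch 4 above changes `Accelerator()` to `Accelerator(split_batches=True)`. A minimal illustration of what that option means, assuming only the `accelerate` package (this is not repository code): with `split_batches=True`, the DataLoader's batch size is treated as the global batch size and divided across processes, so the batch-size arguments keep their single-GPU meaning on multi-GPU runs.

```python
# Illustration of Accelerator(split_batches=True); not code from this repository.
import torch
from accelerate import Accelerator

accelerator = Accelerator(split_batches=True)

# 64 toy samples, global batch size of 16
data = torch.arange(64, dtype=torch.float32).unsqueeze(1)
loader = torch.utils.data.DataLoader(data, batch_size=16)
loader = accelerator.prepare(loader)

first_batch = next(iter(loader))
# With split_batches=True, 16 is the TOTAL batch size:
#   1 process   -> this process sees 16 samples per step
#   4 processes -> each process sees 16 // 4 = 4 samples per step
# (With the default split_batches=False, each process would get 16,
#  i.e. an effective batch of 16 * num_processes.)
print(accelerator.num_processes, first_batch.shape[0])
```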