From d5d38963e8d5f535c55d927f6599bd5fb50ca667 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 19:01:19 +0800 Subject: [PATCH 1/8] doc --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 8423ea8..1b59c3e 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,11 @@ Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Sema ## News +### 2021.6.3 + +**Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is updated to PyTorch 1.6. +Thanks to the codes & testing by [**@jinhuan-hit**](https://github.com/jinhuan-hit), and discussions from [**@lorenmt**](https://github.com/lorenmt), [**@TiankaiHang**](https://github.com/TiankaiHang). + ### 2021.2.10 A slight backbone architecture difference in the segmentation task has just been identified and described in Acknowledgement. From 6bc50737d331510e018317244ea4a6437c776d79 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 19:01:39 +0800 Subject: [PATCH 2/8] upgrade for segmentation main --- segmentation/main.py | 65 ++++++++++++++++++++------------ segmentation/utils/common.py | 25 ++++++------ segmentation/utils/transforms.py | 2 +- 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/segmentation/main.py b/segmentation/main.py index b14c7fd..713a654 100644 --- a/segmentation/main.py +++ b/segmentation/main.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, Compose, RandomScale from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -169,6 +170,9 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'disagree': -1, 'current_win': -1, 'avg_weights': 1.0, 'loss': 0.0} while epoch < num_epochs: @@ -202,22 +206,24 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, probs, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels) + if with_sup: + loss, stats = criterion(outputs, probs, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as 
scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() # Logging @@ -238,7 +244,8 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # A bug in Apex? https://github.com/NVIDIA/apex/issues/706 test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -282,16 +289,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -385,9 +393,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator() + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -423,8 +433,6 @@ def after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -432,6 +440,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -439,7 +448,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(gamma1=args.gamma1, gamma2=args.gamma2, ignore_index=255) @@ -455,6 +464,7 @@ def after_loading(): 
mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -471,10 +481,12 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() ratio = generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[2], - label_ratio=args.label_ratio, num_classes=num_classes) + label_ratio=args.label_ratio, num_classes=num_classes, + is_mixed_precision=args.mixed_precision) print(ratio) print('Pseudo labeling time: %.2fs' % (time.time() - time_now)) else: @@ -484,6 +496,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/utils/common.py b/segmentation/utils/common.py index 0744084..36d6146 100644 --- a/segmentation/utils/common.py +++ b/segmentation/utils/common.py @@ -2,9 +2,9 @@ import matplotlib.pyplot as plt import numpy as np from tqdm import tqdm -from apex import amp import time from utils.functional import crop +from torch.cuda.amp import autocast, GradScaler # Base directories base_voc = '../../voc_seg_deeplab/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012' @@ -119,7 +119,7 @@ def save_checkpoint(net, optimizer, lr_scheduler, is_mixed_precision, filename=' 'model': net.state_dict(), 'optimizer': optimizer.state_dict() if optimizer is not None else None, 'lr_scheduler': lr_scheduler.state_dict() if lr_scheduler is not None else None, - 'amp': amp.state_dict() if is_mixed_precision else None + # 'amp': amp.state_dict() if is_mixed_precision else None } torch.save(checkpoint, filename) @@ -132,11 +132,12 @@ def load_checkpoint(net, optimizer, lr_scheduler, is_mixed_precision, filename): optimizer.load_state_dict(checkpoint['optimizer']) if lr_scheduler is not None: lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - if is_mixed_precision and checkpoint['amp'] is not None: - amp.load_state_dict(checkpoint['amp']) + # if is_mixed_precision and checkpoint['amp'] is not None: + # amp.load_state_dict(checkpoint['amp']) -def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_thresholds=None): +def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_thresholds=None, + is_mixed_precision=False): # Generate pseudo labels and save to disk (negligible time compared with training) # Not very sure if there are any cache inconsistency issues (technically this should be fine) net.eval() @@ -151,8 +152,9 @@ def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_th with torch.no_grad(): for images, file_name_lists, heights, widths in 
tqdm(loader): images = images.to(device) - outputs = net(images)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) + with autocast(is_mixed_precision): + outputs = net(images)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) # Generate pseudo labels (d1 x d2 x 2) for i in range(0, len(file_name_lists)): @@ -180,7 +182,7 @@ def generate_pseudo_labels(net, device, loader, num_classes, input_size, cbst_th # Reimplemented (all converted to tensor ops) based on yzou2/CRST def generate_class_balanced_pseudo_labels(net, device, loader, label_ratio, num_classes, input_size, - down_sample_rate=16, buffer_size=100): + down_sample_rate=16, buffer_size=100, is_mixed_precision=False): # Max memory usage surge ratio has an upper limit of 2x (caused by array concatenation). # Keep a fixed GPU buffer size to achieve a good enough speed-memory trade-off, # since casting to cpu is very slow. @@ -201,8 +203,9 @@ def generate_class_balanced_pseudo_labels(net, device, loader, label_ratio, num_ with torch.no_grad(): for images, _, heights, widths in tqdm(loader): images = images.to(device) - outputs = net(images)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) + with autocast(is_mixed_precision): + outputs = net(images)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_size, mode='bilinear', align_corners=True) # Generate pseudo labels (d1 x d2) and reassemble for i in range(0, len(heights)): @@ -246,4 +249,4 @@ def generate_class_balanced_pseudo_labels(net, device, loader, label_ratio, num_ print(kc) return generate_pseudo_labels(net=net, device=device, loader=loader, cbst_thresholds=torch.tensor(kc), - input_size=input_size, num_classes=num_classes) + input_size=input_size, num_classes=num_classes, is_mixed_precision=is_mixed_precision) diff --git a/segmentation/utils/transforms.py b/segmentation/utils/transforms.py index d6ec7bb..48ea7b1 100644 --- a/segmentation/utils/transforms.py +++ b/segmentation/utils/transforms.py @@ -159,7 +159,7 @@ def label_to_tensor(pic): # 3 dimensional arrays or normal segmentation masks if isinstance(pic, np.ndarray): return torch.as_tensor(pic.transpose((2, 0, 1)), dtype=torch.float32) else: - return torch.as_tensor(np.asarray(pic), dtype=torch.int64) + return torch.as_tensor(np.asarray(pic).copy(), dtype=torch.int64) def _pil_to_tensor(self, pic): # Convert a PIL Image to tensor(a direct copy) From 223a10b6e27d98d86fd260701327265ee74f306e Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 19:12:23 +0800 Subject: [PATCH 3/8] update pip --- README.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1b59c3e..1e0462b 100644 --- a/README.md +++ b/README.md @@ -38,22 +38,16 @@ Also, thanks to [**@lorenmt**](https://github.com/lorenmt), a data augmentation You'll need a CUDA 10, Python3 environment (best on Linux) with PyTorch 1.2.0, TorchVision 0.4.0 and Apex to run the code in this repo. -### 1. Setup the exact version of Apex & PyTorch & TorchVision for mixed precision training: +### 1. 
Setup PyTorch & TorchVision: ``` -pip install https://download.pytorch.org/whl/cu100/torch-1.2.0-cp36-cp36m-manylinux1_x86_64.whl && pip install https://download.pytorch.org/whl/cu100/torchvision-0.4.0-cp36-cp36m-manylinux1_x86_64.whl -git clone https://github.com/NVIDIA/apex -cd apex -pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ +pip install torch==1.6.0 torchvision==0.7.0 ``` -!There seems to be an issue of apex installations from the official repo sometimes. If you encounter errors, we suggest you use our stored older apex [codes](https://drive.google.com/open?id=1x8enpvdTTZ3RChf17XvcLdSYulUPg3sR). - -**PyTorch 1.6** now includes automatic mixed precision at apex level "O1". We probably will update this repo accordingly in the future. ### 2. Install other python packages you may require: ``` -pip install future matplotlib tensorboard tqdm +pip install packages accelerate future matplotlib tensorboard tqdm ``` ### 3. Download the code and prepare the scripts: From 01815ea927122ee0104cac462adb563c52c90466 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Thu, 3 Jun 2021 20:48:30 +0800 Subject: [PATCH 4/8] split batches --- segmentation/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segmentation/main.py b/segmentation/main.py index 713a654..67aff45 100644 --- a/segmentation/main.py +++ b/segmentation/main.py @@ -396,7 +396,7 @@ def after_loading(): # device = torch.device('cpu') # if torch.cuda.is_available(): # device = torch.device('cuda:0') - accelerator = Accelerator() + accelerator = Accelerator(split_batches=True) device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean From 218633e2a878698b9afcba554bfb281b76bd7f78 Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:38:31 +0800 Subject: [PATCH 5/8] modify other segmentation codes --- segmentation/main.py | 4 +-- segmentation/main_flip.py | 68 ++++++++++++++++++++++-------------- segmentation/main_naive.py | 65 ++++++++++++++++++++-------------- segmentation/main_online.py | 64 ++++++++++++++++++++------------- segmentation/utils/common.py | 2 +- 5 files changed, 122 insertions(+), 81 deletions(-) diff --git a/segmentation/main.py b/segmentation/main.py index 67aff45..56b11e7 100644 --- a/segmentation/main.py +++ b/segmentation/main.py @@ -241,7 +241,7 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, output_size=input_sizes[2], @@ -335,7 +335,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', diff --git a/segmentation/main_flip.py b/segmentation/main_flip.py index 61bf1cb..fb21b65 100644 --- a/segmentation/main_flip.py +++ b/segmentation/main_flip.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, Compose, RandomScale from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import FlipLoss as DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -169,6 +170,9 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'disagree': -1, 'current_win': -1, 'avg_weights': 1.0, 'loss': 0.0} while epoch < num_epochs: @@ -202,22 +206,23 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, probs, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels) + if with_sup: + loss, stats = criterion(outputs, probs, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() lr_scheduler.step() # Logging @@ -235,10 +240,11 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -282,16 +288,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -327,7 +334,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', @@ -385,9 +392,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -423,8 +432,6 @@ def after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -432,6 +439,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -439,7 +447,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], 
is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(gamma1=args.gamma1, gamma2=args.gamma2, ignore_index=255) @@ -455,6 +463,7 @@ def after_loading(): mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -471,10 +480,12 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() ratio = generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[2], - label_ratio=args.label_ratio, num_classes=num_classes) + label_ratio=args.label_ratio, num_classes=num_classes, + is_mixed_precision=args.mixed_precision) print(ratio) print('Pseudo labeling time: %.2fs' % (time.time() - time_now)) else: @@ -484,6 +495,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/main_naive.py b/segmentation/main_naive.py index e194cc5..88c0cde 100644 --- a/segmentation/main_naive.py +++ b/segmentation/main_naive.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, Compose, RandomScale from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import DynamicNaiveLoss as DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -169,6 +170,9 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'disagree': -1, 'current_win': -1, 'avg_weights': 1.0, 'loss': 0.0} while epoch < num_epochs: @@ -202,22 +206,23 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne inputs = inputs.to(device) labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = 
torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels.flatten(), outputs.argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, probs, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels) + if with_sup: + loss, stats = criterion(outputs, probs, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() lr_scheduler.step() # Logging @@ -235,10 +240,11 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -282,16 +288,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -327,7 +334,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', @@ -385,9 +392,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -423,8 +432,6 @@ def 
after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -432,6 +439,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -439,7 +447,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(ignore_index=255) @@ -455,6 +463,7 @@ def after_loading(): mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -471,6 +480,7 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() ratio = generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[2], @@ -484,6 +494,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/main_online.py b/segmentation/main_online.py index b6ccefc..d611598 100644 --- a/segmentation/main_online.py +++ b/segmentation/main_online.py @@ -14,11 +14,12 @@ from utils.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, RandomResize, Resize, LabelMap, \ ZeroPad, RandomScale, Compose from torch.utils.tensorboard import SummaryWriter -from apex import amp from utils.common import colors_city, colors_voc, categories_voc, categories_city, sizes_city, sizes_voc, \ num_classes_voc, num_classes_city, coco_mean, coco_std, imagenet_mean, imagenet_std, ConfusionMatrix, \ load_checkpoint, save_checkpoint, generate_class_balanced_pseudo_labels, base_city, base_voc, label_id_map_city from utils.losses import OnlineLoss as DynamicMutualLoss +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size_labeled, batch_size_pseudo, state, split, input_sizes, sets_id, std, mean, @@ -173,6 +174,9 @@ def train(writer, loader_c, 
loader_sup, validation_loader, device, criterion, ne if with_sup: iter_sup = iter(loader_sup) + if is_mixed_precision: + scaler = GradScaler() + # Training running_stats = {'loss': 0.0} while epoch < num_epochs: @@ -209,21 +213,23 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne labels_sup = labels_sup.to(device) # labels = labels.to(device) optimizer.zero_grad() - outputs = net(inputs)['out'] - outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) - conf_mat.update(labels_sup[inputs_c.shape[0]:].flatten(), outputs[inputs_c.shape[0]:].argmax(1).flatten()) + with autocast(is_mixed_precision): + outputs = net(inputs)['out'] + outputs = torch.nn.functional.interpolate(outputs, size=input_sizes[0], mode='bilinear', align_corners=True) + conf_mat.update(labels_sup[inputs_c.shape[0]:].flatten(), outputs[inputs_c.shape[0]:].argmax(1).flatten()) - if with_sup: - loss, stats = criterion(outputs, labels_sup, inputs_c.shape[0]) - else: - loss, stats = criterion(outputs, labels_sup) + if with_sup: + loss, stats = criterion(outputs, labels_sup, inputs_c.shape[0]) + else: + loss, stats = criterion(outputs, labels_sup) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() + accelerator.backward(loss) + optimizer.step() optimizer.step() lr_scheduler.step() @@ -242,10 +248,11 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(loader_c) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC test_pixel_accuracy, test_mIoU = test_one_set(loader=validation_loader, device=device, net=net, num_classes=num_classes, categories=categories, - output_size=input_sizes[2]) + output_size=input_sizes[2], + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test pixel accuracy', test_pixel_accuracy, current_step_num) @@ -289,16 +296,17 @@ def train(writer, loader_c, loader_sup, validation_loader, device, criterion, ne # Copied and modified from torch/vision/references/segmentation -def test_one_set(loader, device, net, categories, num_classes, output_size): +def test_one_set(loader, device, net, categories, num_classes, output_size, is_mixed_precision): # Evaluate on 1 data_loader (cudnn impact < 0.003%) net.eval() conf_mat = ConfusionMatrix(num_classes) with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image)['out'] - output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) - conf_mat.update(target.flatten(), output.argmax(1).flatten()) + with autocast(is_mixed_precision): + output = net(image)['out'] + output = torch.nn.functional.interpolate(output, size=output_size, mode='bilinear', align_corners=True) + conf_mat.update(target.flatten(), output.argmax(1).flatten()) acc_global, acc, iu = conf_mat.compute() print(categories) @@ -334,7 +342,7 @@ def after_loading(): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='voc', @@ -392,9 +400,11 @@ def after_loading(): exp_name = args.exp_name with open(exp_name + '_cfg.txt', 'w') as f: f.write(str(vars(args))) - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.coco: # This Caffe pre-trained model takes "inhuman" mean/std & input format mean = coco_mean std = coco_std @@ -430,8 +440,6 @@ def after_loading(): ] optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Just to be safe (a little bit more memory, by all means, save it to disk if you want) if args.state == 1: @@ -439,6 +447,7 @@ def after_loading(): # Testing if args.state == 3: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size_labeled=args.batch_size_labeled, batch_size_pseudo=args.batch_size_pseudo, state=3, split=None, valtiny=args.valtiny, no_aug=args.no_aug, input_sizes=input_sizes, data_set=args.dataset, sets_id=args.sets_id, @@ -446,7 +455,7 @@ def after_loading(): load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) test_one_set(loader=test_loader, device=device, net=net, categories=categories, num_classes=num_classes, - output_size=input_sizes[2]) + output_size=input_sizes[2], 
is_mixed_precision=args.mixed_precision) else: x = 0 criterion = DynamicMutualLoss(ignore_index=255) @@ -462,6 +471,7 @@ def after_loading(): mean=mean, std=std, keep_scale=keep_scale, no_aug=args.no_aug, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader = accelerator.prepare(net, optimizer, labeled_loader) x = train(writer=writer, loader_c=labeled_loader, loader_sup=None, validation_loader=val_loader, device=device, criterion=criterion, net=net, optimizer=optimizer, lr_scheduler=lr_scheduler, @@ -478,6 +488,7 @@ def after_loading(): state=0, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer = accelerator.prepare(net, optimizer) time_now = time.time() generate_class_balanced_pseudo_labels(net=net, device=device, loader=unlabeled_loader, input_size=input_sizes[0], @@ -490,6 +501,9 @@ def after_loading(): state=1, split=args.train_set, input_sizes=input_sizes, sets_id=args.sets_id, mean=mean, std=std, keep_scale=keep_scale, reverse_channels=reverse_channels) after_loading() + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) x = train(writer=writer, loader_c=pseudo_labeled_loader, loader_sup=labeled_loader, validation_loader=val_loader, lr_scheduler=lr_scheduler, device=device, criterion=criterion, net=net, optimizer=optimizer, diff --git a/segmentation/utils/common.py b/segmentation/utils/common.py index 36d6146..2266ac4 100644 --- a/segmentation/utils/common.py +++ b/segmentation/utils/common.py @@ -4,7 +4,7 @@ from tqdm import tqdm import time from utils.functional import crop -from torch.cuda.amp import autocast, GradScaler +from torch.cuda.amp import autocast # Base directories base_voc = '../../voc_seg_deeplab/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012' From 68565d80716cd9a8e0c03acc1462b3092fbef33a Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:39:10 +0800 Subject: [PATCH 6/8] upgrade classification codes for multi-GPU --- classification/main_dmt.py | 63 +++++++++++++++++++++------------- classification/main_fs.py | 46 +++++++++++++++---------- classification/utils/common.py | 13 ++++--- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/classification/main_dmt.py b/classification/main_dmt.py index bf67b04..7a554b5 100644 --- a/classification/main_dmt.py +++ b/classification/main_dmt.py @@ -7,7 +7,6 @@ import numpy as np from tqdm import tqdm from torch.utils.tensorboard import SummaryWriter -from apex import amp from models.wideresnet import wrn_28_2 from utils.common import num_classes_cifar10, mean_cifar10, std_cifar10, input_sizes_cifar10, base_cifar10, \ load_checkpoint, save_checkpoint, EMA, rank_label_confidence @@ -18,6 +17,8 @@ from utils.randomrandaugment import RandomRandAugment from utils.cutout import Cutout from utils.autoaugment import CIFAR10Policy +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def get_transforms(auto_augment, input_sizes, m, mean, n, std): @@ -48,8 +49,9 @@ def get_transforms(auto_augment, input_sizes, m, mean, n, std): return test_transforms, train_transforms -def generate_pseudo_labels(net, device, loader, label_ratio, num_images, filename): - k = rank_label_confidence(net=net, device=device, loader=loader, ratio=label_ratio, num_images=num_images) +def generate_pseudo_labels(net, device, loader, label_ratio, num_images, filename, 
is_mixed_precision): + k = rank_label_confidence(net=net, device=device, loader=loader, ratio=label_ratio, num_images=num_images, + is_mixed_precision=is_mixed_precision) print(k) # 1 forward pass (build pickle file) selected_files = None @@ -59,9 +61,10 @@ def generate_pseudo_labels(net, device, loader, label_ratio, num_images, filenam for images, original_file in tqdm(loader): # Inference images = images.to(device) - outputs = net(images) - temp = torch.nn.functional.softmax(input=outputs, dim=-1) # ! softmax - pseudo_probabilities = temp.max(dim=-1).values + with autocast(is_mixed_precision): + outputs = net(images) + temp = torch.nn.functional.softmax(input=outputs, dim=-1) # ! softmax + pseudo_probabilities = temp.max(dim=-1).values # Select temp_predictions = temp[pseudo_probabilities > k].cpu().numpy() @@ -110,7 +113,7 @@ def init(mean, std, input_sizes, base, num_workers, prefix, val_set, train, batc return labeled_loader, unlabeled_loader, pseudo_labeled_loader, val_loader, unlabeled_set.__len__() -def test(loader, device, net, fine_grain=False): +def test(loader, device, net, fine_grain=False, is_mixed_precision=False): # Evaluate net.eval() test_correct = 0 @@ -119,7 +122,8 @@ def test(loader, device, net, fine_grain=False): with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image) + with autocast(is_mixed_precision): + output = net(image) test_all += target.shape[0] if fine_grain: predictions = output.softmax(1) @@ -154,6 +158,9 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri if val_num_steps is None: val_num_steps = min_len + if is_mixed_precision: + scaler = GradScaler() + net.train() # Use EMA to report final performance instead of select best checkpoint with valtiny @@ -203,7 +210,8 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri split_index=inputs_pseudo.shape[0], labeled_weight=labeled_weight) inputs, dynamic_weights, labels_a, labels_b, lam = mixup_data(x=inputs, w=dynamic_weights, y=labels, alpha=alpha, keep_max=True) - outputs = net(inputs) + with autocast(is_mixed_precision): + outputs = net(inputs) if alpha != -1: # Pseudo training accuracy & interesting loss @@ -218,12 +226,12 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri gamma1=gamma1, gamma2=gamma2) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() criterion.step() if lr_scheduler is not None: lr_scheduler.step() @@ -252,8 +260,9 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(pseudo_labeled_loader) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 - test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain) + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC + test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain, + is_mixed_precision=is_mixed_precision) writer.add_scalar(tensorboard_prefix + 'test accuracy', test_acc, current_step_num) @@ -284,7 +293,7 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='cifar10', @@ -350,9 +359,11 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri # torch.backends.cudnn.benchmark = False # Might hurt performance if args.exp_name != 'auto': exp_name = args.exp_name - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.valtiny: val_set = 'valtiny_seed1' else: @@ -373,8 +384,6 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri params_to_optimize = net.parameters() optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) # optimizer = torch.optim.Adam(params_to_optimize, lr=args.lr, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') if args.continue_from is not None: load_checkpoint(net=net, optimizer=None, lr_scheduler=None, @@ -385,13 +394,18 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri dataset=args.dataset, n=args.n, m=args.m, auto_augment=args.aa, input_sizes=input_sizes, std=std, num_workers=args.num_workers, batch_size_pseudo=args.batch_size_pseudo, train=False if args.labeling else True) + net, optimizer, labeled_loader, pseudo_labeled_loader = accelerator.prepare(net, optimizer, + labeled_loader, + pseudo_labeled_loader) + # Pseudo labeling if args.labeling: time_now = time.time() sub_base = CIFAR10.base_folder filename = os.path.join(base, sub_base, args.train_set + '_pseudo') generate_pseudo_labels(net=net, device=device, loader=unlabeled_loader, filename=filename, - label_ratio=args.label_ratio, num_images=num_images) + label_ratio=args.label_ratio, num_images=num_images, + is_mixed_precision=args.mixed_precision) print('Pseudo labeling time: %.2fs' % (time.time() - time_now)) else: # Mutual-training @@ -402,7 +416,8 @@ def train(writer, labeled_loader, pseudo_labeled_loader, val_loader, device, cri T_max=args.epochs * len(pseudo_labeled_loader)) writer = SummaryWriter('logs/' + exp_name) - best_acc = test(loader=val_loader, device=device, net=net, fine_grain=args.fine_grain) + best_acc = test(loader=val_loader, device=device, net=net, fine_grain=args.fine_grain, + is_mixed_precision=args.mixed_precision) save_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision) print('Original acc: ' + str(best_acc)) diff --git a/classification/main_fs.py b/classification/main_fs.py index 
fa03bea..6613bcf 100644 --- a/classification/main_fs.py +++ b/classification/main_fs.py @@ -7,7 +7,6 @@ from tqdm import tqdm from torch.utils.tensorboard import SummaryWriter from torchvision.transforms import Compose, RandomCrop, RandomHorizontalFlip, Normalize, ToTensor -from apex import amp from utils.randomrandaugment import RandomRandAugment from models.wideresnet import wrn_28_2 from utils.common import num_classes_cifar10, mean_cifar10, std_cifar10, input_sizes_cifar10, base_cifar10, \ @@ -16,6 +15,8 @@ from utils.autoaugment import CIFAR10Policy from utils.datasets import CIFAR10 from utils.mixup import mixup_criterion, mixup_data +from accelerate import Accelerator +from torch.cuda.amp import autocast, GradScaler def init(batch_size, state, mean, std, input_sizes, base, num_workers, train_set, val_set, rand_augment=True, @@ -88,6 +89,9 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l if val_num_steps is None: val_num_steps = len(train_loader) + if is_mixed_precision: + scaler = GradScaler() + net.train() # Use EMA to report final performance instead of select best checkpoint with valtiny @@ -112,7 +116,8 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l if alpha is not None: inputs, labels_a, labels_b, lam = mixup_data(x=inputs, y=labels, alpha=alpha) - outputs = net(inputs) + with autocast(is_mixed_precision): + outputs = net(inputs) if alpha is not None: # Pseudo training accuracy & interesting loss @@ -125,12 +130,12 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l loss = criterion(outputs, labels) if is_mixed_precision: - # 2/3 & 3/3 of mixed precision training with amp - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + accelerator.backward(scaler.scale(loss)) + scaler.step(optimizer) + scaler.update() else: - loss.backward() - optimizer.step() + accelerator.backward(loss) + optimizer.step() if lr_scheduler is not None: lr_scheduler.step() @@ -150,8 +155,9 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l # Validate and find the best snapshot if current_step_num % val_num_steps == (val_num_steps - 1) or \ current_step_num == num_epochs * len(train_loader) - 1: - # A bug in Apex? 
https://github.com/NVIDIA/apex/issues/706 - test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain) + # Apex bug https://github.com/NVIDIA/apex/issues/706, fixed in PyTorch1.6, kept here for BC + test_acc = test(loader=val_loader, device=device, net=net, fine_grain=fine_grain, + is_mixed_precision=is_mixed_precision) writer.add_scalar('test accuracy', test_acc, current_step_num) @@ -180,7 +186,7 @@ def train(writer, train_loader, val_loader, device, criterion, net, optimizer, l return best_acc -def test(loader, device, net, fine_grain=False): +def test(loader, device, net, fine_grain=False, is_mixed_precision=False): # Evaluate net.eval() test_correct = 0 @@ -189,7 +195,8 @@ def test(loader, device, net, fine_grain=False): with torch.no_grad(): for image, target in tqdm(loader): image, target = image.to(device), target.to(device) - output = net(image) + with autocast(is_mixed_precision): + output = net(image) test_all += target.shape[0] if fine_grain: predictions = output.softmax(1) @@ -213,7 +220,7 @@ def test(loader, device, net, fine_grain=False): if __name__ == '__main__': # Settings - parser = argparse.ArgumentParser(description='PyTorch 1.2.0 && torchvision 0.4.0') + parser = argparse.ArgumentParser(description='PyTorch 1.6.0 && torchvision 0.7.0') parser.add_argument('--exp-name', type=str, default='auto', help='Name of the experiment (default: auto)') parser.add_argument('--dataset', type=str, default='cifar10', @@ -269,9 +276,11 @@ def test(loader, device, net, fine_grain=False): # torch.backends.cudnn.benchmark = False # Might hurt performance if args.exp_name != 'auto': exp_name = args.exp_name - device = torch.device('cpu') - if torch.cuda.is_available(): - device = torch.device('cuda:0') + # device = torch.device('cpu') + # if torch.cuda.is_available(): + # device = torch.device('cuda:0') + accelerator = Accelerator(split_batches=True) + device = accelerator.device if args.valtiny: val_set = 'valtiny_seed1' else: @@ -293,16 +302,16 @@ def test(loader, device, net, fine_grain=False): params_to_optimize = net.parameters() optimizer = torch.optim.SGD(params_to_optimize, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) # optimizer = torch.optim.Adam(params_to_optimize, lr=args.lr, weight_decay=args.weight_decay) - if args.mixed_precision: - net, optimizer = amp.initialize(net, optimizer, opt_level='O1') # Testing if args.state == 2: + net, optimizer = accelerator.prepare(net, optimizer) test_loader = init(batch_size=args.batch_size, state=2, mean=mean, std=std, train_set=None, val_set=val_set, input_sizes=input_sizes, base=base, num_workers=args.num_workers, dataset=args.dataset) load_checkpoint(net=net, optimizer=None, lr_scheduler=None, is_mixed_precision=args.mixed_precision, filename=args.continue_from) - x = test(loader=test_loader, device=device, net=net, fine_grain=args.fine_grain) + x = test(loader=test_loader, device=device, net=net, fine_grain=args.fine_grain, + is_mixed_precision=args.mixed_precision) with open(args.log + '.txt', 'a') as f: f.write('test: ' + str(x) + '\n') @@ -315,6 +324,7 @@ def test(loader, device, net, fine_grain=False): train_set=args.train_set, val_set=val_set, n=args.n, m=args.m, rand_augment=args.ra, input_sizes=input_sizes, num_workers=args.num_workers, dataset=args.dataset) + net, optimizer, train_loader = accelerator.prepare(net, optimizer, train_loader) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=args.epochs * len(train_loader)) # lr_scheduler = None diff --git 
a/classification/utils/common.py b/classification/utils/common.py index 9e901a2..c431f5b 100644 --- a/classification/utils/common.py +++ b/classification/utils/common.py @@ -1,9 +1,11 @@ import torch import numpy as np import matplotlib.pyplot as plt -from apex import amp from tqdm import tqdm import collections +from accelerate import Accelerator +from torch.cuda.amp import autocast + mean_cifar10 = [0.49137, 0.48235, 0.44667] std_cifar10 = [0.24706, 0.24353, 0.26157] @@ -49,7 +51,7 @@ def load_checkpoint(net, optimizer, lr_scheduler, is_mixed_precision, filename): # Count for threshold (k) to select top confident labels -def rank_label_confidence(net, device, loader, ratio, num_images): +def rank_label_confidence(net, device, loader, ratio, num_images, is_mixed_precision): net.eval() if ratio >= 1: k = 0 @@ -61,9 +63,10 @@ def rank_label_confidence(net, device, loader, ratio, num_images): for images, _ in tqdm(loader): # Inference images = images.to(device) - outputs = net(images) - temp = torch.nn.functional.softmax(input=outputs, dim=1) # ! softmax - pseudo_probabilities = temp.max(dim=1).values + with autocast(is_mixed_precision): + outputs = net(images) + temp = torch.nn.functional.softmax(input=outputs, dim=1) # ! softmax + pseudo_probabilities = temp.max(dim=1).values temp_len = pseudo_probabilities.shape[0] # Count From d38e19284908b43f5de64892c8ac803064c1fb0a Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:49:54 +0800 Subject: [PATCH 7/8] doc --- CLASSIFICATION.md | 14 +++++++++++--- README.md | 9 ++++++--- SEGMENTATION.md | 12 ++++++++++-- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/CLASSIFICATION.md b/CLASSIFICATION.md index 8c1b92f..e6d2da0 100644 --- a/CLASSIFICATION.md +++ b/CLASSIFICATION.md @@ -34,7 +34,15 @@ The CIFAR-10 dataset can be downloaded and splitted to 5 random splits and valid ## Run the code -We provide examples in scripts and commands. Final results can be found at log.txt after training. +For multi-GPU/TPU/Distributed machine users, first run: + +``` +accelerate config +``` + +More details can be found at [Accelerate](https://github.com/huggingface/accelerate). Note that the mixed precision config cannot be used, you should still use `--mixed-precision` for that. + +We provide examples in scripts and commands. Final results can be found at `log.txt` after training. For example, with 1000 labels, to compare CL and DMT in a controlled experiment with same baseline model to start training: @@ -43,6 +51,6 @@ For example, with 1000 labels, to compare CL and DMT in a controlled experiment ./ss-dmt-full-1.sh ``` -Of course you'll need to run 5 times average to determine performance by changing the *seed* parameter (we used 1,2,3,4,5) in shell scripts. +You'll need to run 5 times average to determine performance by changing the `seed` parameter (we used 1,2,3,4,5) in shell scripts. -For small validation set, use *--valtiny*; for fine-grained testing, use *--fine-grain*. +For small validation set, use `--valtiny`; for fine-grained testing, use `--fine-grain`. diff --git a/README.md b/README.md index 1e0462b..bec9d9b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Sema ## News -### 2021.6.3 +### 2021.6.7 **Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is updated to PyTorch 1.6. 
Thanks to the codes & testing by [**@jinhuan-hit**](https://github.com/jinhuan-hit), and discussions from [**@lorenmt**](https://github.com/lorenmt), [**@TiankaiHang**](https://github.com/TiankaiHang). @@ -36,7 +36,7 @@ Also, thanks to [**@lorenmt**](https://github.com/lorenmt), a data augmentation ## Setup -You'll need a CUDA 10, Python3 environment (best on Linux) with PyTorch 1.2.0, TorchVision 0.4.0 and Apex to run the code in this repo. +First, you'll need a CUDA 10, Python3 environment (best on Linux). ### 1. Setup PyTorch & TorchVision: @@ -47,7 +47,7 @@ pip install torch==1.6.0 torchvision==0.7.0 ### 2. Install other python packages you may require: ``` -pip install packages accelerate future matplotlib tensorboard tqdm +pip install packaging accelerate future matplotlib tensorboard tqdm ``` ### 3. Download the code and prepare the scripts: @@ -66,9 +66,11 @@ Get started with [SEGMENTATION.md](SEGMENTATION.md) for semantic segmentation. Get started with [CLASSIFICATION.md](CLASSIFICATION.md) for image classification. ## Understand the code + We refer interested readers to this repository's [wiki](https://github.com/voldemortX/DST-CBC/wiki). *It is not updated for DMT yet.* ## Notes + It's best to use a **Turing** or **Volta** architecture GPU when running our code, since they have tensor cores and the computation speed is much faster with mixed precision. For instance, RTX 2080 Ti (which is what we used) or Tesla V100, RTX 20/30 series. Our implementation is fast and memory efficient. A whole run (train 2 models by DMT on PASCAL VOC 2012) takes about 8 hours on a single RTX 2080 Ti using up to 6GB graphic memory, including on-the-fly evaluations and training baselines. The Cityscapes experiments are even faster. @@ -98,3 +100,4 @@ The CBC part of the older version DST-CBC is adapted from [CRST](https://github. The overall implementation is based on [TorchVision](https://github.com/pytorch/vision) and [PyTorch](https://github.com/pytorch/pytorch). +The people who've helped to make the method & code better: [**lorenmt**](https://github.com/lorenmt), [**jinhuan-hit**](https://github.com/jinhuan-hit), [**TiankaiHang**](https://github.com/TiankaiHang), etc. diff --git a/SEGMENTATION.md b/SEGMENTATION.md index 748f25a..c33a7d0 100644 --- a/SEGMENTATION.md +++ b/SEGMENTATION.md @@ -97,6 +97,14 @@ ImageNet pre-trained weights will be automatically downloaded when running code. ## Run the code +For multi-GPU/TPU/Distributed machine users, first run: + +``` +accelerate config +``` + +More details can be found at [Accelerate](https://github.com/huggingface/accelerate). Note that the mixed precision config cannot be used, you should still use `--mixed-precision` for that. + We provide examples in scripts and commands. Final results can be found at log.txt after training. For example, run DMT with different pre-trained weights: @@ -120,8 +128,8 @@ python pascal_sbd_split.py ``` -Of course you'll need to run 3 times average to determine performance by changing the *sid* parameter (we used 0,1,2) in shell scripts. +Of course you'll need to run 3 times average to determine performance by changing the `sid` parameter (we used 0,1,2) in shell scripts. We also provide scripts for ablations, be sure to run *abl_baseline.sh* first. -For small validation set, use *--valtiny*. +For small validation set, use `--valtiny`. 
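For readers following the diffs above: every training script (`main.py`, `main_flip.py`, `main_naive.py`, `main_online.py`, `main_dmt.py`, `main_fs.py`) receives the same mechanical change, from `apex.amp` to native `torch.cuda.amp` plus Accelerate. Below is a simplified, self-contained sketch of that pattern with a toy model and random data; it mirrors the structure of the patched loops but is not the repository's actual training code.

```python
# Sketch only: a toy model and random data stand in for the repo's networks and loaders.
import torch
from accelerate import Accelerator
from torch.cuda.amp import autocast, GradScaler

is_mixed_precision = torch.cuda.is_available()  # stands in for the --mixed-precision flag
accelerator = Accelerator(split_batches=True)
device = accelerator.device

net = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()
dataset = torch.utils.data.TensorDataset(torch.randn(32, 8), torch.randint(0, 2, (32,)))
loader = torch.utils.data.DataLoader(dataset, batch_size=8)

# Accelerate wraps the model/optimizer/loader for single- or multi-GPU (or CPU) runs
net, optimizer, loader = accelerator.prepare(net, optimizer, loader)
scaler = GradScaler(enabled=is_mixed_precision)

for inputs, labels in loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()
    with autocast(enabled=is_mixed_precision):    # replaces apex amp.initialize(..., 'O1')
        loss = criterion(net(inputs), labels)
    if is_mixed_precision:
        accelerator.backward(scaler.scale(loss))  # backward on the scaled loss
        scaler.step(optimizer)                    # unscales gradients, then steps
        scaler.update()
    else:
        accelerator.backward(loss)
        optimizer.step()
```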
From 9594c90aad6fa6a85ae2b13c2ff3a519bd8cfcfc Mon Sep 17 00:00:00 2001 From: voldemortX Date: Mon, 7 Jun 2021 13:57:09 +0800 Subject: [PATCH 8/8] notes --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index bec9d9b..2789b00 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ This repository contains the code for our paper [DMT: Dynamic Mutual Training fo Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Semantic Segmentation via Dynamic Self-Training and Class-Balanced Curriculum*, if you want the old code, you can check out the [dst-cbc](https://github.com/voldemortX/DST-CBC/tree/dst-cbc) branch. +Also, users of older PyTorch versions (<1.6.0), or anyone who needs the **exact** environment that produced the paper's results, can refer to 53853f6. +
@@ -12,7 +14,7 @@ Some might know it as the previous version **DST-CBC**, or *Semi-Supervised Sema ### 2021.6.7 -**Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is updated to PyTorch 1.6. +**Multi-GPU** training support (based on [Accelerate](https://github.com/huggingface/accelerate)) is added, and the whole project is upgraded to PyTorch 1.6. Thanks to the codes & testing by [**@jinhuan-hit**](https://github.com/jinhuan-hit), and discussions from [**@lorenmt**](https://github.com/lorenmt), [**@TiankaiHang**](https://github.com/TiankaiHang). ### 2021.2.10
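Patch 4 above changes `Accelerator()` to `Accelerator(split_batches=True)`. A minimal illustration of what that option means, assuming only the `accelerate` package (this is not repository code): with `split_batches=True`, the DataLoader's batch size is treated as the global batch size and divided across processes, so the batch-size arguments keep their single-GPU meaning on multi-GPU runs.

```python
# Illustration of Accelerator(split_batches=True); not code from this repository.
import torch
from accelerate import Accelerator

accelerator = Accelerator(split_batches=True)

# 64 toy samples, global batch size of 16
data = torch.arange(64, dtype=torch.float32).unsqueeze(1)
loader = torch.utils.data.DataLoader(data, batch_size=16)
loader = accelerator.prepare(loader)

first_batch = next(iter(loader))
# With split_batches=True, 16 is the TOTAL batch size:
#   1 process   -> this process sees 16 samples per step
#   4 processes -> each process sees 16 // 4 = 4 samples per step
# (With the default split_batches=False, each process would get 16,
#  i.e. an effective batch of 16 * num_processes.)
print(accelerator.num_processes, first_batch.shape[0])
```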