From ddad8e337613540e83bf06bff63ebf9f16684580 Mon Sep 17 00:00:00 2001
From: Cai Shaofeng
Date: Tue, 17 Sep 2024 17:09:43 +0800
Subject: [PATCH] Update the training script for the cnn ms example

Update the training script for the cnn ms example
---
 examples/cnn_ms/train_ms_model.py | 280 +++++++++++++++++++++++++++++-
 1 file changed, 279 insertions(+), 1 deletion(-)

diff --git a/examples/cnn_ms/train_ms_model.py b/examples/cnn_ms/train_ms_model.py
index d306a458c..529db1c46 100644
--- a/examples/cnn_ms/train_ms_model.py
+++ b/examples/cnn_ms/train_ms_model.py
@@ -55,6 +55,7 @@ def call_with_returns(self, loss):
             pn_p_g_list.append([p.name, p, g])  # need iterables
         return pn_p_g_list
 
+# MSSGD: identical to the stock SGD optimizer; no functional change
 class MSSGD(MSOptimizer):
     """Implements stochastic gradient descent (optionally with momentum).
 
     Nesterov momentum is based on the formula from
     `On the importance of initialization and momentum in deep learning`__.
@@ -205,6 +206,283 @@ def set_states(self, states):
         self.mom_value = self.momentum(self.step_counter)
 
 
+# Data augmentation: symmetric padding, random crop, random horizontal flip
+def augmentation(x, batch_size):
+    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for data_num in range(0, batch_size):
+        offset = np.random.randint(8, size=2)
+        x[data_num, :, :, :] = xpad[data_num, :,
+                                    offset[0]:offset[0] + x.shape[2],
+                                    offset[1]:offset[1] + x.shape[3]]
+        if_flip = np.random.randint(2)
+        if if_flip:
+            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+    return x
+
+
+# Calculate accuracy
+def accuracy(pred, target):
+    # y is the network output to be compared with the ground truth (int)
+    y = np.argmax(pred, axis=1)
+    a = y == target
+    correct = np.array(a, "int").sum()
+    return correct
+
+
+# Data partition according to the rank
+def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
+    # Partition training data
+    data_per_rank = train_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    train_x = train_x[idx_start:idx_end]
+    train_y = train_y[idx_start:idx_end]
+
+    # Partition evaluation data
+    data_per_rank = val_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    val_x = val_x[idx_start:idx_end]
+    val_y = val_y[idx_start:idx_end]
+    return train_x, train_y, val_x, val_y
+
+
+# Function to all-reduce a NumPy accuracy/loss value across multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    dist_opt.wait()
+    output = tensor.to_numpy(reducer)
+    return output
+
+
+def resize_dataset(x, image_size):
+    num_data = x.shape[0]
+    dim = x.shape[1]
+    X = np.zeros(shape=(num_data, dim, image_size, image_size),
+                 dtype=np.float32)
+    for n in range(0, num_data):
+        for d in range(0, dim):
+            X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
+                (image_size, image_size), Image.BILINEAR),
+                                     dtype=np.float32)
+    return X
+
+
+def run(global_rank,
+        world_size,
+        local_rank,
+        max_epoch,
+        batch_size,
+        model,
+        data,
+        mssgd,
+        graph,
+        verbosity,
+        dist_option='plain',
+        spars=None,
+        precision='float32'):
+    # dev = device.create_cuda_gpu_on(local_rank)  # enable on GPU machines
+    dev = device.get_default_device()  # default (CPU) device for CPU-only machines
+    dev.SetRandSeed(0)
+    np.random.seed(0)
+
+    if data == 'cifar10':
+        from data import cifar10
+        train_x, train_y, val_x, val_y = cifar10.load()
+    elif data == 'cifar100':
+        from data import cifar100
+        train_x, train_y, val_x, val_y = cifar100.load()
+    elif data == 'mnist':
+        from data import mnist
+        train_x, train_y, val_x, val_y = mnist.load()
+
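+    # The loaders are assumed to return NCHW image arrays with integer
+    # labels (e.g. CIFAR-10: (N, 3, 32, 32) images, 10 classes), from
+    # which the settings below are derived.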
+    num_channels = train_x.shape[1]
+    image_size = train_x.shape[2]
+    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
+    num_classes = (np.max(train_y) + 1).item()
+
+    if model == 'resnet':
+        from model import resnet
+        model = resnet.resnet50(num_channels=num_channels,
+                                num_classes=num_classes)
+    elif model == 'xceptionnet':
+        from model import xceptionnet
+        model = xceptionnet.create_model(num_channels=num_channels,
+                                         num_classes=num_classes)
+    elif model == 'cnn':
+        from model import cnn
+        model = cnn.create_model(num_channels=num_channels,
+                                 num_classes=num_classes)
+    elif model == 'alexnet':
+        from model import alexnet
+        model = alexnet.create_model(num_channels=num_channels,
+                                     num_classes=num_classes)
+    elif model == 'mlp':
+        import os, sys, inspect
+        current = os.path.dirname(
+            os.path.abspath(inspect.getfile(inspect.currentframe())))
+        parent = os.path.dirname(current)
+        sys.path.insert(0, parent)
+        from mlp import model
+        model = model.create_model(data_size=data_size,
+                                   num_classes=num_classes)
+    elif model == 'msmlp':
+        import os, sys, inspect
+        current = os.path.dirname(
+            os.path.abspath(inspect.getfile(inspect.currentframe())))
+        parent = os.path.dirname(current)
+        sys.path.insert(0, parent)
+        from msmlp import model
+        model = model.create_model(data_size=data_size,
+                                   num_classes=num_classes)
+
+    # For distributed training, sequential has better performance
+    if hasattr(mssgd, "communicator"):
+        DIST = True
+        sequential = True
+    else:
+        DIST = False
+        sequential = False
+
+    if DIST:
+        train_x, train_y, val_x, val_y = partition(global_rank, world_size,
+                                                   train_x, train_y, val_x,
+                                                   val_y)
+
+    if model.dimension == 4:
+        tx = tensor.Tensor(
+            (batch_size, num_channels, model.input_size, model.input_size),
+            dev, singa_dtype[precision])
+    elif model.dimension == 2:
+        tx = tensor.Tensor((batch_size, data_size), dev,
+                           singa_dtype[precision])
+        # assign the reshaped arrays back (np.reshape is not in-place)
+        train_x = np.reshape(train_x, (train_x.shape[0], -1))
+        val_x = np.reshape(val_x, (val_x.shape[0], -1))
+
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    num_train_batch = train_x.shape[0] // batch_size
+    num_val_batch = val_x.shape[0] // batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+    # Attach model to graph
+    model.set_optimizer(mssgd)
+    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
+    dev.SetVerbosity(verbosity)
+
+    # Training and evaluation loop
+    for epoch in range(max_epoch):
+        start_time = time.time()
+        np.random.shuffle(idx)
+
+        if global_rank == 0:
+            print('Starting Epoch %d:' % epoch)
+
+        # Training phase
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        model.train()
+        print("num_train_batch:", num_train_batch)
+        for b in range(num_train_batch):
+            if b % 200 == 0:
+                print("b:", b)
+            # Generate the batch data for this iteration
+            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+            if model.dimension == 4:
+                x = augmentation(x, batch_size)
+                if image_size != model.input_size:
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+
+            synflow_flag = False
+            # Train the model
+            if epoch == (max_epoch - 1) and b == (num_train_batch - 1):  # synflow calculation for the last batch
+                print("last epoch: calculating synflow")
+                synflow_flag = True
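+                # SynFlow (Tanaka et al., 2020) scores each parameter with an
+                # all-ones input and absolute-valued weights, so every
+                # |theta * dL/dtheta| term accumulated below is positive and
+                # measures the synaptic flow through that weight.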
+                ### step 1: all-ones input
+                # Copy the batch data into the input tensors
+                tx.copy_from_numpy(np.ones(x.shape, dtype=np.float32))
+                ty.copy_from_numpy(y)
+                ### step 2: all weights turned positive (done in the previous batch)
+                ### step 3: new loss (done)
+                pn_p_g_list, out, loss = model(tx, ty, dist_option, spars,
+                                               synflow_flag)
+                ### step 4: sum |weight * grad| over the weight matrices
+                synflow_score = 0.0
+                for pn_p_g_item in pn_p_g_list:
+                    print("weight * grad, parameter name:", pn_p_g_item[0])
+                    if len(pn_p_g_item[1].shape) == 2:  # param is a "weight" matrix
+                        print("pn_p_g_item[1].shape:", pn_p_g_item[1].shape)
+                        synflow_score += np.sum(
+                            np.absolute(tensor.to_numpy(pn_p_g_item[1]) *
+                                        tensor.to_numpy(pn_p_g_item[2])))
+                print("synflow_score:", synflow_score)
+            elif epoch == (max_epoch - 1) and b == (num_train_batch - 2):  # turn all weights positive
+                # Copy the batch data into the input tensors
+                tx.copy_from_numpy(x)
+                ty.copy_from_numpy(y)
+                pn_p_g_list, out, loss = model(tx, ty, dist_option, spars,
+                                               synflow_flag)
+                train_correct += accuracy(tensor.to_numpy(out), y)
+                train_loss += tensor.to_numpy(loss)[0]
+                # replace every parameter tensor with its absolute value
+                for pn_p_g_item in pn_p_g_list:
+                    print("absolute value, parameter name:", pn_p_g_item[0])
+                    pn_p_g_item[1] = tensor.abs(pn_p_g_item[1])
+            else:  # normal train steps
+                # Copy the batch data into the input tensors
+                tx.copy_from_numpy(x)
+                ty.copy_from_numpy(y)
+                pn_p_g_list, out, loss = model(tx, ty, dist_option, spars,
+                                               synflow_flag)
+                train_correct += accuracy(tensor.to_numpy(out), y)
+                train_loss += tensor.to_numpy(loss)[0]
+
+        if DIST:
+            # Reduce the training accuracy and loss from multiple devices
+            reducer = tensor.Tensor((1,), dev, tensor.float32)
+            train_correct = reduce_variable(train_correct, mssgd, reducer)
+            train_loss = reduce_variable(train_loss, mssgd, reducer)
+
+        if global_rank == 0:
+            print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   (num_train_batch * batch_size * world_size)),
+                  flush=True)
+
+        # Evaluation phase
+        model.eval()
+        for b in range(num_val_batch):
+            x = val_x[b * batch_size:(b + 1) * batch_size]
+            if model.dimension == 4:
+                if image_size != model.input_size:
+                    x = resize_dataset(x, model.input_size)
+            x = x.astype(np_dtype[precision])
+            y = val_y[b * batch_size:(b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out_test = model(tx)
+            test_correct += accuracy(tensor.to_numpy(out_test), y)
+
+        if DIST:
+            # Reduce the evaluation accuracy from multiple devices
+            test_correct = reduce_variable(test_correct, mssgd, reducer)
+
+        # Output the evaluation accuracy
+        if global_rank == 0:
+            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / (num_val_batch * batch_size * world_size),
+                   time.time() - start_time),
+                  flush=True)
+
+    dev.PrintTimeProfiling()
+
 
 if __name__ == '__main__':
     # Use argparse to get command config: max_epoch, model, data, etc., for single gpu training
     parser = argparse.ArgumentParser(
@@ -271,4 +549,4 @@ def set_states(self, states):
         mssgd,
         args.graph,
         args.verbosity,
-        precision=args.precision)
\ No newline at end of file
+        precision=args.precision)
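The last-batch branch above implements the SynFlow saliency score: the sum of
|theta * dL/dtheta| over the 2-D weight matrices, computed from an all-ones
input after every parameter has been made positive. A minimal NumPy-only
sketch of that scoring step, assuming plain arrays (the function and argument
names are illustrative, not part of this patch):

    import numpy as np

    def synflow_score(params, grads):
        # Sum |theta * dL/dtheta| over 2-D weight matrices, mirroring the
        # last-batch branch of the training loop in train_ms_model.py.
        score = 0.0
        for theta, grad in zip(params, grads):
            if theta.ndim == 2:  # score only "weight" matrices, as the script does
                score += np.sum(np.abs(theta * grad))
        return score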