Skip to content

Commit

Permalink
Merge pull request #1220 from solopku/patch-7
Browse files Browse the repository at this point in the history
Update the training script for the cnn ms example
  • Loading branch information
chrishkchris authored Sep 17, 2024
2 parents d5ddd10 + ddad8e3 commit 33218e1
Showing 1 changed file with 279 additions and 1 deletion.
280 changes: 279 additions & 1 deletion examples/cnn_ms/train_ms_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def call_with_returns(self, loss):
pn_p_g_list.append([p.name, p, g]) # need iterables
return pn_p_g_list

# MSSGD -- actually no change of code
class MSSGD(MSOptimizer):
"""Implements stochastic gradient descent (optionally with momentum).
Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__.
Expand Down Expand Up @@ -205,6 +206,283 @@ def set_states(self, states):
self.mom_value = self.momentum(self.step_counter)


# Data augmentation
def augmentation(x, batch_size):
xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
for data_num in range(0, batch_size):
offset = np.random.randint(8, size=2)
x[data_num, :, :, :] = xpad[data_num, :,
offset[0]:offset[0] + x.shape[2],
offset[1]:offset[1] + x.shape[2]]
if_flip = np.random.randint(2)
if (if_flip):
x[data_num, :, :, :] = x[data_num, :, :, ::-1]
return x


# Calculate accuracy
def accuracy(pred, target):
# y is network output to be compared with ground truth (int)
y = np.argmax(pred, axis=1)
a = y == target
correct = np.array(a, "int").sum()
return correct


# Data partition according to the rank
def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
# Partition training data
data_per_rank = train_x.shape[0] // world_size
idx_start = global_rank * data_per_rank
idx_end = (global_rank + 1) * data_per_rank
train_x = train_x[idx_start:idx_end]
train_y = train_y[idx_start:idx_end]

# Partition evaluation data
data_per_rank = val_x.shape[0] // world_size
idx_start = global_rank * data_per_rank
idx_end = (global_rank + 1) * data_per_rank
val_x = val_x[idx_start:idx_end]
val_y = val_y[idx_start:idx_end]
return train_x, train_y, val_x, val_y


# Function to all reduce NUMPY accuracy and loss from multiple devices
def reduce_variable(variable, dist_opt, reducer):
reducer.copy_from_numpy(variable)
dist_opt.all_reduce(reducer.data)
dist_opt.wait()
output = tensor.to_numpy(reducer)
return output


def resize_dataset(x, image_size):
num_data = x.shape[0]
dim = x.shape[1]
X = np.zeros(shape=(num_data, dim, image_size, image_size),
dtype=np.float32)
for n in range(0, num_data):
for d in range(0, dim):
X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
(image_size, image_size), Image.BILINEAR),
dtype=np.float32)
return X


def run(global_rank,
world_size,
local_rank,
max_epoch,
batch_size,
model,
data,
mssgd,
graph,
verbosity,
dist_option='plain',
spars=None,
precision='float32'):
# dev = device.create_cuda_gpu_on(local_rank) # need to change to CPU device for CPU-only machines
dev = device.get_default_device()
dev.SetRandSeed(0)
np.random.seed(0)

if data == 'cifar10':
from data import cifar10
train_x, train_y, val_x, val_y = cifar10.load()
elif data == 'cifar100':
from data import cifar100
train_x, train_y, val_x, val_y = cifar100.load()
elif data == 'mnist':
from data import mnist
train_x, train_y, val_x, val_y = mnist.load()


num_channels = train_x.shape[1]
image_size = train_x.shape[2]
data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
num_classes = (np.max(train_y) + 1).item()

if model == 'resnet':
from model import resnet
model = resnet.resnet50(num_channels=num_channels,
num_classes=num_classes)
elif model == 'xceptionnet':
from model import xceptionnet
model = xceptionnet.create_model(num_channels=num_channels,
num_classes=num_classes)
elif model == 'cnn':
from model import cnn
model = cnn.create_model(num_channels=num_channels,
num_classes=num_classes)
elif model == 'alexnet':
from model import alexnet
model = alexnet.create_model(num_channels=num_channels,
num_classes=num_classes)
elif model == 'mlp':
import os, sys, inspect
current = os.path.dirname(
os.path.abspath(inspect.getfile(inspect.currentframe())))
parent = os.path.dirname(current)
sys.path.insert(0, parent)
from mlp import model
model = model.create_model(data_size=data_size,
num_classes=num_classes)

elif model == 'msmlp':
import os, sys, inspect
current = os.path.dirname(
os.path.abspath(inspect.getfile(inspect.currentframe())))
parent = os.path.dirname(current)
sys.path.insert(0, parent)
from msmlp import model
model = model.create_model(data_size=data_size,
num_classes=num_classes)

# For distributed training, sequential has better performance
if hasattr(mssgd, "communicator"):
DIST = True
sequential = True
else:
DIST = False
sequential = False

if DIST:
train_x, train_y, val_x, val_y = partition(global_rank, world_size,
train_x, train_y, val_x,
val_y)

if model.dimension == 4:
tx = tensor.Tensor(
(batch_size, num_channels, model.input_size, model.input_size), dev,
singa_dtype[precision])
elif model.dimension == 2:
tx = tensor.Tensor((batch_size, data_size), dev, singa_dtype[precision])
np.reshape(train_x, (train_x.shape[0], -1))
np.reshape(val_x, (val_x.shape[0], -1))

ty = tensor.Tensor((batch_size,), dev, tensor.int32)
num_train_batch = train_x.shape[0] // batch_size
num_val_batch = val_x.shape[0] // batch_size
idx = np.arange(train_x.shape[0], dtype=np.int32)

# Attach model to graph
model.set_optimizer(mssgd)
model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
dev.SetVerbosity(verbosity)

# Training and evaluation loop
for epoch in range(max_epoch):
start_time = time.time()
np.random.shuffle(idx)

if global_rank == 0:
print('Starting Epoch %d:' % (epoch))

# Training phase
train_correct = np.zeros(shape=[1], dtype=np.float32)
test_correct = np.zeros(shape=[1], dtype=np.float32)
train_loss = np.zeros(shape=[1], dtype=np.float32)

model.train()
print ("num_train_batch: \n", num_train_batch)
print ()
for b in range(num_train_batch):
if b % 200 == 0:
print ("b: \n", b)
# Generate the patch data in this iteration
x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
if model.dimension == 4:
x = augmentation(x, batch_size)
if (image_size != model.input_size):
x = resize_dataset(x, model.input_size)
x = x.astype(np_dtype[precision])
y = train_y[idx[b * batch_size:(b + 1) * batch_size]]


synflow_flag = False
# Train the model
if epoch == (max_epoch - 1) and b == (num_train_batch - 1): ### synflow calcuation for the last batch
print ("last epoch calculate synflow")
synflow_flag = True
### step 1: all one input
# Copy the patch data into input tensors
tx.copy_from_numpy(np.ones(x.shape, dtype=np.float32))
ty.copy_from_numpy(y)
### step 2: all weights turned to positive (done)
### step 3: new loss (done)
pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
### step 4: calculate the multiplication of weights
synflow_score = 0.0
for pn_p_g_item in pn_p_g_list:
print ("calculate weight param * grad parameter name: \n", pn_p_g_item[0])
if len(pn_p_g_item[1].shape) == 2: # param_value.data is "weight"
print ("pn_p_g_item[1].shape: \n", pn_p_g_item[1].shape)
synflow_score += np.sum(np.absolute(tensor.to_numpy(pn_p_g_item[1]) * tensor.to_numpy(pn_p_g_item[2])))
print ("synflow_score: \n", synflow_score)
elif epoch == (max_epoch - 1) and b == (num_train_batch - 2): # all weights turned to positive
# Copy the patch data into input tensors
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
train_correct += accuracy(tensor.to_numpy(out), y)
train_loss += tensor.to_numpy(loss)[0]
# all params turned to positive
for pn_p_g_item in pn_p_g_list:
print ("absolute value parameter name: \n", pn_p_g_item[0])
pn_p_g_item[1] = tensor.abs(pn_p_g_item[1]) # tensor actually ...
else: # normal train steps
# Copy the patch data into input tensors
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
# print ("normal before model(tx, ty, synflow_flag, dist_option, spars)")
# print ("train_cnn tx: \n", tx)
# print ("train_cnn ty: \n", ty)
pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
# print ("normal after model(tx, ty, synflow_flag, dist_option, spars)")
train_correct += accuracy(tensor.to_numpy(out), y)
train_loss += tensor.to_numpy(loss)[0]

if DIST:
# Reduce the evaluation accuracy and loss from multiple devices
reducer = tensor.Tensor((1,), dev, tensor.float32)
train_correct = reduce_variable(train_correct, mssgd, reducer)
train_loss = reduce_variable(train_loss, mssgd, reducer)

if global_rank == 0:
print('Training loss = %f, training accuracy = %f' %
(train_loss, train_correct /
(num_train_batch * batch_size * world_size)),
flush=True)

# Evaluation phase
model.eval()
for b in range(num_val_batch):
x = val_x[b * batch_size:(b + 1) * batch_size]
if model.dimension == 4:
if (image_size != model.input_size):
x = resize_dataset(x, model.input_size)
x = x.astype(np_dtype[precision])
y = val_y[b * batch_size:(b + 1) * batch_size]
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
out_test = model(tx)
test_correct += accuracy(tensor.to_numpy(out_test), y)

if DIST:
# Reduce the evaulation accuracy from multiple devices
test_correct = reduce_variable(test_correct, mssgd, reducer)

# Output the evaluation accuracy
if global_rank == 0:
print('Evaluation accuracy = %f, Elapsed Time = %fs' %
(test_correct / (num_val_batch * batch_size * world_size),
time.time() - start_time),
flush=True)

dev.PrintTimeProfiling()


if __name__ == '__main__':
# Use argparse to get command config: max_epoch, model, data, etc., for single gpu training
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -271,4 +549,4 @@ def set_states(self, states):
mssgd,
args.graph,
args.verbosity,
precision=args.precision)
precision=args.precision)

0 comments on commit 33218e1

Please sign in to comment.