# fusion_gm.py

import  torch, \
        ground_metric_gm    as gm, \
        gurobi_qap          as gb,  \
        model_gm            as model


def total_node_num( network:torch.nn.Module ):
    '''
    return the number of nodes of [network], derived from its non-bias parameters

    every parameter whose name does not contain 'bias' contributes its first
        dimension; if the very first parameter of the network is such a
        parameter, its second dimension (the input nodes) is counted as well
    '''
    # keep the original position of every non-bias parameter, since only the
    #   parameter at position 0 is allowed to contribute the input nodes
    weight_entries = [
        (position, tensor)
        for position, (label, tensor) in enumerate( network.named_parameters() )
        if 'bias' not in label
    ]
    total = sum( tensor.shape[0] for _, tensor in weight_entries )
    if weight_entries and weight_entries[0][0] == 0:
        total += weight_entries[0][1].shape[1]
    return total


def graph_matching_fusion( args, networks:list ):
    '''
    align the parameters of networks[0] with those of networks[1] and return
        their weighted average together with a completeness flag

    steps performed by this function:
        1.  count the nodes of both networks with total_node_num (they must be equal)
        2.  walk through the parameters of both networks in lockstep and fill an
            (n1*n2) x (n1*n2) affinity matrix with the soft affinities returned by
            gm.Ground_Metric_GM( ... ).process_soft_affinity( p=2 )
        3.  hand the affinity matrix to gb.gurobi_qap_solver( ..., time_limit=300 )
        4.  multiply the parameters of networks[0] with a per-layer permutation matrix
        5.  average the (aligned) parameters of networks[0] with those of networks[1]

    NOTE(review): in step 4 the permutation matrix is currently hard-coded to the
        identity matrix (the line reading it from [solution] is commented out), so
        the solver result does not influence the returned weights and
        [perm_is_complete] stays True -- confirm whether this is intentional

    parameters:
        args:       object providing the attribute [args.ensemble_step]
        networks:   list whose first two entries expose named_parameters() /
                    parameters() (torch modules) with identically shaped parameters

    returns:
        a tuple ( averaged_weights, perm_is_complete ), where
        averaged_weights[idx] = (1 - args.ensemble_step) * aligned parameter of networks[0]
                                + args.ensemble_step * parameter of networks[1],
            following the order of networks[1].parameters(), and
        perm_is_complete is False if any permutation matrix used for the alignment
            does not sum up to its number of rows

    raises:
        AssertionError if the node counts or parameter shapes of the two networks
            differ, or if the layer bookkeeping of the alignment loop does not end
            exactly at the last parameter
    '''
    '''
    count the number of nodes in network[0] and network[1], and store them
        as [n1] and [n2], respectively
    '''
    n1 = total_node_num( network=networks[0] )
    n2 = total_node_num( network=networks[1] )
    assert( n1 == n2 )
    '''
    define affinity matrix
    '''
    # the index i * n2 + j used below appears to encode the pair
    #   (node i of networks[0], node j of networks[1])
    affinity = torch.zeros([ n1 * n2, n1 * n2 ])
    '''
    iterate through all the layers to calculate the pair-wise distances / affinities
    suppose the layer node numbers are:
        N1(inputs), N2, ..., N(l-1), Nl(outputs), then
    [num_nodes_incremental] = [ N1,             N1+N2,          ..., N1+N2+...+N(l-1)     ]
    [num_nodes_layers]      = [ N2,             N3,             ..., Nl                   ]
    [pre_conv_list]         = [ conv(layer1~2), conv(layer2~3), ..., conv(layer(l-1~l)) ]
        it does not contain bias layers
    [conv_kernel_size_list] = [ kernel_size(1), ..., kernel_size(l) ]
    '''
    # [num_layers] counts every parameter tensor, bias tensors included
    num_layers = len( list( zip( networks[0].parameters(), networks[1].parameters() ) ) )
    num_nodes_before = 0
    num_nodes_incremental = []
    num_nodes_layers = []
    pre_conv_list = []
    conv_kernel_size_list = []
    num_nodes_pre = 0
    num_nodes_cur = 0
    is_conv = False
    pre_conv = False
    pre_conv_kernel_size = None
    pre_conv_out_channel = 1
    is_bias = False
    is_final_bias = False
    pre_bias = False
    perm_is_complete = True

    # snapshot of the (name, parameter) pairs of networks[0], used for look-ups by index
    named_weight_list_0 = [named_parameter for named_parameter in networks[0].named_parameters()]
    for idx, ( (_, fc_layer0_weight), (_, fc_layer1_weight) ) in \
            enumerate( zip( networks[0].named_parameters(), networks[1].named_parameters() ) ):
        assert fc_layer0_weight.shape == fc_layer1_weight.shape
        layer_shape = fc_layer0_weight.shape
        # the first dimension of the parameter is taken as the node number of the current layer
        num_nodes_cur = fc_layer0_weight.shape[0]
        # for 1-D parameters [num_nodes_pre] keeps the value of the previous iteration
        if len( layer_shape ) > 1:
            # if it's a fully-connected layer after a convolutional layer
            if pre_conv is True and len( layer_shape ) == 2:
                num_nodes_pre = pre_conv_out_channel
            else:
                num_nodes_pre = fc_layer0_weight.shape[1]
        '''
        tell whether the layer is convolutional or fully-connected or bias
        '''
        # if is_bias is False:
        #     pre_conv = is_conv
        #     pre_conv_list.append( pre_conv )
        # [pre_bias] becomes True once the preceding parameter is 1-D; the `else`
        #   below belongs to `if idx >= 1`, so it only runs for idx == 0
        # NOTE(review): for idx >= 1 the flag is never reset to False, i.e. it stays
        #   True for all later parameters -- confirm that this is intended
        if idx >= 1:
            if len( named_weight_list_0[idx-1][1].shape ) == 1:
                pre_bias = True
        else:
            pre_bias = False
        # more than 2 dimensions: treated as a convolutional weight
        if len( layer_shape ) > 2:
            is_bias = False
            if pre_bias == False:
                pre_conv = is_conv
                pre_conv_list.append( True )
            is_conv = True
            # For convolutional layers, it is (#out_channels, #in_channels, height, width)
            # the kernel dimensions are flattened into one trailing dimension
            fc_layer0_weight_data = fc_layer0_weight.data.view(
                fc_layer0_weight.shape[0], fc_layer0_weight.shape[1], -1)
            fc_layer1_weight_data = fc_layer1_weight.data.view(
                fc_layer1_weight.shape[0], fc_layer1_weight.shape[1], -1)
        # exactly 2 dimensions: treated as a fully-connected weight
        elif len( layer_shape ) == 2:
            is_bias = False
            if pre_bias == False:
                pre_conv = is_conv
                pre_conv_list.append( False )
            is_conv = False
            fc_layer0_weight_data = fc_layer0_weight.data
            fc_layer1_weight_data = fc_layer1_weight.data
        # fewer than 2 dimensions: treated as a bias, whatever the parameter name is
        else:
            is_bias = True
            if pre_bias == False:
                pre_conv = is_conv
                pre_conv_list.append( False )
            is_conv = False
            fc_layer0_weight_data = fc_layer0_weight.data
            fc_layer1_weight_data = fc_layer1_weight.data
        '''
        if it's conv, update [pre_conv_out_channel]
        '''
        if is_conv:
            pre_conv_out_channel = num_nodes_cur
        '''
        tell whether it's the final bias layer
        '''
        if is_bias is True and idx == num_layers - 1:
            is_final_bias = True
        '''
        if it's the first layer, map the input nodes
        '''
        # the diagonal entry of the pair (input node a, input node a) is set to 1
        if idx == 0:
            for a in range( num_nodes_pre ):
                affinity[(num_nodes_before + a) * n2 + num_nodes_before + a] \
                        [(num_nodes_before + a) * n2 + num_nodes_before + a] \
                = 1
        '''
        if it's the final layer, map the output nodes
        '''
        # `and` binds tighter than `or`: the condition holds for the second-to-last
        #   parameter when it is followed by a parameter named '...bias...', or for the
        #   last parameter when its own name does not contain 'bias'
        if  idx == num_layers - 2 and 'bias' in named_weight_list_0[idx+1][0] or \
            idx == num_layers - 1 and 'bias' not in named_weight_list_0[idx][0]:
            for a in range( num_nodes_cur ):
                affinity[(num_nodes_before + num_nodes_pre + a) * n2 + num_nodes_before + num_nodes_pre + a] \
                        [(num_nodes_before + num_nodes_pre + a) * n2 + num_nodes_before + num_nodes_pre + a] \
                = 1
        '''
        calculate the edge-wise soft affinities between two models
        '''
        # the last argument is shape[1] / pre_conv_out_channel for weights and 1 for biases;
        #   its meaning is defined by gm.Ground_Metric_GM -- presumably the number of
        #   input features per channel of a preceding conv layer, TODO confirm
        if is_bias is False:
            ground_metric = gm.Ground_Metric_GM( 
                fc_layer0_weight_data, fc_layer1_weight_data, is_conv, is_bias,
                pre_conv, int( fc_layer0_weight_data.shape[1] / pre_conv_out_channel ) )
        else:
            ground_metric = gm.Ground_Metric_GM( 
                fc_layer0_weight_data, fc_layer1_weight_data, is_conv, is_bias,
                pre_conv, 1 )
            
        layer_affinity = ground_metric.process_soft_affinity( p=2 )
        # print( f'is_conf = {is_conv}, fc layer shape is {fc_layer0_weight.shape}' )
        # one entry per non-bias parameter: the 4th dimension for conv weights, None otherwise
        if is_bias is False:
            pre_conv_kernel_size = fc_layer0_weight.shape[3] if is_conv else None
            conv_kernel_size_list.append( pre_conv_kernel_size )
        '''
        copy the affinity values from [layer_affinity] to the corresponding positions
            in [affinity] matrix
        '''
        # non-final bias: layer_affinity[a][c] is written to the diagonal entry of the
        #   pair (a, c); [num_nodes_before] is only advanced for non-bias parameters (see
        #   the end of this loop body)
        if is_bias is True and is_final_bias is False:
            for a in range( num_nodes_cur ):
                for c in range( num_nodes_cur ):
                    affinity[(num_nodes_before + a) * n2 + num_nodes_before + c] \
                            [(num_nodes_before + a) * n2 + num_nodes_before + c] \
                    = layer_affinity[a][c]
        # weight parameters: for every edge (a, b) between the previous and the current
        #   layer, row a + b * num_nodes_pre of [layer_affinity] is reshaped to
        #   (num_nodes_cur, num_nodes_pre), transposed and written as one block
        elif is_final_bias is False:
            for a in range( num_nodes_pre ):
                for b in range( num_nodes_cur ):
                    affinity[ 
                        (num_nodes_before + a) * n2 + num_nodes_before :
                        (num_nodes_before + a) * n2 + num_nodes_before + num_nodes_pre,
                        (num_nodes_before + num_nodes_pre + b) * n2 + num_nodes_before + num_nodes_pre :
                        (num_nodes_before + num_nodes_pre + b) * n2 + num_nodes_before + num_nodes_pre + num_nodes_cur ] \
                    = layer_affinity[a + b * num_nodes_pre].view( num_nodes_cur, num_nodes_pre ).transpose( 0, 1 )
        '''
        update the total number of nodes that has already been considered in previous steps
        '''
        # only non-bias parameters advance the node offset and extend the per-layer lists
        if is_bias is False:
            num_nodes_before += num_nodes_pre
            num_nodes_incremental.append( num_nodes_before )
            num_nodes_layers.append( num_nodes_cur )

    '''
    solve the quadratic assignment problem by calling gurobipy package
    '''
    solution = gb.gurobi_qap_solver( affinity, n1, n2, time_limit=300 )
    
    # debug block begin (uncomment and unindent the following to debug)
        # torch. set_printoptions(profile="full")
        # print( f'affinity matrix is \n{affinity}' )
        # print( f'solution is \n{solution}' )
        # torch. set_printoptions(profile="default")
        # return 
    # debug block end
    '''
    perform the alignment to network[0] according to the solution

    [idx] represents the index of layers, including 'bias' layers


    '''
    aligned_wt_0 = [parameter.data for name, parameter in named_weight_list_0]
    idx = 0
    num_layers = len( aligned_wt_0 )
    '''
    for each iteration, the weight matrix between two layers (e.g. L_i and L_{i+1}) are considered
        [num_before] denotes N_1 + N_2 + ... + N_i
        [num_cur] denotes N_{i+1}
        [pre_conv] denotes whether weights between L_i and L_{i+1} is convolutional
        [cur_kernel_size] denotes the kenrel_size of the current weight matrix

    for each iteration, 
        1. align the weights between L_{i-1} and L_i
        2. align the bias on L_i (if bias exists)
        3. align the weights between L_i and L_{i+1}
    '''
    # zip stops at the shortest of the four lists; the loop variable [pre_conv] shadows
    #   the flag of the same name used in the loop above
    # NOTE(review): [num_before] is only referenced by the commented-out line below and
    #   [cur_kernel_size] is not referenced in this loop body
    for num_before, num_cur, pre_conv, cur_kernel_size in \
        zip(num_nodes_incremental, num_nodes_layers, pre_conv_list, conv_kernel_size_list):
        '''
        obtain permutation matrix according to the solution

        some preliminaries about permutation matrix:
            1.  firstly, we define a permuation function Pi: {1,...,M} --> {1,...,M}, so that
                1 is mapped to Pi(1), 2 is mapped to Pi(2), ..., M is mapped to Pi(M).
            2.  Then, we construct the corresponding M x M permutation matrix Perm by:
                Perm[i, j] = 1 if j == Pi(i) else 0
            3.  if we have a N x M matrix A, and we derive B = A @ Perm, then
                the [i]th column of A would become the [Pi(i)]th column of B
            4.  if we have a M x N matrix C, and we derive D = perm^T @ C, then
                the [i]th row of C would become the [Pi(i)]th row of D
        
        some structural information of the returned solution [solution]:
            1.  for the [i]th layer with ni nodes, and Ni nodes before,
                solution[Ni + a][Ni + b] = 1 if a is mapped to b else 0
            2.  if we define Perm_i = solution[Ni:Ni+ni][Ni:Ni+ni], then Perm_i is the 
                permutation matrix corresponding to the permutation function Pi, where
                the [i]th node in model 1 is mapped to [Pi(i)]th node in model 2
        
        the procedure to permutate the parameters:
            1.  given the permutation matrix upon layer i, permutate the columns of parameters
                between layer i and layer i+1
            2.  given the permutation matrix upon layer i, permutate the rows of parameters
                between layer i-1 and layer i
        '''
        # NOTE(review): the permutation is fixed to the identity here instead of being read
        #   from [solution]; with an identity matrix the check below never clears
        #   [perm_is_complete] -- confirm whether the commented-out line should be restored
        # perm = solution[num_before:num_before+num_cur, num_before:num_before+num_cur]
        perm = torch.diag( torch.ones( num_cur ) )
        if torch.sum( perm ).item() != perm.shape[0]:
            perm_is_complete = False
        '''
        permutate the rows of parameters between previous layer and current layer
        if the current layer is convolutional:
            1.  permute the aligned weight by: 2-->0, 3-->1, 0-->2, 1-->3
            2.  multiply with the transpose of permutation matrix
            3.  restore the permutation
        else:
            directly multiply with the permutation matrix
        for detailed explanation for the operator '@', or __matmul__, or infix
            multiplication between matrices, see the link:
            https://www.python.org/dev/peps/pep-0465/
        '''
        # all products below are computed in float64, so the aligned tensors are float64
        assert 'bias' not in named_weight_list_0[idx][0]
        if len( named_weight_list_0[idx][1].shape ) == 4:
            aligned_wt_0[idx] = (perm.transpose(0,1).to(torch.float64) @ \
                aligned_wt_0[idx].to(torch.float64).permute(2,3,0,1)) \
                .permute(2,3,0,1)
        else:
            aligned_wt_0[idx] = perm.transpose(0,1).to(torch.float64) @ aligned_wt_0[idx].to(torch.float64)
        idx += 1
        '''
        if the bias layer is present, then permuate the bias layer
        '''
        if idx >= num_layers:
            continue
        if 'bias' in named_weight_list_0[idx][0]:
            aligned_wt_0[idx] = aligned_wt_0[idx].to(torch.float64) @ perm.to(torch.float64)
            idx += 1
        '''
        permutate the columns of parameters between current layer and the next layer
        if the previous layer is convolutional and the current layer is fully-connected:
            1.  reshape the aligned weight to 
                [cur_num] x [pre_num / kernel_size_squared] x [kernel_size_squared]
            2.  permute the aligned weight so that dim 1 and dim 2 are switched
            3.  multiply the permutation matrix
            4.  permute the aligned weight so that dim 1 and dim 2 are restored
            5.  restore the shape of the aligned weight back to
                [cur_num] x [pre_num]
        else:
            directly multiply the permutation matrix
        '''
        # [idx] is not advanced here: the same parameter gets its rows permuted at the
        #   start of the next iteration
        if idx >= num_layers:
            continue
        # [pre_conv_out_channel] still holds the value left over from the affinity loop above
        if pre_conv and len( named_weight_list_0[idx][1].shape ) == 2:
            aligned_wt_0[idx] = ( aligned_wt_0[idx].to(torch.float64) \
                .reshape( aligned_wt_0[idx].shape[0], pre_conv_out_channel, -1 ) \
                .permute( 0, 2, 1 ) \
                @ perm.to(torch.float64) ) \
                .permute( 0, 2, 1 ) \
                .reshape( aligned_wt_0[idx].shape[0], -1 )
        elif len( named_weight_list_0[idx][1].shape ) == 4:
            aligned_wt_0[idx] = ( aligned_wt_0[idx].to(torch.float64) \
                .permute( 2, 3, 0, 1 ) \
                @ perm.to(torch.float64) ) \
                .permute( 2, 3, 0, 1 )
        else:
            aligned_wt_0[idx] = aligned_wt_0[idx].to(torch.float64) @ perm.to(torch.float64)
    # every parameter of networks[0] must have been visited by the loop above
    assert idx == num_layers

    # debug block begin
    # for aligned_wt, (name, parameter) in zip( aligned_wt_0, networks[0].named_parameters() ):
    #     print( f'*the original weights named "{name}" are \n{parameter}\n*and the aligned \
    #         weights are \n{aligned_wt}' )
    # debug block end
    '''
    average the parameters of model 1 and model 2 according to the weights given by [args.ensemble_step, 1-args.ensemble_step], 
        then store the results in a list, and return the list
    '''
    # the aligned parameters of networks[0] are weighted by (1 - args.ensemble_step),
    #   the parameters of networks[1] by args.ensemble_step
    averaged_weights = []
    for idx, parameter in enumerate( networks[1].parameters() ):
        averaged_weights.append( (1 - args.ensemble_step) * aligned_wt_0[idx] + args.ensemble_step * parameter )
    return averaged_weights, perm_is_complete


def get_fused_model( args, networks:list ):
    '''
    fuse the networks in [networks] and load the fused parameters into a new model

    parameters:
        args:       forwarded to graph_matching_fusion and to model.get_model_from_name
        networks:   list of networks, forwarded unchanged to graph_matching_fusion

    returns:
        a tuple ( fused_model, perm_is_complete ), where [fused_model] is the model
            created by model.get_model_from_name( args ) carrying the fused parameters
            and [perm_is_complete] is the flag returned by graph_matching_fusion

    raises:
        ValueError if the number of fused tensors differs from the number of
            parameters of the newly created model
    '''
    parameters, perm_is_complete = graph_matching_fusion( args, networks )
    fused_model = model.get_model_from_name( args )
    state_dict = fused_model.state_dict()
    # graph_matching_fusion returns one tensor per entry of networks[1].parameters(),
    #   so the tensors are paired with the parameter names of the new model; pairing
    #   them with state_dict positions would misalign as soon as the model has
    #   buffers, because buffers are part of state_dict() but not of parameters()
    parameter_names = [name for name, _ in fused_model.named_parameters()]
    if len( parameter_names ) != len( parameters ):
        raise ValueError(
            f'fused model has {len( parameter_names )} parameters, '
            f'but {len( parameters )} fused tensors were produced' )
    for key, fused_parameter in zip( parameter_names, parameters ):
        state_dict[key] = fused_parameter
    fused_model.load_state_dict( state_dict )
    return fused_model, perm_is_complete


if __name__ == "__main__":
    import torch.nn as nn
    import torch.nn.functional as F

    class dotdict(dict):
        """ attribute-style access to the entries of a dictionary """
        __getattr__ = dict.get
        __setattr__ = dict.__setitem__
        __delattr__ = dict.__delitem__

    def print_model( net:nn.Module ):
        # print every named parameter of [net]
        for name, parameter in net.named_parameters():
            print( f'name is {name},\t parameter is \n\t{parameter}' )

    # settings consumed by the fusion functions and by the model factory
    args = dotdict(
        weight=[0.5, 0.5],
        model_name="naivenet",
        dataset="mnist",
        disable_bias=False,
        width_ratio=1,
        num_hidden_nodes1=20,
        num_hidden_nodes2=30,
        num_hidden_nodes3=10,
        ensemble_step=0.5,
    )

    # two very naive fully-connected networks with hand-written parameters
    mlp_a, mlp_b = model.naive_net(), model.naive_net()
    mlp_weights_a = dict( [
        ( 'lin1.weight', torch.tensor([[1,2], [7,8], [4,5]]) ),
        ( 'lin1.bias',   torch.tensor([5,6,7]) ),
        ( 'lin2.weight', torch.tensor([[1,2,3], [7,8,9]]) ),
        ( 'lin2.bias',   torch.tensor([4,5]) ),
    ] )
    mlp_weights_b = dict( [
        ( 'lin1.weight', torch.tensor([[2,1], [4,4], [7,7]]) ),
        ( 'lin1.bias',   torch.tensor([4,8,6]) ),
        ( 'lin2.weight', torch.tensor([[8,7,9], [2,1,3]]) ),
        ( 'lin2.bias',   torch.tensor([6,3]) ),
    ] )
    mlp_a.load_state_dict( mlp_weights_a )
    mlp_b.load_state_dict( mlp_weights_b )
    # to inspect the fully-connected case, print the networks with print_model( ... )
    #   or print get_fused_model( args, [mlp_a, mlp_b] )

    print( '------------------- gm-based with perm = diagonal -------------------' )
    # two simple convolutional networks with hand-written parameters
    args.model_name = 'naivecnn'
    cnn_a, cnn_b = model.naive_cnn(), model.naive_cnn()
    cnn_weights_a = dict( [
        ( 'conv1.weight', torch.tensor([[ [[1,2],[3,4]] ], [ [[5,6],[7,8]] ]]) ),
        ( 'conv1.bias',   torch.tensor([5, 6]) ),
        ( 'fc1.weight',   torch.tensor([[1,2,3,4,5,6,7,8], [8,7,6,5,4,3,2,1]]) ),
        ( 'fc1.bias',     torch.tensor([1,2]) ),
    ] )
    cnn_weights_b = dict( [
        ( 'conv1.weight', torch.tensor([[ [[5,6],[8,7]] ], [ [[2,1],[3,4]] ]]) ),
        ( 'conv1.bias',   torch.tensor([7, 4]) ),
        ( 'fc1.weight',   torch.tensor([[3,4,1,2,7,8,5,6], [5,7,6,8,1,3,2,4]]) ),
        ( 'fc1.bias',     torch.tensor([3,1]) ),
    ] )
    cnn_a.load_state_dict( cnn_weights_a )
    cnn_b.load_state_dict( cnn_weights_b )

    # fuse the two convolutional networks and show the parameters of the result
    gm_fused, _ = get_fused_model( args, [cnn_a, cnn_b] )
    print_model( gm_fused )

    print( '------------------- naive fusion -------------------' )
    # reference result: plain element-wise mean of the two parameter sets
    naive_fused = model.naive_cnn()
    naive_weights = {
        key: ( cnn_weights_a[key] + cnn_weights_b[key] ) / 2
        for key in cnn_weights_a
    }
    naive_fused.load_state_dict( naive_weights )
    print_model( naive_fused )