diff --git a/packages/features_importance.py b/packages/features_importance.py index f4ec9e8..4b7ba7e 100644 --- a/packages/features_importance.py +++ b/packages/features_importance.py @@ -5,58 +5,58 @@ import numpy as np import matplotlib as mpl -def plot_explanations(W, S_list, nb_subgroups, c, att_names, patt_descriptions) : +def plot_explanations(W, S_list, nb_subgroups, c, att_names, patt_descriptions): - for j in range(0, nb_subgroups) : + for j in range(nb_subgroups): print(j,'------------------------------------------------') - print(patt_descriptions[S_list[j]]) + print(patt_descriptions[S_list[j]]) coefficients = W[S_list[j]].coef_ logic = coefficients > 0 coefficients_abs = np.abs(coefficients) contributions = coefficients_abs / np.sum(coefficients_abs, axis = 1).reshape(-1,1) features_importance = contributions[c] * 100 limit = 0.75 - + f_importance = features_importance[features_importance > limit] f_importance = f_importance / np.sum(f_importance) * 100 f_importance = f_importance.round(2) att_names_ = list(pd.Series(att_names[:362])[features_importance > limit]) - + f_importance_1 = f_importance[logic[c][features_importance > limit]] att_names_1 = [x for i,x in enumerate (att_names_) if logic[c][features_importance > limit][i]] - + f_importance_2 = f_importance[~logic[c][features_importance > limit]] att_names_2 = [x for i,x in enumerate (att_names_) if not logic[c][features_importance > limit][i]] - + plt.style.use('fivethirtyeight') plt.figure(figsize =(3, 4)) plt.barh(att_names_2, f_importance_2,color='#e74c3c',height=0.65) - plt.barh(att_names_1, f_importance_1,color='#1abc9c',height=0.65) + plt.barh(att_names_1, f_importance_1,color='#1abc9c',height=0.65) all_f_importance = np.concatenate((f_importance_2,f_importance_1)) - for i, v in enumerate(all_f_importance) : - plt.text(v + 0.4, i, str(v)+'%', fontsize = 9) - + for i, v in enumerate(all_f_importance): + plt.text(v + 0.4, i, f'{str(v)}%', fontsize = 9) + plt.xlabel("Features Importance",fontsize=10) plt.xticks(fontsize=10) plt.yticks(fontsize=10) plt.grid(True, linestyle='--', which='major',color='grey', alpha=0.75) - plt.savefig('FIGURES/f_'+str(j)) + plt.savefig(f'FIGURES/f_{str(j)}') plt.show() -def sort_subgroups_support(S,K) : - S_copy = S.copy() - l_best_s = [] - for i in range(0,K) : - inter = 0 - s_best = None - - for s in S_copy : - if len(s) > inter : - inter = len(s) - s_best = s - l_best_s.append(s_best) - S_copy.remove(s_best) - - return l_best_s \ No newline at end of file +def sort_subgroups_support(S,K): + S_copy = S.copy() + l_best_s = [] + for _ in range(K): + inter = 0 + s_best = None + + for s in S_copy : + if len(s) > inter : + inter = len(s) + s_best = s + l_best_s.append(s_best) + S_copy.remove(s_best) + + return l_best_s \ No newline at end of file diff --git a/packages/neighbors_generation.py b/packages/neighbors_generation.py index 40a55e0..b98192c 100644 --- a/packages/neighbors_generation.py +++ b/packages/neighbors_generation.py @@ -15,26 +15,26 @@ def cal_covn(data_num, num_size, n) : return cov_matrix -def generate_all_neighbors (data, data_compressed, n_neigh, numerical_cols, numerical_cols_compressed, categ_unique, categ_unique_compressed,n_var, model) : +def generate_all_neighbors(data, data_compressed, n_neigh, numerical_cols, numerical_cols_compressed, categ_unique, categ_unique_compressed,n_var, model): list_neighs = [] num_size = numerical_cols.size num_size_compressed = numerical_cols_compressed.size - - n = np.size(data, 0) + + n = np.size(data, 0) covn = cal_covn(data, num_size, n_var) covn_compressed = cal_covn(data_compressed, num_size_compressed, n_var) - + base = np.zeros(data.shape[1]) neighbors_base = np.random.multivariate_normal(base, covn, n_neigh) - + base_compressed = np.zeros(data_compressed.shape[1]) neighbors_base_compressed = np.random.multivariate_normal(base_compressed, covn_compressed, n_neigh) - - for i in range(0,n) : + + for i in range(n): neighbors = neighbors_base + data[i] neighbors_compressed = neighbors_base_compressed + data_compressed[i] - + # for original neighbors j = num_size for l in categ_unique : @@ -47,7 +47,7 @@ def generate_all_neighbors (data, data_compressed, n_neigh, numerical_cols, nume neighbors[:,j][neighbors[:,j] <= 0] = 0 neighbors[:,j][neighbors[:,j] >= 1] = 1 j = j + 1 - + # for compressed neighbors k = num_size_compressed for l in categ_unique_compressed : @@ -60,10 +60,10 @@ def generate_all_neighbors (data, data_compressed, n_neigh, numerical_cols, nume neighbors_compressed[:,k][neighbors_compressed[:,k] <= 0] = 0 neighbors_compressed[:,k][neighbors_compressed[:,k] >= 1] = 1 k = k + 1 - + neighbors[neighbors < 0] = 0 neighbors_compressed [neighbors_compressed < 0] = 0 target = model.predict(neighbors) list_neighs.append((neighbors_compressed, target)) - + return list_neighs \ No newline at end of file diff --git a/packages/patterns_extraction.py b/packages/patterns_extraction.py index ced6053..e2782c2 100644 --- a/packages/patterns_extraction.py +++ b/packages/patterns_extraction.py @@ -4,17 +4,14 @@ from neighbors_generation import * -def patterns (P, split_point1, split_point2, data, att_names_) : +def patterns(P, split_point1, split_point2, data, att_names_): - patt_dict = dict() - rank = 0 - for s,p in P.items() : + patt_dict = {} + for rank, (s, p) in enumerate(P.items()): - description = '' - it = 0 - d = dict () - while (it < len(p)) : + d = {} + for it in range(0, len(p), 3): a,op,v = p[it],p[it+1],p[it+2] if a not in d : d[a] = [np.min(data[:,a]) , @@ -28,35 +25,40 @@ def patterns (P, split_point1, split_point2, data, att_names_) : #update le max d[a][1] = min(v,d[a][1]) - it += 3 - print ('subrgoup',rank) - + + description = '' for att, value in d.items(): - if att < split_point1 : + if ( + att >= split_point1 + and att < split_point2 + and value[1] == 0 + or att >= split_point1 + and att >= split_point2 + and value[0] < 0.5 + ): + print(att_names_[att],"=",'0') + description += att_names_[att] + ' = ' + '0' +' \n' + elif ( + att >= split_point1 + and att < split_point2 + or att >= split_point1 + ): + print(att_names_[att],"=",'1') + description += att_names_[att] + ' = ' + '1' +' \n' + + else: print(round(value[0]*23,0),"<",att_names_[att],"<=",round(value[1]*23,0)) - description += str(round(value[0]*23,0)) + ' < ' + att_names_[att] + ' <= ' + str(round(value[1]*23,0)) +' \n' - - elif att < split_point2 : - if value[1] == 0 : - print(att_names_[att],"=",'0') - description += att_names_[att] + ' = ' + '0' +' \n' - else : - print(att_names_[att],"=",'1') - description += att_names_[att] + ' = ' + '1' +' \n' - - else : - if value [0] < 0.5 : - print(att_names_[att],"=",'0') - description += att_names_[att] + ' = ' + '0' +' \n' - - else : - print(att_names_[att],"=",'1') - description += att_names_[att] + ' = ' + '1' +' \n' - - patt_dict[s] = description + description += ( + f'{str(round(value[0]*23,0))} < ' + + att_names_[att] + + ' <= ' + + str(round(value[1] * 23, 0)) + + ' \n' + ) + + + patt_dict[s] = description print("-------------------------------------------------------------------") - rank += 1 - return patt_dict diff --git a/packages/performances.py b/packages/performances.py index 6bf7231..d36c6fb 100644 --- a/packages/performances.py +++ b/packages/performances.py @@ -15,21 +15,20 @@ def loss_sd (S,data_test,list_neigh,model, limit) : loss += calc_loss(data_neigh_s, target_neigh_s_proba, limit) return loss -def loss_global_wb (data_test,list_neigh,model, limit) : +def loss_global_wb(data_test,list_neigh,model, limit): n = np.size(data_test,0) data_neigh_O, target_neigh_O_proba = sampling_sb(data_test,np.arange(n),list_neigh,model) - global_loss = calc_loss(data_neigh_O, target_neigh_O_proba, limit) - return global_loss + return calc_loss(data_neigh_O, target_neigh_O_proba, limit) -def loss_local_models (n,list_neigh,model, limit) : +def loss_local_models(n,list_neigh,model, limit): loss = 0 - for i in range(0,n) : + for i in range(n): data_neigh_i= list_neigh[i][0] target_neigh_i_proba = list_neigh[i][1] - loss += calc_loss(data_neigh_i, target_neigh_i_proba, limit) + loss += calc_loss(data_neigh_i, target_neigh_i_proba, limit) return loss def fscore_global_wb (data_test,n,list_neigh,model,nb_classes) : @@ -43,10 +42,9 @@ def fscore_global_wb (data_test,n,list_neigh,model,nb_classes) : return (f1_score(a[:,2],b[:,2],average='weighted'), f1_score(a[:,1],b[:,1],average='weighted'), f1_score(a[:,0],b[:,0],average='weighted')) -def fscore_sd (S,data_test,list_neigh,model,nb_classes) : +def fscore_sd(S,data_test,list_neigh,model,nb_classes): - iteration = 0 - for s in S : + for iteration, s in enumerate(S): data_neigh_s, target_neigh_s_proba = sampling_sb(data_test,s,list_neigh,model) lr = Ridge(alpha = 1) model_lr = lr.fit(data_neigh_s,target_neigh_s_proba) @@ -59,15 +57,12 @@ def fscore_sd (S,data_test,list_neigh,model,nb_classes) : a = np.concatenate((a,np.argsort(target_lr, axis=1)[:,-3:])) b = np.concatenate((b,np.argsort(target_neigh_s_proba, axis=1)[:,-3:])) - iteration += 1 - return (f1_score(a[:,2],b[:,2],average='weighted'), f1_score(a[:,1],b[:,1],average='weighted'), f1_score(a[:,0],b[:,0],average='weighted')) -def fscore_local_models (data_test,n,list_neigh,model,nb_classes) : +def fscore_local_models(data_test,n,list_neigh,model,nb_classes): - iteration = 0 - for i in range(0,n) : + for iteration, i in enumerate(range(n)): data_neigh_i= list_neigh[i][0] target_neigh_i_proba = list_neigh[i][1] @@ -80,9 +75,7 @@ def fscore_local_models (data_test,n,list_neigh,model,nb_classes) : else : a = np.concatenate((a,np.argsort(target_lr, axis=1)[:,-3:])) b = np.concatenate((b,np.argsort(target_neigh_i_proba, axis=1)[:,-3:])) - - iteration += 1 - + return (f1_score(a[:,2],b[:,2],average='weighted'), f1_score(a[:,1],b[:,1],average='weighted'), f1_score(a[:,0],b[:,0],average='weighted')) def unit_vector(vector): @@ -96,7 +89,7 @@ def angle_between(v1, v2): return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)) -def similarity (W,nb_classes) : +def similarity(W,nb_classes): l = [] @@ -106,26 +99,26 @@ def similarity (W,nb_classes) : distance_matrix = np.zeros(len(l)**2).reshape(len(l),len(l)) - for i in range (0,len(l)) : + for i in range(len(l)): for j in range (i,len(l)): - for c in range (0,nb_classes) : + for c in range(nb_classes): if c == 0 : v1 = l[i][1].coef_[c] v2 = l[j][1].coef_[c] else : v1 = np.concatenate((v1,l[i][1].coef_[c]),axis=0) - v2 = np.concatenate((v2,l[j][1].coef_[c]),axis=0) + v2 = np.concatenate((v2,l[j][1].coef_[c]),axis=0) distance_matrix[i,j] = round(math.cos(angle_between(v1,v2)),2) distance_matrix[j,i] = distance_matrix[i,j] return distance_matrix -def avg_non_similar (dist,treshold) : +def avg_non_similar(dist,treshold): - nb_non_sim = 0 + nb_non_sim = 0 nb_sbgrps = np.size(dist,0) - for i in range (0, nb_sbgrps) : + for i in range(nb_sbgrps): for j in range (i+1, nb_sbgrps) : if dist[i,j] <= treshold : nb_non_sim += 1