Update end to end sim with log N values and add both known and estima…

…ted comm
IdeasLabUT · Jun 1, 2020 · dcf92ee · dcf92ee
1 parent cdbe8ab
commit dcf92ee
Showing 1 changed file with 152 additions and 160 deletions.
diff --git a/sim_end_to_end_count_based_estimates.py b/sim_end_to_end_count_based_estimates.py
@@ -7,23 +7,28 @@
 @author: Makan Arastuie
 """
 
+import os
+import copy
 import pickle
 import numpy as np
+from os.path import join
+from pathlib import Path
 import matplotlib.pyplot as plt
 from joblib import Parallel, delayed
 import generative_model_utils as utils
 import model_fitting_utils as model_utils
 from sklearn.metrics import adjusted_rand_score
 from spectral_clustering import spectral_cluster
 from parameter_estimation import estimate_hawkes_from_counts
+from sklearn.linear_model import LinearRegression
 
+result_file_path = join(os.sep, '/shared', 'Results', 'CommunityHawkes', 'pickles',
+                        'end_to_end_count_based_estimate_2', '7_values')
+Path(join(result_file_path, 'plots')).mkdir(mode=0o777, parents=True, exist_ok=True)
 
-result_file_path = '/shared/Results/CommunityHawkes/pickles/end_to_end_count_based_estimate'
-
-estimate_alpha_beta = True
-plot_only = False
-
-no_alpha_name = "_no_alpha" if not estimate_alpha_beta else ""
+run_analysis = True
+run_plotting = True
+run_regression = True
 
 # sim params
 end_time = 100
@@ -37,9 +42,11 @@
 beta_off_diag = 0.95
 
 class_probs = [0.25, 0.25, 0.25, 0.25]
-num_nodes_to_test = [8, 16, 32, 64, 128, 256]
+# num_nodes_to_test = [8, 16, 32, 64, 128, 256]
 # num_nodes_to_test = [16, 32, 64, 128, 256, 512]
-num_simulations = 100
+num_nodes_to_test = np.logspace(4, 7, num=7, dtype=np.int32, base=2)
+# num_nodes_to_test = np.logspace(4, 5, num=4, dtype=np.int32, base=2)
+num_simulations = 6
 n_cores = 6
 n_classes = len(class_probs)
 
@@ -64,20 +71,18 @@ def calc_mean_and_error_of_count_estiamte(n_nodes):
     sc_rand = adjusted_rand_score(true_node_membership, node_membership)
     sc_rand = np.zeros((n_classes, n_classes)) + sc_rand  # match the shape of other params to retrieve easily
 
-    if estimate_alpha_beta:
-        bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(event_dict,
-                                                                                              node_membership,
-                                                                                              end_time,
-                                                                                              n_classes)
-        return bp_mu, bp_alpha_beta_ratio, bp_alpha, bp_beta, sc_rand
-
-    agg_adj = utils.event_dict_to_aggregated_adjacency(n_nodes, event_dict)
-    bp_mu, bp_alpha_beta_ratio = estimate_hawkes_from_counts(agg_adj, node_membership, end_time, 1e-10 / end_time)
+    # param estimation with estimated communities
+    bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(event_dict,
+                                                                                          node_membership,
+                                                                                          end_time,
+                                                                                          n_classes)
+    # param estimation with known communities. k_ is for known_
+    k_bp_mu, k_bp_alpha, k_bp_beta, k_bp_alpha_beta_ratio = model_utils.estimate_bp_hawkes_params(event_dict,
+                                                                                                  true_node_membership,
+                                                                                                  end_time,
+                                                                                                  n_classes)
+    return bp_mu, bp_alpha_beta_ratio, bp_alpha, bp_beta, sc_rand, k_bp_mu, k_bp_alpha_beta_ratio, k_bp_alpha, k_bp_beta
 
-    return bp_mu, bp_alpha_beta_ratio, sc_rand
-
-
-no_alpha_name = "_no_alpha" if not estimate_alpha_beta else ""
 
 true_mu = np.zeros((n_classes, n_classes)) + mu_off_diag
 np.fill_diagonal(true_mu, mu_diag)
@@ -90,18 +95,25 @@ def calc_mean_and_error_of_count_estiamte(n_nodes):
 
 true_ratio = true_alpha / true_beta
 
-rand_mean = []
-rand_mean_err = []
-mu_mse = []
-mu_mse_err = []
-ratio_mse = []
-ratio_mse_err = []
-
-alpha_mse = []
-alpha_mse_err = []
-beta_mse = []
-beta_mse_err = []
-if not plot_only:
+params = {
+    'mu': ['mu_mse', 'mu_mse_err'],
+    'm': ['ratio_mse', 'ratio_mse_err'],
+    'alpha': ['alpha_mse', 'alpha_mse_err'],
+    'beta': ['beta_mse', 'beta_mse_err']
+}
+
+# known_communities_error
+kce = {}
+for param, err in params.items():
+    kce[err[0]] = []
+    kce[err[1]] = []
+
+# estimated_communities_error
+ece = copy.deepcopy(kce)
+ece['rand_mean'] = []
+ece['rand_mean_err'] = []
+
+if run_analysis:
     for j, n_nodes in enumerate(num_nodes_to_test):
         results = Parallel(n_jobs=n_cores)(delayed(calc_mean_and_error_of_count_estiamte)
                                            (n_nodes) for i in range(num_simulations))
@@ -110,146 +122,126 @@ def calc_mean_and_error_of_count_estiamte(n_nodes):
 
         results = np.asarray(results, dtype=np.float)
 
-        if estimate_alpha_beta:
-            results = np.reshape(results, (num_simulations, 5, n_classes, n_classes))
-        else:
-            results = np.reshape(results, (num_simulations, 3, n_classes, n_classes))
+        results = np.reshape(results, (num_simulations, 9, n_classes, n_classes))
 
+        # estimated communities
         mu_mse_temp = np.power(results[:, 0, :, :] - true_mu, 2)
-        mu_mse.append(np.mean(mu_mse_temp))
-        mu_mse_err.append(2 * np.std(mu_mse_temp) / np.sqrt(len(mu_mse_temp)))
+        ece['mu_mse'].append(np.mean(mu_mse_temp))
+        ece['mu_mse_err'].append(2 * np.std(mu_mse_temp) / np.sqrt(mu_mse_temp.size))
 
         ratio_mse_temp = np.power(results[:, 1, :, :] - true_ratio, 2)
-        ratio_mse.append(np.mean(ratio_mse_temp))
-        ratio_mse_err.append(2 * np.std(ratio_mse_temp) / np.sqrt(ratio_mse_temp.size))
+        ece['ratio_mse'].append(np.mean(ratio_mse_temp))
+        ece['ratio_mse_err'].append(2 * np.std(ratio_mse_temp) / np.sqrt(ratio_mse_temp.size))
+
+        alpha_mse_temp = np.power(results[:, 2, :, :] - true_alpha, 2)
+        ece['alpha_mse'].append(np.mean(alpha_mse_temp))
+        ece['alpha_mse_err'].append(2 * np.std(alpha_mse_temp) / np.sqrt(alpha_mse_temp.size))
+
+        beta_mse_temp = np.power(results[:, 3, :, :] - true_beta, 2)
+        ece['beta_mse'].append(np.mean(beta_mse_temp))
+        ece['beta_mse_err'].append(2 * np.std(beta_mse_temp) / np.sqrt(beta_mse_temp.size))
 
         rand_mean_temp = np.mean(results[:, 4, 0, 0])
-        rand_mean.append(rand_mean_temp)
-        rand_mean_err.append(2 * np.std(results[:, 4, 0, 0]) / np.sqrt(results[:, 4, 0, 0].size))
-
-        if estimate_alpha_beta:
-            alpha_mse_temp = np.power(results[:, 2, :, :] - true_alpha, 2)
-            alpha_mse.append(np.mean(alpha_mse_temp))
-            alpha_mse_err.append(2 * np.std(alpha_mse_temp) / np.sqrt(alpha_mse_temp.size))
-
-            beta_mse_temp = np.power(results[:, 3, :, :] - true_beta, 2)
-            beta_mse.append(np.mean(beta_mse_temp))
-            beta_mse_err.append(2 * np.std(beta_mse_temp) / np.sqrt(beta_mse_temp.size))
-
-    if estimate_alpha_beta:
-        with open(f'{result_file_path}/mses.pckl', 'wb') as handle:
-            pickle.dump([rand_mean, rand_mean_err, mu_mse, mu_mse_err, ratio_mse, ratio_mse_err,
-                         alpha_mse, alpha_mse_err, beta_mse, beta_mse_err], handle, protocol=pickle.HIGHEST_PROTOCOL)
-    else:
-        with open(f'{result_file_path}/mses_no_alpha.pckl', 'wb') as handle:
-            pickle.dump([rand_mean, rand_mean_err, mu_mse, mu_mse_err,
-                         ratio_mse, ratio_mse_err], handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-
-if estimate_alpha_beta:
-    with open(f'{result_file_path}/mses.pckl', 'rb') as handle:
-        [rand_mean, rand_mean_err, mu_mse, mu_mse_err, ratio_mse, ratio_mse_err,
-         alpha_mse, alpha_mse_err, beta_mse, beta_mse_err] = pickle.load(handle)
-else:
-    with open(f'{result_file_path}/mses_no_alpha.pckl', 'rb') as handle:
-        rand_mean, rand_mean_err, mu_mse, mu_mse_err, ratio_mse, ratio_mse_err = pickle.load(handle)
-
-
-print("Rand Mean:")
-print(rand_mean)
-
-print("Mu MSE:")
-print(mu_mse)
-
-print("\nRatio MSE:")
-print(ratio_mse)
-
-if estimate_alpha_beta:
-    print("\nAlpha MSE:")
-    print(alpha_mse)
-
-    print("\nBeta MSE:")
-    print(beta_mse)
-
-
-# plt.title("MSE of Mu estimate using count matrix")
-plt.ion()
-plt.subplots(figsize=(3.8, 3))
-plt.bar(range(len(num_nodes_to_test)), mu_mse, yerr=mu_mse_err, log=True)
-plt.xlabel("Number of Nodes", fontsize=16)
-plt.ylabel("Mean-squared Error", fontsize=16)
-plt.xticks(range(len(num_nodes_to_test)), num_nodes_to_test)
-plt.tick_params(labelsize=12)
-plt.tight_layout()
-#plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0), fontsize=16)
-#plt.autoscale()
-plt.savefig(f"{result_file_path}/plots/consistent_mu_mse.pdf")
-#plt.show()
-
-#plt.clf()
-
-# plt.title("MSE of m estimate using count matrix")
-plt.subplots(figsize=(3.8, 3))
-plt.bar(range(len(num_nodes_to_test)), ratio_mse, yerr=ratio_mse_err, log=True)
-plt.xlabel("Number of Nodes", fontsize=16)
-plt.ylabel("Mean-squared Error", fontsize=16)
-plt.xticks(range(len(num_nodes_to_test)), num_nodes_to_test)
-plt.tick_params(labelsize=12)
-plt.tight_layout()
-#plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0), fontsize=16)
-#plt.autoscale()
-plt.savefig(f"{result_file_path}/plots/consistent_m_mse.pdf")
-#plt.show()
-
-#plt.clf()
-
-
-plt.subplots(figsize=(3.8, 3))
-plt.bar(range(len(num_nodes_to_test)), rand_mean, yerr=rand_mean_err)
-# plt.title("MSE of beta estimate using count matrix")
-plt.xlabel("Number of Nodes", fontsize=16)
-plt.ylabel("Mean Adjusted Rand Score", fontsize=16)
-
-plt.xticks(range(len(num_nodes_to_test)), num_nodes_to_test)
-plt.tick_params(labelsize=12)
-plt.tight_layout()
-# plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0), fontsize=16)
-
-plt.savefig(f"{result_file_path}/plots/consistent_rand_mean.pdf")
-# plt.show()
-
-
-if estimate_alpha_beta:
-    plt.subplots(figsize=(3.8, 3))
-    plt.bar(range(len(num_nodes_to_test)), alpha_mse, yerr=alpha_mse_err, log=True)
-    # plt.title("MSE of alpha estimate using count matrix")
-    plt.xlabel("Number of Nodes", fontsize=16)
-    plt.ylabel("Mean-squared Error", fontsize=16)
+        ece['rand_mean'].append(rand_mean_temp)
+        ece['rand_mean_err'].append(2 * np.std(results[:, 4, 0, 0]) / np.sqrt(results[:, 4, 0, 0].size))
 
-    plt.xticks(range(len(num_nodes_to_test)), num_nodes_to_test)
-    plt.tick_params(labelsize=12)
-    plt.tight_layout()
-    #plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0), fontsize=16)
+        # known communities
+        mu_mse_temp = np.power(results[:, 5, :, :] - true_mu, 2)
+        kce['mu_mse'].append(np.mean(mu_mse_temp))
+        kce['mu_mse_err'].append(2 * np.std(mu_mse_temp) / np.sqrt(mu_mse_temp.size))
 
-    plt.savefig(f"{result_file_path}/plots/consistent_alpha_mse.pdf")
-    #plt.show()
+        ratio_mse_temp = np.power(results[:, 6, :, :] - true_ratio, 2)
+        kce['ratio_mse'].append(np.mean(ratio_mse_temp))
+        kce['ratio_mse_err'].append(2 * np.std(ratio_mse_temp) / np.sqrt(ratio_mse_temp.size))
 
-    #plt.clf()
+        alpha_mse_temp = np.power(results[:, 7, :, :] - true_alpha, 2)
+        kce['alpha_mse'].append(np.mean(alpha_mse_temp))
+        kce['alpha_mse_err'].append(2 * np.std(alpha_mse_temp) / np.sqrt(alpha_mse_temp.size))
 
+        beta_mse_temp = np.power(results[:, 8, :, :] - true_beta, 2)
+        kce['beta_mse'].append(np.mean(beta_mse_temp))
+        kce['beta_mse_err'].append(2 * np.std(beta_mse_temp) / np.sqrt(beta_mse_temp.size))
+
+    with open(f'{result_file_path}/mses.pckl', 'wb') as handle:
+        pickle.dump([ece, kce], handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+with open(f'{result_file_path}/mses.pckl', 'rb') as handle:
+    ece, kce = pickle.load(handle)
+
+
+print("Estimated communities:")
+print('Rand mean:')
+print(ece['rand_mean'])
+for param, err in params.items():
+    print(param, "MSE:")
+    print(ece[err[0]])
+
+print("\nKnown communities:")
+for param, err in params.items():
+    print(param, "MSE:")
+    print(kce[err[0]])
+
+
+if run_plotting:
+    # rand index for estimated communities
+    plt.ion()
     plt.subplots(figsize=(3.8, 3))
-    plt.bar(range(len(num_nodes_to_test)), beta_mse, yerr=beta_mse_err, log=True)
-    # plt.title("MSE of beta estimate using count matrix")
+    plt.bar(range(len(num_nodes_to_test)), ece['rand_mean'], yerr=ece['rand_mean_err'], log=True)
     plt.xlabel("Number of Nodes", fontsize=16)
     plt.ylabel("Mean-squared Error", fontsize=16)
-
     plt.xticks(range(len(num_nodes_to_test)), num_nodes_to_test)
     plt.tick_params(labelsize=12)
     plt.tight_layout()
-    #plt.ticklabel_format(axis='y', style='sci', scilimits=(0, 0), fontsize=16)
-
-    plt.savefig(f"{result_file_path}/plots/consistent_beta_mse.pdf")
-    #plt.show()
-
-
-
-# [0.6320759231955095, 0.9677787392115078, 1.0, 1.0, 1.0, 1.0]
+    plt.savefig(f"{result_file_path}/plots/known_consistent_rand_mean.pdf")
+
+    for param, err in params.items():
+        # estimated communities
+        mse, mse_err = err[0], err[1]
+        plt.ion()
+        plt.subplots(figsize=(3.8, 3))
+        plt.bar(range(len(num_nodes_to_test)), ece[mse], yerr=ece[mse_err], log=True)
+        plt.xlabel("Number of Nodes", fontsize=16)
+        plt.ylabel("Mean-squared Error", fontsize=16)
+        plt.xticks(range(len(num_nodes_to_test)), num_nodes_to_test)
+        plt.tick_params(labelsize=12)
+        plt.tight_layout()
+        plt.savefig(f"{result_file_path}/plots/estimated_consistent_{param}_mse.pdf")
+
+        # known communities
+        plt.ion()
+        plt.subplots(figsize=(3.8, 3))
+        plt.bar(range(len(num_nodes_to_test)), kce[mse], yerr=kce[mse_err], log=True)
+        plt.xlabel("Number of Nodes", fontsize=16)
+        plt.ylabel("Mean-squared Error", fontsize=16)
+        plt.xticks(range(len(num_nodes_to_test)), num_nodes_to_test)
+        plt.tick_params(labelsize=12)
+        plt.tight_layout()
+        plt.savefig(f"{result_file_path}/plots/known_consistent_{param}_mse.pdf")
+
+
+if run_regression:
+    print("\nRegression: \n")
+    print("Estimated Communities:\n")
+    for param, err in params.items():
+        print(param, '(estimated communities)')
+        x = np.log(num_nodes_to_test).reshape(len(num_nodes_to_test), 1)
+        y = np.log(ece[err[0]]).reshape(len(ece[err[0]]), 1)
+
+        reg = LinearRegression().fit(x, y)
+
+        print("R^2:", reg.score(x, y))
+        print("coef:", reg.coef_)
+        print('intercept:', reg.intercept_)
+        print()
+
+        print(param, '(known communities)')
+        x = np.log(num_nodes_to_test).reshape(len(num_nodes_to_test), 1)
+        y = np.log(kce[err[0]]).reshape(len(kce[err[0]]), 1)
+
+        reg = LinearRegression().fit(x, y)
+
+        print("R^2:", reg.score(x, y))
+        print("coef:", reg.coef_)
+        print('intercept:', reg.intercept_)
+        print()