Commit
Before code clean up
arastuie committed Jun 11, 2020
1 parent 8ae5df1 commit f56bbcf
Showing 11 changed files with 640 additions and 33 deletions.
31 changes: 21 additions & 10 deletions bhm_model_fitting.py
@@ -105,22 +105,33 @@ def fit_and_eval_block_hawkes(train_tuple, test_tuple, combined_tuple, nodes_not

# Running Block Hawkes model on Facebook, Enron, Reality Mining, and simulated data
if __name__ == "__main__":
# Entire Facebook Dataset
print("Entire Facebook wall-post dataset")
fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8)
fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
local_search_max_iter=500, local_search_n_cores=25,
k_values_to_test=[1],
plot_fitted_hist=False, verbose=False)
pass
# # Entire Facebook Dataset
# print("Entire Facebook wall-post dataset 2")
# fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
# dataset_utils.load_facebook_wall_2(timestamp_max=1000, largest_connected_component_only=True,
# train_percentage=0.8, remove_nodes_not_in_train=True)
# fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
# local_search_max_iter=500, local_search_n_cores=25,
# k_values_to_test=[1],
# plot_fitted_hist=False, verbose=False)

# # Entire Facebook Dataset
# print("Facebook wall-post dataset")
# fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
# dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8)
# fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
# local_search_max_iter=500, local_search_n_cores=25,
# k_values_to_test=[1],
# plot_fitted_hist=False, verbose=False)

# # Facebook Dataset
# print("Facebook wall-post dataset")
# fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
-    #     dataset_utils.load_fb_train_test(remove_nodes_not_in_train=False)
+    #     dataset_utils.load_fb_train_test(remove_nodes_not_in_train=True)
# fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
# local_search_max_iter=500, local_search_n_cores=25,
-    #                           k_values_to_test=[3],
+    #                           k_values_to_test=[1, 2, 3],
# plot_fitted_hist=False, verbose=False)

# # Enron Dataset
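For context, these drivers expect a chronological train/test split of the event list. Below is a minimal sketch of the split semantics assumed here, with split_event_list_sketch as a hypothetical stand-in for dataset_utils.split_event_list_to_train_test (the real function also builds event dicts and node maps):

import numpy as np

def split_event_list_sketch(data, train_percentage=0.8):
    # data: (n, 3) array of [receiver_id, sender_id, timestamp] rows.
    # Sort chronologically and put the first train_percentage of events in train.
    data = data[data[:, 2].argsort()]
    split_idx = int(data.shape[0] * train_percentage)
    train, test = data[:split_idx], data[split_idx:]
    # Nodes that only ever appear in the test period.
    train_nodes = set(train[:, 0].astype(int)) | set(train[:, 1].astype(int))
    test_nodes = set(test[:, 0].astype(int)) | set(test[:, 1].astype(int))
    nodes_not_in_train = test_nodes - train_nodes
    return train, test, data, nodes_not_in_train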
21 changes: 15 additions & 6 deletions chip_model_fitting.py
@@ -81,6 +81,7 @@ def fit_and_eval_community_hawkes(train_tuple, test_tuple, combined_tuple, nodes
print(f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}", end=' - ')
print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")


if plot_fitted_hist:
model_utils.generate_fit_community_hawkes(train_event_dict, train_node_membership,
train_bp_mu, train_bp_alpha, train_bp_beta,
@@ -96,21 +97,29 @@

# Examples of fitting CHIP to Facebook, Enron, Reality Mining and simulated data.
if __name__ == "__main__":

-    # Entire Facebook Dataset
-    print("Facebook wall-post dataset")
+    # # Entire Facebook Dataset
+    print("Entire Facebook wall-post dataset 2")
fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
-        dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8)
+        dataset_utils.load_facebook_wall_2(timestamp_max=1000, largest_connected_component_only=True,
+                                           train_percentage=0.8, remove_nodes_not_in_train=True)
fit_and_eval_community_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
-                                  k_values_to_test=[20, 50, 100, 200, 500],
+                                  k_values_to_test=np.arange(51, 101),
plot_fitted_hist=False, verbose=False)

# # # Entire Facebook Dataset
# print("Entire Facebook wall-post dataset")
# fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
# dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8)
# fit_and_eval_community_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
# k_values_to_test=[9],
# plot_fitted_hist=False, verbose=False)

# # Facebook Dataset
# print("Facebook wall-post dataset")
# fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
# dataset_utils.load_fb_train_test(remove_nodes_not_in_train=False)
# fit_and_eval_community_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
-    #                               k_values_to_test=np.arange(1, 11),
+    #                               k_values_to_test=[6],
# plot_fitted_hist=False, verbose=False)

# # Enron Dataset
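The scan over k_values_to_test only prints one train/test log-likelihood pair per K. A minimal sketch of then picking K from such a scan, assuming the printed per-event test log-likelihoods were collected into an array (the values below are dummy placeholders; the driver itself does not return them):

import numpy as np

k_values = np.arange(51, 101)
# Hypothetical per-event test log-likelihoods, one per scanned K.
test_ll_per_event = np.random.default_rng(0).normal(-5.0, 0.1, size=k_values.size)
best_k = k_values[np.argmax(test_ll_per_event)]
print(f"Best K by held-out per-event log-likelihood: {best_k}")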
77 changes: 76 additions & 1 deletion dataset_utils.py
@@ -420,9 +420,10 @@ def load_facebook_wall(timestamp_max=1000, largest_connected_component_only=Fals
# Scale timestamps to 0 to timestamp_max
data[:, 2] = (data[:, 2] - min(data[:, 2])) / (max(data[:, 2]) - min(data[:, 2])) * timestamp_max


if train_percentage is not None:
return split_event_list_to_train_test(data, train_percentage=train_percentage)

duration = data[-1, 2]

node_set = set(data[:, 0].astype(np.int)).union(data[:, 1].astype(np.int))
@@ -441,6 +442,80 @@ def load_facebook_wall(timestamp_max=1000, largest_connected_component_only=Fals
return event_dict, len(node_set), duration


def load_facebook_wall_2(timestamp_max=1000, largest_connected_component_only=False, train_percentage=None,
plot_growth=False, remove_nodes_not_in_train=False):
"""
:param timestamp_max: The time unit of the last timestamp. Used to scale all other timestamps.
:param largest_connected_component_only: if True, only the largest connected component will be loaded.
:param train_percentage: If None, returns the entire dataset as a single dataset, else returns a train/test/combined
dataset based on the train_percentage.
"""

file_path = '/shared/DataSets/FacebookViswanath2009/raw/facebook-wall.txt'

# receiver_id sender_id unix_timestamp
data = np.loadtxt(file_path, np.float)

    # sort by time, scale timestamps to [0, timestamp_max], then keep only events in the [500, 900] window
    # (drops the first half of the data and the last 10% of the time range)
data = data[data[:, 2].argsort()]
data[:, 2] = data[:, 2] - data[0, 2]
data[:, 2] = (data[:, 2] - min(data[:, 2])) / (max(data[:, 2]) - min(data[:, 2])) * timestamp_max

data = data[np.where(data[:, 2] >= 500)[0], :]
data = data[np.where(data[:, 2] <= 900)[0], :]

# remove self-edges
data = data[np.where(data[:, 0] != data[:, 1])[0], :]

if largest_connected_component_only:
# finding the nodes in the largest connected component
fb_net = nx.Graph()
for i in range(data.shape[0]):
fb_net.add_edge(data[i, 1], data[i, 0])

largest_cc = max(nx.connected_components(fb_net), key=len)
edge_idx_in_largest_cc = np.array([node_id in largest_cc for node_id in data[:, 0]])
data = data[edge_idx_in_largest_cc, :]

# Sorting by unix_timestamp and adjusting first timestamp to start from 0
data = data[data[:, 2].argsort()]
data[:, 2] = data[:, 2] - data[0, 2]

if timestamp_max is not None:
# Scale timestamps to 0 to timestamp_max
data[:, 2] = (data[:, 2] - min(data[:, 2])) / (max(data[:, 2]) - min(data[:, 2])) * timestamp_max

duration = data[-1, 2]

if plot_growth:
cum_event_count = [np.sum(data[:, 2] < t) for t in range(int(duration) + 1)]
plt.plot(np.arange(int(duration) + 1), cum_event_count)
plt.ylabel('Cumulative Event Count')
plt.xlabel('Duration')
# plt.savefig(f"/shared/Results/CommunityHawkes/pickles/full_fb_fit/plots/fb_growth.pdf")
plt.show()

if train_percentage is not None:
return split_event_list_to_train_test(data, train_percentage=train_percentage,
remove_nodes_not_in_train=remove_nodes_not_in_train)


node_set = set(data[:, 0].astype(np.int)).union(data[:, 1].astype(np.int))
node_id_map = get_node_map(node_set)

event_dict = {}
for i in range(data.shape[0]):
receiver_id = node_id_map[np.int(data[i, 0])]
sender_id = node_id_map[np.int(data[i, 1])]

if (sender_id, receiver_id) not in event_dict:
event_dict[(sender_id, receiver_id)] = []

event_dict[(sender_id, receiver_id)].append(data[i, 2])

return event_dict, len(node_set), duration


# Various examples of loading datasets
if __name__ == '__main__':

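A hypothetical usage sketch of the new loader, assuming the hard-coded facebook-wall.txt path is available. The returned event_dict maps each directed (sender_id, receiver_id) pair to the list of scaled timestamps of wall posts on that edge:

import dataset_utils

event_dict, num_nodes, duration = dataset_utils.load_facebook_wall_2(
    timestamp_max=1000, largest_connected_component_only=True)

# Inspect one edge and its event times.
(sender_id, receiver_id), timestamps = next(iter(event_dict.items()))
print(num_nodes, duration, len(timestamps))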
39 changes: 35 additions & 4 deletions full_fb_wall_post_model_fitting.py
@@ -26,6 +26,7 @@
plot_hawkes_params = False
plot_node_membership = False
plot_num_events = False
plot_community_structure = True
simulate_chip = False
get_confidence_intervals = False
verbose = False
@@ -199,9 +200,9 @@
labels = np.arange(1, num_classes + 1)
im, _ = heatmap(hawkes_params[param], labels, labels, ax=ax, cmap="Greys", cbarlabel=cbar_label[param])

-        # ax.set_title(f"Full Facebook wall-posts {param.capitalize()}")
+        ax.set_title(f"Full Facebook wall-posts {param.capitalize()}")
fig.tight_layout()
-        plt.savefig(f"{result_file_path}/plots/{param}-k-{num_classes}.pdf")
+        # plt.savefig(f"{result_file_path}/plots/{param}-k-{num_classes}.pdf")
plt.show()

# plot m
@@ -332,5 +333,35 @@
# Total computation time: 168.6s




if plot_community_structure:
# adj = utils.event_dict_to_adjacency(fb_num_node, fb_event_dict, dtype=np.int)
num_nodes = len(node_membership)
community_membership = utils.node_membership_to_community_membership(node_membership, num_classes)
community_size = [len(community) for community in community_membership]
node_ids = np.concatenate(community_membership)
sorting_map = {}
for i in range(node_ids.shape[0]):
sorting_map[node_ids[i]] = i

sorted_adj = np.zeros((num_nodes, num_nodes), dtype=np.int)

for (u, v), event_times in fb_event_dict.items():
if len(event_times) != 0:
sorted_adj[sorting_map[u], sorting_map[v]] = 1

    # Plot adjacency matrix in toned-down black and white
plt.spy(sorted_adj, marker='.', markersize=0.1, precision=0)
cumulative_community_size = 0
for com_size in community_size:
cumulative_community_size += com_size
plt.axhline(cumulative_community_size, color='black', linewidth=1)
plt.axvline(cumulative_community_size, color='black', linewidth=1)

# plt.xticks(rotation=45)
ticks = np.arange(0, num_nodes, 5000)
plt.yticks(ticks, [f'{int(t / 1000)}{"K" if t >= 1000 else ""}' for t in ticks], fontsize=13)
plt.xticks(ticks, [f'{int(t / 1000)}{"K" if t >= 1000 else ""}' for t in ticks], fontsize=13)
plt.tight_layout()
# plt.show()
plt.savefig(f"{result_file_path}/plots/community-structure-k-{num_classes}.png", format='png', dpi=200)
# plt.savefig(f"{result_file_path}/plots/community-structure-k-{num_classes}.pdf", format='pdf')
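The block above relies on utils.node_membership_to_community_membership to group node ids by community before permuting the adjacency matrix. A minimal sketch of what that helper is assumed to do (the actual implementation in utils.py may differ):

import numpy as np

def node_membership_to_community_membership_sketch(node_membership, num_classes):
    # node_membership[i] is the community label of node i; return, per
    # community, the array of node ids assigned to it.
    node_membership = np.asarray(node_membership)
    return [np.where(node_membership == c)[0] for c in range(num_classes)]

Concatenating these per-community arrays yields the node_ids ordering used above to build sorting_map and the block-sorted adjacency matrix.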
1 change: 0 additions & 1 deletion full_fb_wall_post_prediction.py
@@ -48,7 +48,6 @@
print("Train: ", "Num Nodes:", train_num_nodes, "Duration:", train_duration, "Num Edges:", train_num_events)
print("Test: ", "Num Nodes:", test_num_nodes, "Duration:", test_duration, "Num Edges:", test_num_events)


# fit Facebook Wall-posts
if fit_chip:
tic = time.time()