diff --git a/bhm_model_fitting.py b/bhm_model_fitting.py
index 3e64ecd..15ecd16 100644
--- a/bhm_model_fitting.py
+++ b/bhm_model_fitting.py
@@ -105,22 +105,33 @@ def fit_and_eval_block_hawkes(train_tuple, test_tuple, combined_tuple, nodes_not
 
 # Running Block Hawkes model on Facebook, Enron, Reality Mining, and simulated data
 if __name__ == "__main__":
-    # Entire Facebook Dataset
-    print("Entire Facebook wall-post dataset")
-    fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
-        dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8)
-    fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
-                              local_search_max_iter=500, local_search_n_cores=25,
-                              k_values_to_test=[1],
-                              plot_fitted_hist=False, verbose=False)
+    pass
+    # # Entire Facebook Dataset
+    # print("Entire Facebook wall-post dataset 2")
+    # fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
+    #     dataset_utils.load_facebook_wall_2(timestamp_max=1000, largest_connected_component_only=True,
+    #                                        train_percentage=0.8, remove_nodes_not_in_train=True)
+    # fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
+    #                           local_search_max_iter=500, local_search_n_cores=25,
+    #                           k_values_to_test=[1],
+    #                           plot_fitted_hist=False, verbose=False)
+
+    # # Entire Facebook Dataset
+    # print("Facebook wall-post dataset")
+    # fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
+    #     dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8)
+    # fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
+    #                           local_search_max_iter=500, local_search_n_cores=25,
+    #                           k_values_to_test=[1],
+    #                           plot_fitted_hist=False, verbose=False)
 
     # # Facebook Dataset
     # print("Facebook wall-post dataset")
     # fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
-    #     dataset_utils.load_fb_train_test(remove_nodes_not_in_train=False)
+    #     dataset_utils.load_fb_train_test(remove_nodes_not_in_train=True)
     # fit_and_eval_block_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
     #                           local_search_max_iter=500, local_search_n_cores=25,
-    #                           k_values_to_test=[3],
+    #                           k_values_to_test=[1, 2, 3],
     #                           plot_fitted_hist=False, verbose=False)
 
     # # Enron Dataset
diff --git a/chip_model_fitting.py b/chip_model_fitting.py
index a93cc81..07174cb 100644
--- a/chip_model_fitting.py
+++ b/chip_model_fitting.py
@@ -81,6 +81,7 @@ def fit_and_eval_community_hawkes(train_tuple, test_tuple, combined_tuple, nodes
             print(f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}", end=' - ')
             print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")
 
+
         if plot_fitted_hist:
             model_utils.generate_fit_community_hawkes(train_event_dict, train_node_membership,
                                                       train_bp_mu, train_bp_alpha, train_bp_beta,
@@ -96,21 +97,29 @@ def fit_and_eval_community_hawkes(train_tuple, test_tuple, combined_tuple, nodes
 
 # Examples of fitting CHIP to Facebook, Enron, Reality Mining and simulated data.
if __name__ == "__main__": - - # Entire Facebook Dataset - print("Facebook wall-post dataset") + # # Entire Facebook Dataset + print("Entire Facebook wall-post dataset 2") fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \ - dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8) + dataset_utils.load_facebook_wall_2(timestamp_max=1000, largest_connected_component_only=True, + train_percentage=0.8, remove_nodes_not_in_train=True) fit_and_eval_community_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train, - k_values_to_test=[20, 50, 100, 200, 500], + k_values_to_test=np.arange(51, 101), plot_fitted_hist=False, verbose=False) + # # # Entire Facebook Dataset + # print("Entire Facebook wall-post dataset") + # fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \ + # dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8) + # fit_and_eval_community_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train, + # k_values_to_test=[9], + # plot_fitted_hist=False, verbose=False) + # # Facebook Dataset # print("Facebook wall-post dataset") # fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \ # dataset_utils.load_fb_train_test(remove_nodes_not_in_train=False) # fit_and_eval_community_hawkes(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train, - # k_values_to_test=np.arange(1, 11), + # k_values_to_test=[6], # plot_fitted_hist=False, verbose=False) # # Enron Dataset diff --git a/dataset_utils.py b/dataset_utils.py index 34be3c1..f57dc26 100644 --- a/dataset_utils.py +++ b/dataset_utils.py @@ -420,9 +420,10 @@ def load_facebook_wall(timestamp_max=1000, largest_connected_component_only=Fals # Scale timestamps to 0 to timestamp_max data[:, 2] = (data[:, 2] - min(data[:, 2])) / (max(data[:, 2]) - min(data[:, 2])) * timestamp_max + if train_percentage is not None: return split_event_list_to_train_test(data, train_percentage=train_percentage) - + duration = data[-1, 2] node_set = set(data[:, 0].astype(np.int)).union(data[:, 1].astype(np.int)) @@ -441,6 +442,80 @@ def load_facebook_wall(timestamp_max=1000, largest_connected_component_only=Fals return event_dict, len(node_set), duration +def load_facebook_wall_2(timestamp_max=1000, largest_connected_component_only=False, train_percentage=None, + plot_growth=False, remove_nodes_not_in_train=False): + """ + :param timestamp_max: The time unit of the last timestamp. Used to scale all other timestamps. + :param largest_connected_component_only: if True, only the largest connected component will be loaded. + :param train_percentage: If None, returns the entire dataset as a single dataset, else returns a train/test/combined + dataset based on the train_percentage. 
+ """ + + file_path = '/shared/DataSets/FacebookViswanath2009/raw/facebook-wall.txt' + + # receiver_id sender_id unix_timestamp + data = np.loadtxt(file_path, np.float) + + # remove data during the first half + data = data[data[:, 2].argsort()] + data[:, 2] = data[:, 2] - data[0, 2] + data[:, 2] = (data[:, 2] - min(data[:, 2])) / (max(data[:, 2]) - min(data[:, 2])) * timestamp_max + + data = data[np.where(data[:, 2] >= 500)[0], :] + data = data[np.where(data[:, 2] <= 900)[0], :] + + # remove self-edges + data = data[np.where(data[:, 0] != data[:, 1])[0], :] + + if largest_connected_component_only: + # finding the nodes in the largest connected component + fb_net = nx.Graph() + for i in range(data.shape[0]): + fb_net.add_edge(data[i, 1], data[i, 0]) + + largest_cc = max(nx.connected_components(fb_net), key=len) + edge_idx_in_largest_cc = np.array([node_id in largest_cc for node_id in data[:, 0]]) + data = data[edge_idx_in_largest_cc, :] + + # Sorting by unix_timestamp and adjusting first timestamp to start from 0 + data = data[data[:, 2].argsort()] + data[:, 2] = data[:, 2] - data[0, 2] + + if timestamp_max is not None: + # Scale timestamps to 0 to timestamp_max + data[:, 2] = (data[:, 2] - min(data[:, 2])) / (max(data[:, 2]) - min(data[:, 2])) * timestamp_max + + duration = data[-1, 2] + + if plot_growth: + cum_event_count = [np.sum(data[:, 2] < t) for t in range(int(duration) + 1)] + plt.plot(np.arange(int(duration) + 1), cum_event_count) + plt.ylabel('Cumulative Event Count') + plt.xlabel('Duration') + # plt.savefig(f"/shared/Results/CommunityHawkes/pickles/full_fb_fit/plots/fb_growth.pdf") + plt.show() + + if train_percentage is not None: + return split_event_list_to_train_test(data, train_percentage=train_percentage, + remove_nodes_not_in_train=remove_nodes_not_in_train) + + + node_set = set(data[:, 0].astype(np.int)).union(data[:, 1].astype(np.int)) + node_id_map = get_node_map(node_set) + + event_dict = {} + for i in range(data.shape[0]): + receiver_id = node_id_map[np.int(data[i, 0])] + sender_id = node_id_map[np.int(data[i, 1])] + + if (sender_id, receiver_id) not in event_dict: + event_dict[(sender_id, receiver_id)] = [] + + event_dict[(sender_id, receiver_id)].append(data[i, 2]) + + return event_dict, len(node_set), duration + + # Various examples of loading datasets if __name__ == '__main__': diff --git a/full_fb_wall_post_model_fitting.py b/full_fb_wall_post_model_fitting.py index 42e39d2..7b3eb14 100644 --- a/full_fb_wall_post_model_fitting.py +++ b/full_fb_wall_post_model_fitting.py @@ -26,6 +26,7 @@ plot_hawkes_params = False plot_node_membership = False plot_num_events = False +plot_community_structure = True simulate_chip = False get_confidence_intervals = False verbose = False @@ -199,9 +200,9 @@ labels = np.arange(1, num_classes + 1) im, _ = heatmap(hawkes_params[param], labels, labels, ax=ax, cmap="Greys", cbarlabel=cbar_label[param]) - # ax.set_title(f"Full Facebook wall-posts {param.capitalize()}") + ax.set_title(f"Full Facebook wall-posts {param.capitalize()}") fig.tight_layout() - plt.savefig(f"{result_file_path}/plots/{param}-k-{num_classes}.pdf") + # plt.savefig(f"{result_file_path}/plots/{param}-k-{num_classes}.pdf") plt.show() # plot m @@ -332,5 +333,35 @@ # Total computation time: 168.6s - - +if plot_community_structure: + # adj = utils.event_dict_to_adjacency(fb_num_node, fb_event_dict, dtype=np.int) + num_nodes = len(node_membership) + community_membership = utils.node_membership_to_community_membership(node_membership, num_classes) + community_size = 
diff --git a/full_fb_wall_post_model_fitting.py b/full_fb_wall_post_model_fitting.py
index 42e39d2..7b3eb14 100644
--- a/full_fb_wall_post_model_fitting.py
+++ b/full_fb_wall_post_model_fitting.py
@@ -26,6 +26,7 @@
 plot_hawkes_params = False
 plot_node_membership = False
 plot_num_events = False
+plot_community_structure = True
 simulate_chip = False
 get_confidence_intervals = False
 verbose = False
@@ -199,9 +200,9 @@
         labels = np.arange(1, num_classes + 1)
         im, _ = heatmap(hawkes_params[param], labels, labels, ax=ax, cmap="Greys", cbarlabel=cbar_label[param])
 
-        # ax.set_title(f"Full Facebook wall-posts {param.capitalize()}")
+        ax.set_title(f"Full Facebook wall-posts {param.capitalize()}")
         fig.tight_layout()
-        plt.savefig(f"{result_file_path}/plots/{param}-k-{num_classes}.pdf")
+        # plt.savefig(f"{result_file_path}/plots/{param}-k-{num_classes}.pdf")
         plt.show()
 
     # plot m
@@ -332,5 +333,35 @@
 
 # Total computation time: 168.6s
 
-
-
+if plot_community_structure:
+    # adj = utils.event_dict_to_adjacency(fb_num_node, fb_event_dict, dtype=np.int)
+    num_nodes = len(node_membership)
+    community_membership = utils.node_membership_to_community_membership(node_membership, num_classes)
+    community_size = [len(community) for community in community_membership]
+    node_ids = np.concatenate(community_membership)
+    sorting_map = {}
+    for i in range(node_ids.shape[0]):
+        sorting_map[node_ids[i]] = i
+
+    sorted_adj = np.zeros((num_nodes, num_nodes), dtype=np.int)
+
+    for (u, v), event_times in fb_event_dict.items():
+        if len(event_times) != 0:
+            sorted_adj[sorting_map[u], sorting_map[v]] = 1
+
+    # Plot adjacency matrix in toned-down black and white
+    plt.spy(sorted_adj, marker='.', markersize=0.1, precision=0)
+    cumulative_community_size = 0
+    for com_size in community_size:
+        cumulative_community_size += com_size
+        plt.axhline(cumulative_community_size, color='black', linewidth=1)
+        plt.axvline(cumulative_community_size, color='black', linewidth=1)
+
+    # plt.xticks(rotation=45)
+    ticks = np.arange(0, num_nodes, 5000)
+    plt.yticks(ticks, [f'{int(t / 1000)}{"K" if t >= 1000 else ""}' for t in ticks], fontsize=13)
+    plt.xticks(ticks, [f'{int(t / 1000)}{"K" if t >= 1000 else ""}' for t in ticks], fontsize=13)
+    plt.tight_layout()
+    # plt.show()
+    plt.savefig(f"{result_file_path}/plots/community-structure-k-{num_classes}.png", format='png', dpi=200)
+    # plt.savefig(f"{result_file_path}/plots/community-structure-k-{num_classes}.pdf", format='pdf')
diff --git a/full_fb_wall_post_prediction.py b/full_fb_wall_post_prediction.py
index a6838ec..c3a3932 100644
--- a/full_fb_wall_post_prediction.py
+++ b/full_fb_wall_post_prediction.py
@@ -48,7 +48,6 @@
 print("Train: ", "Num Nodes:", train_num_nodes, "Duration:", train_duration, "Num Edges:", train_num_events)
 print("Test: ", "Num Nodes:", test_num_nodes, "Duration:", test_duration, "Num Edges:", test_num_events)
 
-
 # fit Facebook Wall-posts
 if fit_chip:
     tic = time.time()
diff --git a/full_fb_wall_post_prediction_2.py b/full_fb_wall_post_prediction_2.py
new file mode 100644
index 0000000..9ac8fbb
--- /dev/null
+++ b/full_fb_wall_post_prediction_2.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+"""
+"Exploratory Analysis of the Facebook Wall Posts Dataset using CHIP"
+
+Here we fit CHIP to the largest connected component of the Facebook wall-post dataset in a train/test setting to
+evaluate how well the fitted model predicts block-pair event counts in the test set.
+
+@author: Makan Arastuie
+"""
+
+import time
+import pickle
+import numpy as np
+import dataset_utils
+from scipy.stats import norm
+import matplotlib.pyplot as plt
+from plotting_utils import heatmap
+import generative_model_utils as utils
+import model_fitting_utils as fitting_utils
+import parameter_estimation as estimate_utils
+from spectral_clustering import spectral_cluster
+
+
+result_file_path = '/shared/Results/CommunityHawkes/pickles/full_fb_fit'
+
+fit_chip = True
+verbose = False
+get_predictions = True
+num_classes = 9
+
+tic = time.time()
+((train_event_dict, train_num_nodes, train_duration),
+ (test_event_dict, test_num_nodes, test_duration),
+ (combined_event_dict, combined_num_events, combined_duration),
+ nodes_not_in_train) = dataset_utils.load_facebook_wall_2(largest_connected_component_only=True, train_percentage=0.8,
+                                                          plot_growth=False, remove_nodes_not_in_train=True)
+
+# ((train_event_dict, train_num_nodes, train_duration),
+#  (test_event_dict, test_num_nodes, test_duration),
+#  (combined_event_dict, combined_num_events, combined_duration),
+#  nodes_not_in_train) = dataset_utils.load_facebook_wall(largest_connected_component_only=True, train_percentage=0.8)
+
+print(train_num_nodes + len(nodes_not_in_train))
+
+toc = time.time()
+print(f"Loaded the dataset in {toc - tic:.1f}s")
+
+train_num_events = utils.num_events_in_event_dict(train_event_dict)
+test_num_events = utils.num_events_in_event_dict(test_event_dict)
+
+print("Train: ", "Num Nodes:", train_num_nodes, "Duration:", train_duration, "Num Edges:", train_num_events)
+print("Test: ", "Num Nodes:", test_num_nodes, "Duration:", test_duration, "Num Edges:", test_num_events)
+print("Total: ", "Num Nodes:", train_num_nodes + len(nodes_not_in_train),
+      "Num Edges:", train_num_events + test_num_events)
+
+# fit Facebook Wall-posts
+if fit_chip:
+    tic = time.time()
+    train_agg_adj = utils.event_dict_to_aggregated_adjacency(train_num_nodes, train_event_dict)
+    toc = time.time()
+
+    if verbose:
+        print(f"Generated aggregated adj in {toc - tic:.1f}s")
+
+    tic_tot = time.time()
+    tic = time.time()
+    # Running spectral clustering
+    train_node_membership = spectral_cluster(train_agg_adj, num_classes=num_classes, verbose=False,
+                                             plot_eigenvalues=False, plot_save_path=result_file_path+'/plots')
+    toc = time.time()
+
+    print(f"Spectral clustering done in {toc - tic:.1f}s")
+
+    if verbose:
+        print("Community assignment prob:", np.unique(train_node_membership, return_counts=True)[1] / train_num_nodes)
+
+
+    tic = time.time()
+    bp_mu, bp_alpha, bp_beta, bp_alpha_beta_ratio = fitting_utils.estimate_bp_hawkes_params(train_event_dict,
+                                                                                            train_node_membership,
+                                                                                            train_duration,
+                                                                                            num_classes)
+    toc = time.time()
+
+    print(f"Hawkes params estimated in {toc - tic:.1f}s")
+
+    if verbose:
+        print("Mu:")
+        print(bp_mu)
+
+        print("Ratio:")
+        print(bp_alpha_beta_ratio)
+
+        print("Alpha:")
+        print(bp_alpha)
+
+        print("Beta:")
+        print(bp_beta)
+
+    train_hawkes_params = {'mu': bp_mu,
+                           'alpha_beta_ratio': bp_alpha_beta_ratio,
+                           'alpha': bp_alpha,
+                           'beta': bp_beta}
+
+    # Save results
+    with open(f'{result_file_path}/pred-all-model-params-k-{num_classes}.pckl', 'wb') as handle:
+        pickle.dump([train_num_nodes, train_num_events, train_duration,
+                     train_node_membership,
+                     train_hawkes_params], handle, protocol=pickle.HIGHEST_PROTOCOL)
+else:
+    with open(f'{result_file_path}/pred-all-model-params-k-{num_classes}.pckl', 'rb') as handle:
+        [train_num_nodes, train_num_events, train_duration,
+         train_node_membership,
+         train_hawkes_params] = pickle.load(handle)
+
+
+### calculate test log-likelihood
+combined_node_membership = fitting_utils.assign_node_membership_for_missing_nodes(train_node_membership,
+                                                                                  nodes_not_in_train)
+
+# # Calculate log-likelihood given the entire dataset
+# combined_block_pair_events = utils.event_dict_to_block_pair_events(combined_event_dict,
+#                                                                    combined_node_membership,
+#                                                                    num_classes)
+#
+# combined_log_likelihood = fitting_utils.calc_full_log_likelihood(combined_block_pair_events,
+#                                                                  combined_node_membership,
+#                                                                  train_hawkes_params['mu'],
+#                                                                  train_hawkes_params['alpha'],
+#                                                                  train_hawkes_params['beta'],
+#                                                                  combined_duration, num_classes)
+#
+# # Calculate log-likelihood given the train dataset
+# train_block_pair_events = utils.event_dict_to_block_pair_events(train_event_dict, train_node_membership, num_classes)
+# train_log_likelihood = fitting_utils.calc_full_log_likelihood(train_block_pair_events, train_node_membership,
+#                                                               train_hawkes_params['mu'],
+#                                                               train_hawkes_params['alpha'],
+#                                                               train_hawkes_params['beta'],
+#                                                               train_duration, num_classes)
+#
+# # Calculate per event log likelihood
+# ll_per_event = fitting_utils.calc_per_event_log_likelihood(combined_log_likelihood, train_log_likelihood,
+#                                                            test_event_dict, test_num_nodes)
+#
+# print("Test log_likelihood:", ll_per_event)
+
+
+if get_predictions:
+    # Calculate mean and variance for block-pair events counts in test dataset
+    test_block_pair_events = utils.event_dict_to_block_pair_events(test_event_dict,
+                                                                   combined_node_membership,
+                                                                   num_classes)
+
+    test_event_count_mean, test_event_count_variance = \
+        fitting_utils.compute_block_pair_event_count_empirical_mean_and_variance(test_block_pair_events,
+                                                                                 combined_node_membership,
+                                                                                 num_classes)
+
+    prediction_sample_mean, prediction_sample_var, test_block_pair_event_count = \
+        fitting_utils.compute_prediction_mean_and_variance_for_block_pair_event_count(train_hawkes_params['mu'],
+                                                                                      train_hawkes_params['alpha_beta_ratio'],
+                                                                                      test_block_pair_events,
+                                                                                      train_node_membership,
+                                                                                      num_classes, train_duration,
+                                                                                      test_duration)
+
+    print("Sample mean:")
+    print(prediction_sample_mean)
+    print(np.mean(prediction_sample_mean))
+
+    print("Sample var:")
+    print(prediction_sample_var)
+
+    print("Event count:")
+    print(test_block_pair_event_count)
+    print(np.mean(test_block_pair_event_count))
+
+    # Save results
+    with open(f'{result_file_path}/predictions-k-{num_classes}.pckl', 'wb') as handle:
+        pickle.dump([test_block_pair_events, test_event_count_mean, test_event_count_variance,
+                     prediction_sample_mean, prediction_sample_var,
+                     test_block_pair_event_count], handle, protocol=pickle.HIGHEST_PROTOCOL)
+else:
+    with open(f'{result_file_path}/predictions-k-{num_classes}.pckl', 'rb') as handle:
+        [test_block_pair_events, test_event_count_mean, test_event_count_variance,
+         prediction_sample_mean, prediction_sample_var,
+         test_block_pair_event_count] = pickle.load(handle)
+
+
+# print("Sample mean:")
+# print(prediction_sample_mean)
+# print(np.mean(prediction_sample_mean))
+#
+# print("Sample var:")
+# print(prediction_sample_var)
+#
+# print("Event count:")
+# print(test_block_pair_event_count)
+# print(np.mean(test_block_pair_event_count))
+
+percentage_within = []
+for i in np.arange(0, 1.01, 0.01):
+    lower_ci, upper_ci = norm.interval(i, loc=prediction_sample_mean, scale=np.sqrt(prediction_sample_var))
+
+    # lower_ci, upper_ci = norm.interval(1 - (1 - i) / (num_classes ** 2), loc=prediction_sample_mean,
+    #                                    scale=np.sqrt(prediction_sample_var))
+    block_pairs_within_interval = np.logical_and(test_block_pair_event_count >= lower_ci,
+                                                 test_block_pair_event_count <= upper_ci)
+
+    # print(i, block_pairs_within_interval)
+
+    percentage_within.append(np.sum(block_pairs_within_interval) / (num_classes ** 2))
+
+# print(test_block_pair_event_count)
+plt.plot(np.arange(0, 1.01, 0.01), percentage_within)
+plt.ylabel("Percentage of Block-pair Event Count Within CI")
+plt.xlabel("Width of Confidence Interval (CI)")
+plt.ylim((0, 1))
+plt.savefig(f'{result_file_path}/plots/prediction-interval.pdf', format='pdf')
+plt.show()
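The new full_fb_wall_post_prediction_2.py evaluates the CHIP fit by checking, for a grid of nominal confidence levels, what fraction of block pairs have their observed test event count inside a Gaussian interval built from the predicted mean and variance. A self-contained sketch of that coverage computation with made-up numbers; ci_coverage_curve is a hypothetical helper, not part of the repository:

import numpy as np
from scipy.stats import norm

def ci_coverage_curve(pred_mean, pred_var, observed, levels):
    """Fraction of block pairs whose observed count lies inside the Gaussian
    interval norm.interval(level, pred_mean, sqrt(pred_var)), per nominal level."""
    coverage = []
    for level in levels:
        lower, upper = norm.interval(level, loc=pred_mean, scale=np.sqrt(pred_var))
        coverage.append(np.mean((observed >= lower) & (observed <= upper)))
    return np.array(coverage)

# toy 3x3 block-pair example with made-up moments and counts
rng = np.random.default_rng(0)
pred_mean = np.full((3, 3), 50.0)
pred_var = np.full((3, 3), 100.0)
observed = rng.poisson(50, size=(3, 3)).astype(float)
print(ci_coverage_curve(pred_mean, pred_var, observed, levels=np.arange(0.1, 1.0, 0.2)))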
diff --git a/full_fb_wall_post_prediction_poisson.py b/full_fb_wall_post_prediction_poisson.py
new file mode 100644
index 0000000..09cc93e
--- /dev/null
+++ b/full_fb_wall_post_prediction_poisson.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+"""
+"Exploratory Analysis of the Facebook Wall Posts Dataset using CHIP"
+
+Here we fit the Poisson baseline model to the largest connected component of the Facebook wall-post dataset in a
+train/test setting to evaluate the log-likelihood of the model on the test set.
+
+@author: Makan Arastuie
+"""
+
+import time
+import pickle
+import numpy as np
+import dataset_utils
+from scipy.stats import norm
+import matplotlib.pyplot as plt
+from plotting_utils import heatmap
+import generative_model_utils as utils
+import model_fitting_utils as fitting_utils
+import parameter_estimation as estimate_utils
+from spectral_clustering import spectral_cluster
+import poisson_baseline_model_fitting as poisson_fitting
+
+
+result_file_path = '/shared/Results/CommunityHawkes/pickles/full_fb_fit/poisson'
+
+fit_chip = True
+verbose = False
+get_predictions = True
+num_classes = 9
+
+tic = time.time()
+((train_event_dict, train_num_nodes, train_duration),
+ (test_event_dict, test_num_nodes, test_duration),
+ (combined_event_dict, combined_num_events, combined_duration),
+ nodes_not_in_train) = dataset_utils.load_facebook_wall_2(largest_connected_component_only=True, train_percentage=0.8,
+                                                          plot_growth=False, remove_nodes_not_in_train=True)
+
+print(train_num_nodes + len(nodes_not_in_train))
+
+toc = time.time()
+print(f"Loaded the dataset in {toc - tic:.1f}s")
+
+train_num_events = utils.num_events_in_event_dict(train_event_dict)
+test_num_events = utils.num_events_in_event_dict(test_event_dict)
+
+print("Train: ", "Num Nodes:", train_num_nodes, "Duration:", train_duration, "Num Edges:", train_num_events)
+print("Test: ", "Num Nodes:", test_num_nodes, "Duration:", test_duration, "Num Edges:", test_num_events)
+print("Total: ", "Num Nodes:", train_num_nodes + len(nodes_not_in_train),
+      "Num Edges:", train_num_events + test_num_events)
+
+# fit Facebook Wall-posts
+if fit_chip:
+    tic = time.time()
+    train_agg_adj = utils.event_dict_to_aggregated_adjacency(train_num_nodes, train_event_dict)
+    toc = time.time()
+
+    if verbose:
+        print(f"Generated aggregated adj in {toc - tic:.1f}s")
+
+    tic_tot = time.time()
+    tic = time.time()
+    # Running spectral clustering
+    train_node_membership = spectral_cluster(train_agg_adj, num_classes=num_classes, verbose=False,
+                                             plot_eigenvalues=False, plot_save_path=result_file_path+'/plots')
+    toc = time.time()
+
+    print(f"Spectral clustering done in {toc - tic:.1f}s")
+
+    if verbose:
+        print("Community assignment prob:", np.unique(train_node_membership, return_counts=True)[1] / train_num_nodes)
+
+    tic = time.time()
+    # Fitting the model to the train data
+    train_node_membership, train_bp_lambda, train_block_count_matrix = \
+        poisson_fitting.fit_poisson_baseline_model(train_event_dict, train_num_nodes,
+                                                   train_duration, num_classes, verbose=verbose)
+    toc = time.time()
+
+    print(f"Poisson param estimated in {toc - tic:.1f}s")
+
+    if verbose:
+        print("Lambda:")
+        print(train_bp_lambda)
+
+    # Save results
+    with open(f'{result_file_path}/pred-param-k-{num_classes}.pckl', 'wb') as handle:
+        pickle.dump([train_num_nodes, train_num_events, train_duration,
+                     train_node_membership, train_block_count_matrix,
+                     train_bp_lambda], handle, protocol=pickle.HIGHEST_PROTOCOL)
+else:
+    with open(f'{result_file_path}/pred-param-k-{num_classes}.pckl', 'rb') as handle:
+        [train_num_nodes, train_num_events, train_duration,
+         train_node_membership, train_block_count_matrix,
+         train_bp_lambda] = pickle.load(handle)
+
+
+### calculate test log-likelihood
+# Add nodes that were not in train to the largest block
+combined_node_membership = fitting_utils.assign_node_membership_for_missing_nodes(train_node_membership,
+                                                                                  nodes_not_in_train)
+
+# Calculate log-likelihood given the entire dataset
+combined_count_matrix = poisson_fitting.event_dict_to_block_pair_event_counts(combined_event_dict,
+                                                                              combined_node_membership,
+                                                                              num_classes)
+
+combined_log_likelihood = poisson_fitting.calc_full_log_likelihood(combined_count_matrix, combined_node_membership,
+                                                                   combined_duration, train_bp_lambda, num_classes)
+
+# Calculate log-likelihood given the train dataset
+train_log_likelihood = poisson_fitting.calc_full_log_likelihood(train_block_count_matrix, train_node_membership,
+                                                                train_duration, train_bp_lambda, num_classes)
+
+# Calculate per event log likelihood
+ll_per_event = fitting_utils.calc_per_event_log_likelihood(combined_log_likelihood, train_log_likelihood,
+                                                           test_event_dict, test_num_nodes)
+
+print("Test log_likelihood:", ll_per_event)
+
+
+if get_predictions:
+    test_block_pair_events = utils.event_dict_to_block_pair_events(test_event_dict,
+                                                                   combined_node_membership,
+                                                                   num_classes)
+    test_block_pair_event_count = fitting_utils.compute_block_pair_total_event_count(test_block_pair_events,
+                                                                                     num_classes)
+
+    train_bp_size = utils.calc_block_pair_size(train_node_membership, num_classes)
+    prediction_sample_mean = train_bp_lambda * test_duration * train_bp_size
+
+    print("Sample mean and variance:")
+    print(prediction_sample_mean)
+    print("mean: ", np.mean(prediction_sample_mean))
+
+    print("Event count:")
+    print(test_block_pair_event_count)
+    print(np.mean(test_block_pair_event_count))
+
+    # Save results
+    with open(f'{result_file_path}/predictions-k-{num_classes}.pckl', 'wb') as handle:
+        pickle.dump([test_block_pair_events, test_block_pair_event_count,
+                     prediction_sample_mean], handle, protocol=pickle.HIGHEST_PROTOCOL)
+else:
+    with open(f'{result_file_path}/predictions-k-{num_classes}.pckl', 'rb') as handle:
+        [test_block_pair_events, test_block_pair_event_count,
+         prediction_sample_mean] = pickle.load(handle)
+
+
+percentage_within = []
+for i in np.arange(0, 1.01, 0.01):
+    # lower_ci, upper_ci = norm.interval(i, loc=prediction_sample_mean, scale=np.sqrt(prediction_sample_mean))
+
+    lower_ci, upper_ci = norm.interval(1 - (1 - i) / (num_classes ** 2), loc=prediction_sample_mean,
+                                       scale=np.sqrt(prediction_sample_mean))
+    block_pairs_within_interval = np.logical_and(test_block_pair_event_count >= lower_ci,
+                                                 test_block_pair_event_count <= upper_ci)
+
+    percentage_within.append(np.sum(block_pairs_within_interval) / (num_classes ** 2))
+
+plt.plot(np.arange(0, 1.01, 0.01), percentage_within)
+plt.ylabel("Percentage of Block-pair Event Count Within CI")
+plt.xlabel("Width of Confidence Interval (CI)")
+plt.ylim((0, 1))
+plt.savefig(f'{result_file_path}/plots/prediction-interval.pdf', format='pdf')
+plt.show()
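The Poisson baseline script above predicts each block pair's test event count with mean and variance equal to lambda * test_duration * block_pair_size, and widens the nominal level with the union-bound style correction 1 - (1 - level) / K**2 across the K**2 block pairs. A small sketch of that interval construction with toy numbers; poisson_count_interval and its arguments are illustrative, not part of the repository:

import numpy as np
from scipy.stats import norm

def poisson_count_interval(lam, bp_size, duration, level, n_block_pairs):
    """Normal-approximation interval for a block-pair event count under a Poisson
    baseline: mean = variance = lam * duration * bp_size, with the nominal level
    widened to 1 - (1 - level) / n_block_pairs as in the script above."""
    mean = lam * duration * bp_size
    adjusted = 1 - (1 - level) / n_block_pairs
    return norm.interval(adjusted, loc=mean, scale=np.sqrt(mean))

# hypothetical numbers: rate 0.002 events per node pair per time unit,
# 1500 node pairs in the block pair, test duration of 200 time units, K = 9
print(poisson_count_interval(0.002, 1500, 200.0, level=0.95, n_block_pairs=81))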
diff --git a/generative_model_utils.py b/generative_model_utils.py
index 6561f15..39ad15f 100644
--- a/generative_model_utils.py
+++ b/generative_model_utils.py
@@ -412,3 +412,6 @@ def simulate_community_hawkes(params=None, network_name=None, load_if_exists=Fal
 
     return event_dict, node_membership
+
+def compute_hawkes_conditional_intensity():
+    pass
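compute_hawkes_conditional_intensity is added above only as an empty stub. For reference, a minimal sketch of the quantity the name suggests: the conditional intensity of a univariate Hawkes process with an exponential excitation kernel, lambda(t) = mu + alpha * sum over t_i < t of exp(-beta * (t - t_i)), which is consistent with the alpha / beta branching ratio used elsewhere in this diff. This is an illustrative sketch, not the intended implementation of the stub:

import numpy as np

def hawkes_conditional_intensity(t, event_times, mu, alpha, beta):
    """lambda(t) = mu + alpha * sum_{t_i < t} exp(-beta * (t - t_i));
    the exponential kernel's branching ratio is alpha / beta."""
    event_times = np.asarray(event_times, dtype=float)
    past = event_times[event_times < t]
    return mu + alpha * np.sum(np.exp(-beta * (t - past)))

# toy parameters and event history
print(hawkes_conditional_intensity(5.0, [1.0, 2.5, 4.0], mu=0.2, alpha=0.8, beta=1.5))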
diff --git a/model_fitting_utils.py b/model_fitting_utils.py
index 5e9420e..d02fbf6 100644
--- a/model_fitting_utils.py
+++ b/model_fitting_utils.py
@@ -35,7 +35,7 @@ def fit_community_model(event_dict, num_nodes, duration, num_classes, local_sear
     # adj = utils.event_dict_to_adjacency(num_nodes, event_dict)
 
     # Running spectral clustering
-    node_membership = spectral_cluster(agg_adj, num_classes, verbose=False)
+    node_membership = spectral_cluster(agg_adj, num_classes, verbose=False, plot_eigenvalues=False)
 
     if local_search_max_iter > 0 and num_classes > 1:
         node_membership, bp_mu, bp_alpha, bp_beta = cls.chip_local_search(event_dict, num_classes, node_membership,
@@ -326,3 +326,79 @@ def compute_mu_pairwise_difference_confidence_interval(event_dict, node_membersh
             pairwise_res_dict[(a, b, x, y)] = (diff, ci)
 
     return pairwise_res_dict
+
+
+def compute_block_pair_event_count_empirical_mean_and_variance(block_pair_events, node_membership, n_classes):
+    """
+    Computes the mean and variance of block pair event counts
+
+    :param block_pair_events: (list) n_classes x n_classes where entry ij is a list of event lists between nodes in
+        block i to nodes in block j.
+    :param node_membership: (list) membership of every node to one of K classes.
+    :param n_classes: (int) number of blocks / classes
+
+    :return: a tuple of two matrices of KxK for mean and variance of block pair event counts
+    """
+
+    bp_size = utils.calc_block_pair_size(node_membership, n_classes).astype(int)
+
+    block_pair_events_counts_mean = np.zeros((n_classes, n_classes))
+    block_pair_events_counts_variance = np.zeros((n_classes, n_classes))
+    for i in range(n_classes):
+        for j in range(n_classes):
+            temp_counts = [len(event_list) for event_list in block_pair_events[i][j]]  # actual counts
+            temp_counts.extend([0] * (bp_size[i, j] - len(temp_counts)))  # add 0's for node-pairs with no events
+
+            block_pair_events_counts_mean[i, j] = np.mean(temp_counts)
+            block_pair_events_counts_variance[i, j] = np.std(temp_counts) ** 2
+
+    return block_pair_events_counts_mean, block_pair_events_counts_variance
+
+
+def compute_block_pair_total_event_count(block_pair_events, n_classes):
+    """
+    Computes the total number of events in each block pair.
+
+    :param block_pair_events: (list) n_classes x n_classes where entry ij is a list of event lists between nodes in
+        block i to nodes in block j.
+    :param n_classes: (int) number of blocks / classes
+
+    :return: n_classes x n_classes matrix where entry ij is the number of events in block pair ij
+    """
+
+    block_pair_event_count = np.zeros((n_classes, n_classes))
+
+    for i in range(n_classes):
+        for j in range(n_classes):
+            block_pair_event_count[i, j] = np.sum([len(event_list) for event_list in block_pair_events[i][j]])
+
+    return block_pair_event_count
+
+
+def compute_prediction_mean_and_variance_for_block_pair_event_count(train_bp_mu, train_bp_alpha_beta_ratio,
+                                                                    test_block_pair_events,
+                                                                    train_node_membership, n_classes,
+                                                                    train_duration, test_duration):
+    """
+    Computes sample mean and variance of block pair event counts
+
+    :param train_bp_mu: (KxK np array) block pair baseline rates estimated from the train dataset.
+    :param train_bp_alpha_beta_ratio: (KxK np array) block pair alpha to beta ratios estimated from the train dataset.
+    :param test_block_pair_events: (list) n_classes x n_classes where entry ij is a list of event lists between nodes in
+        block i to nodes in block j of the test dataset.
+    :param train_node_membership: (list) membership of every node to one of K classes in the train dataset.
+    :param n_classes: (int) number of blocks / classes
+    :param train_duration: duration of the train dataset
+    :param test_duration: duration of the test dataset
+
+    :return: a tuple of three KxK matrices: predicted sample mean and variance of block pair event counts over the
+        test duration, and the observed block pair event counts in the test dataset.
+    """
+    train_bp_size = utils.calc_block_pair_size(train_node_membership, n_classes)
+
+    sample_mean = (train_bp_mu * train_duration) / (1 - train_bp_alpha_beta_ratio)
+    sample_mean = (sample_mean / train_duration) * test_duration * train_bp_size
+
+    sample_var = (train_bp_mu * train_duration) / ((1 - train_bp_alpha_beta_ratio) ** 3)
+    sample_var = (sample_var / train_duration) * test_duration * train_bp_size
+
+    test_block_pair_event_count = compute_block_pair_total_event_count(test_block_pair_events, n_classes)
+
+    return sample_mean, sample_var, test_block_pair_event_count
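The prediction formulas in compute_prediction_mean_and_variance_for_block_pair_event_count above follow standard moments of a stationary Hawkes process with branching ratio m = alpha / beta: over a window of length T, one node pair produces on average mu * T / (1 - m) events with variance mu * T / (1 - m)**3, and block-pair totals scale by the number of node pairs. A small numeric check of that scaling with toy parameters (not fitted values); block_pair_count_moments is a hypothetical helper:

import numpy as np

def block_pair_count_moments(mu, m, bp_size, test_duration):
    """Predicted mean / variance of a block pair's event count over the test window,
    assuming each of bp_size node pairs is an independent Hawkes process with
    baseline rate mu and branching ratio m = alpha / beta."""
    mean_per_pair = mu * test_duration / (1.0 - m)
    var_per_pair = mu * test_duration / (1.0 - m) ** 3
    return bp_size * mean_per_pair, bp_size * var_per_pair

# toy parameters: mu = 0.003, m = 0.4, 2000 node pairs, test window of 200 time units
print(block_pair_count_moments(mu=0.003, m=0.4, bp_size=2000, test_duration=200.0))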
diff --git a/poisson_baseline_model_fitting.py b/poisson_baseline_model_fitting.py
index 12a702a..5b7a811 100644
--- a/poisson_baseline_model_fitting.py
+++ b/poisson_baseline_model_fitting.py
@@ -96,14 +96,15 @@ def fit_poisson_baseline_model(event_dict, num_nodes, duration, num_classes, ver
     :return: node_membership, lambda, block_pair_events
     """
-    adj = utils.event_dict_to_adjacency(num_nodes, event_dict)
+    # adj = utils.event_dict_to_adjacency(num_nodes, event_dict)
+    agg_adj = utils.event_dict_to_aggregated_adjacency(num_nodes, event_dict)
 
     # if number of there are as many classes as nodes, assign each node to its own class
     if num_classes == num_nodes:
         node_membership = list(range(num_nodes))
     else:
         # Running spectral clustering
-        node_membership = spectral_cluster(adj, num_classes=num_classes)
+        node_membership = spectral_cluster(agg_adj, num_classes=num_classes)
 
     count_matrix = event_dict_to_block_pair_event_counts(event_dict, node_membership, num_classes)
 
@@ -194,12 +195,21 @@ def calc_full_log_likelihood(count_matrix, node_membership, duration, bp_lambda,
 
 # Running Poisson baseline model on Facebook, Enron, Reality Mining
 if __name__ == "__main__":
+    # # # Entire Facebook Dataset
+    # print("Entire Facebook wall-post dataset 2")
+    # fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
+    #     dataset_utils.load_facebook_wall_2(timestamp_max=1000, largest_connected_component_only=True,
+    #                                        train_percentage=0.8, remove_nodes_not_in_train=True)
+    # fit_and_eval_poisson_baseline(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
+    #                               k_values_to_test=np.arange(101, 201), verbose=False)
+
     # Entire Facebook Dataset
     print("Facebook wall-post dataset")
     fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
-        dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True, train_percentage=0.8)
+        dataset_utils.load_facebook_wall(timestamp_max=1000, largest_connected_component_only=True,
+                                         train_percentage=0.8)
     fit_and_eval_poisson_baseline(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
-                                  k_values_to_test=np.arange(11, 43), verbose=False)
+                                  k_values_to_test=np.arange(1, 11), verbose=False)
 
     # # Facebook Dataset
     # print("Facebook wall-post dataset")
     # fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train = \
     #     dataset_utils.load_fb_train_test(remove_nodes_not_in_train=False)
     # fit_and_eval_poisson_baseline(fb_train_tuple, fb_test_tuple, fb_combined_tuple, fb_nodes_not_in_train,
     #                               k_values_to_test=np.arange(1, 150), verbose=False)
-    #
+
+
     # # Enron Dataset
     # print("Enron dataset")
     # enron_train_tuple, enron_test_tuple, enron_combined_tuple, enron_nodes_not_in_train = \
diff --git a/sim_end_to_end_count_based_estimates.py b/sim_end_to_end_count_based_estimates.py
index 637b70d..d9108b3 100644
--- a/sim_end_to_end_count_based_estimates.py
+++ b/sim_end_to_end_count_based_estimates.py
@@ -27,7 +27,7 @@
 Path(join(result_file_path, 'plots')).mkdir(parents=True, exist_ok=True)
 
 run_analysis = False
-run_plotting = True
+run_plotting = False
 run_regression = True
 
 # # sim params
@@ -297,10 +297,14 @@ def calc_mean_and_error_of_count_estiamte(n_nodes):
 if run_regression:
     print("\nRegression: \n")
     print("Estimated Communities:\n")
+
+    start_idx = 0
+    end_idx = 3
+    num_nodes = num_nodes_to_test[start_idx:end_idx]
     for param, err in params.items():
         print(param, '(estimated communities)')
-        x = np.log(num_nodes_to_test).reshape(len(num_nodes_to_test), 1)
-        y = np.log(ece[err[0]]).reshape(len(ece[err[0]]), 1)
+        x = np.log(num_nodes).reshape(len(num_nodes), 1)
+        y = np.log(ece[err[0]][start_idx:end_idx]).reshape(len(ece[err[0]][start_idx:end_idx]), 1)
 
         reg = LinearRegression().fit(x, y)
 
@@ -310,8 +314,8 @@ def calc_mean_and_error_of_count_estiamte(n_nodes):
         print()
 
         print(param, '(known communities)')
-        x = np.log(num_nodes_to_test).reshape(len(num_nodes_to_test), 1)
-        y = np.log(kce[err[0]]).reshape(len(kce[err[0]]), 1)
+        x = np.log(num_nodes).reshape(len(num_nodes), 1)
+        y = np.log(kce[err[0]][start_idx:end_idx]).reshape(len(kce[err[0]][start_idx:end_idx]), 1)
 
         reg = LinearRegression().fit(x, y)
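The regression change above restricts the log-log fit to the first three network sizes; the fitted slope is read as an empirical convergence rate, with the estimation error roughly proportional to n**slope. A self-contained sketch of that slope computation on synthetic errors decaying as n**(-1/2); log_log_slope is a hypothetical helper wrapping the same LinearRegression call used above:

import numpy as np
from sklearn.linear_model import LinearRegression

def log_log_slope(num_nodes, errors):
    """Slope of log(error) vs. log(n), i.e. the exponent in error ~ C * n**slope."""
    x = np.log(num_nodes).reshape(-1, 1)
    y = np.log(errors).reshape(-1, 1)
    return LinearRegression().fit(x, y).coef_[0][0]

n = np.array([128, 256, 512])
print(log_log_slope(n, 3.0 * n ** -0.5))  # approximately -0.5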