diff --git a/.gitignore b/.gitignore
index 935f522..1556515 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,8 +3,8 @@
 data/
 etc/
 result/
 save_model/
-vae_gen_data/
-gan_gen_data/
+gen_data_vae/
+gen_data_gan/
 tensorboard/
 
 ### Notebook ###
diff --git a/run_timegan.py b/run_timegan.py
index f49434a..604fc25 100644
--- a/run_timegan.py
+++ b/run_timegan.py
@@ -63,7 +63,8 @@
 print('>>>> TRAINING COMPLETE!')
 
 if args.is_generate:
     gen_data = timegan_generator(model, args.num_generation, args)
-    np.save(f'./gan_gen_data/gen_data', gen_data)
+    np.save(f'./gen_data_gan/gen_data', gen_data)
+    print('>>>> GENERATION COMPLETE!')
diff --git a/run_vrae.py b/run_vrae.py
index fdf7203..9ffcab3 100644
--- a/run_vrae.py
+++ b/run_vrae.py
@@ -99,13 +99,13 @@
     # save original data
     train_org = pd.DataFrame(TRAIN_DF if args.undo == True else TRAIN_SCALED, columns=cols)
-    train_org.to_csv(f'./vae_gen_data/train/original_{args.scale_type}_un_{args.undo}.csv')
-    print('>> SAVED TRAIN ORIGINAL Data!! (Loc: vae_gen_data)')
+    train_org.to_csv(f'./gen_data_vae/train/original_{args.scale_type}_un_{args.undo}.csv')
+    print('>> SAVED TRAIN ORIGINAL Data!! (Loc: gen_data_vae)')
 
     # save reconstructed data
     train_gen = pd.DataFrame(train_recon, columns=cols)
-    train_gen.to_csv(f'./vae_gen_data/train/VRAE_{args.scale_type}_un_{args.undo}_hidden_{args.hidden_layer_depth}_win_{args.sequence_length}_ep_{args.n_epochs}.csv')
-    print('>> SAVED TRAIN RECONSTRUCTED Data!! (Loc: vae_gen_data)')
+    train_gen.to_csv(f'./gen_data_vae/train/VRAE_{args.scale_type}_un_{args.undo}_hidden_{args.hidden_layer_depth}_win_{args.sequence_length}_ep_{args.n_epochs}.csv')
+    print('>> SAVED TRAIN RECONSTRUCTED Data!! (Loc: gen_data_vae)')
 
 # TEST dataset reconstruction
 if args.is_generate_test:
@@ -132,13 +132,13 @@
     # save original data
     test_org = pd.DataFrame(TRAIN_DF if args.undo == True else TRAIN_SCALED, columns=cols)
-    test_org.to_csv(f'./vae_gen_data/test/original_{args.scale_type}_un_{args.undo}.csv')
-    print('>> SAVED TEST ORIGINAL Data!! (Loc: vae_gen_data)')
+    test_org.to_csv(f'./gen_data_vae/test/original_{args.scale_type}_un_{args.undo}.csv')
+    print('>> SAVED TEST ORIGINAL Data!! (Loc: gen_data_vae)')
 
     # save reconstructed data
     test_gen = pd.DataFrame(test_recon, columns=cols)
-    test_gen.to_csv(f'./vae_gen_data/test/VRAE_{args.scale_type}_un_{args.undo}_hidden_{args.hidden_layer_depth}_win_{args.sequence_length}_ep_{args.n_epochs}.csv')
-    print('>> SAVED TEST RECONSTRUCTED Data!! (Loc: vae_gen_data)')
+    test_gen.to_csv(f'./gen_data_vae/test/VRAE_{args.scale_type}_un_{args.undo}_hidden_{args.hidden_layer_depth}_win_{args.sequence_length}_ep_{args.n_epochs}.csv')
+    print('>> SAVED TEST RECONSTRUCTED Data!! (Loc: gen_data_vae)')
 
 # IF both TRAIN and TEST data reconstruction is conducted
 if args.is_generate_train and args.is_generate_test:
diff --git a/utils/TSTR.py b/utils/TSTR.py
new file mode 100644
index 0000000..70e5f0e
--- /dev/null
+++ b/utils/TSTR.py
@@ -0,0 +1,155 @@
+import torch
+import numpy as np
+from tqdm import trange
+
+
+class GeneralRNN(torch.nn.Module):
+    r"""A general RNN model for time-series prediction."""
+
+    def __init__(self, args):
+        super(GeneralRNN, self).__init__()
+        self.model_type = args['model_type']
+
+        self.input_size = args['in_dim']
+        self.hidden_size = args['h_dim']
+        self.output_size = args['out_dim']
+        self.num_layers = args['n_layers']
+        self.dropout = args['dropout']
+        self.bidirectional = args['bidirectional']
+
+        self.max_seq_len = args['max_seq_len']
+
+        self.rnn_module = self._get_rnn_module(self.model_type)
+
+        self.rnn_layer = self.rnn_module(
+            input_size=self.input_size,
+            hidden_size=self.hidden_size,
+            num_layers=self.num_layers,
+            batch_first=True,
+            dropout=self.dropout,
+            bidirectional=self.bidirectional
+        )
+
+        self.linear_layer = torch.nn.Linear(
+            in_features=self.hidden_size,
+            out_features=self.output_size
+        )
+
+    def _get_rnn_module(self, model_type):
+        if model_type == "rnn":
+            return torch.nn.RNN
+        elif model_type == "lstm":
+            return torch.nn.LSTM
+        elif model_type == "gru":
+            return torch.nn.GRU
+        else:
+            raise ValueError(f"Unknown model_type: {model_type}")
+
+    def forward(self, X):
+        # Run the RNN over the full sequence and project every hidden state
+        # to the output dimension (note: padding is not masked here)
+        H_o, H_t = self.rnn_layer(X)
+        logits = self.linear_layer(H_o)
+
+        return logits
+
+
+def rmse_error(y_true, y_pred):
+    """User-defined root mean squared error.
+    Args:
+        - y_true: true labels
+        - y_pred: predictions
+    Returns:
+        - computed_rmse: computed RMSE loss
+    """
+    # Exclude masked labels
+    idx = (y_true >= 0) * 1
+    # Mean squared loss excluding masked labels
+    computed_mse = np.sum(idx * ((y_true - y_pred) ** 2)) / np.sum(idx)
+    computed_rmse = np.sqrt(computed_mse)
+    return computed_rmse
+
+
+def one_step_ahead_prediction(train_data, test_data):
+    """Use the previous time-series values to predict one-step-ahead feature values.
+    Args:
+        - train_data: training time-series
+        - test_data: testing time-series
+    Returns:
+        - perf: average one-step-ahead prediction error (RMSE)
+    """
+    # Parameters (match the project's window length and feature count)
+    no, seq_len, dim = 256, 30, 92
+
+    # Set model parameters
+    args = {}
+    args["device"] = "cuda" if torch.cuda.is_available() else "cpu"
+    args["task"] = "regression"
+    args["model_type"] = "gru"
+    args["bidirectional"] = False
+    args["epochs"] = 20
+    args["batch_size"] = 256
+    args["in_dim"] = dim
+    args["h_dim"] = dim
+    args["out_dim"] = dim
+    args["n_layers"] = 3
+    args["dropout"] = 0.5
+    args["max_seq_len"] = 30  # only the first 29 steps are used as input
+    args["learning_rate"] = 1e-3
+    args["grad_clip_norm"] = 5.0
+
+    # Set training features and labels
+    train_dataloader = torch.utils.data.DataLoader(
+        train_data,
+        batch_size=args["batch_size"],
+        shuffle=True
+    )
+
+    # Set testing features and labels
+    test_dataloader = torch.utils.data.DataLoader(
+        test_data,
+        batch_size=no,
+        shuffle=True
+    )
+
+    # Initialize model
+    model = GeneralRNN(args)
+    model.to(args["device"])
+    criterion = torch.nn.MSELoss()
+    optimizer = torch.optim.Adam(
+        model.parameters(),
+        lr=args["learning_rate"]
+    )
+
+    # Train the predictive model
+    logger = trange(args["epochs"], desc="Epoch: 0, Loss: 0")
+    for epoch in logger:
+        running_loss = 0.0
+
+        for train_x in train_dataloader:
+            train_x = train_x.to(args["device"])
+            # zero the parameter gradients
+            optimizer.zero_grad()
+            # forward: predict steps 1..29 from steps 0..28
+            train_p = model(train_x[:, 0:29, :])
+            loss = criterion(train_p, train_x[:, 1:30, :])
+            # backward
+            loss.backward()
+            # optimize
+            optimizer.step()
+
+            running_loss += loss.item()
+
+        logger.set_description(f"Epoch: {epoch}, Loss: {running_loss:.4f}")
+
+    # Evaluate the trained model (eval mode disables dropout)
+    model.eval()
+    with torch.no_grad():
+        perf = 0
+        for test_x in test_dataloader:
+            test_x = test_x.to(args["device"])
+            test_p = model(test_x[:, 0:29, :]).cpu()
+
+            test_p = np.reshape(test_p.numpy(), [-1])
+            test_y = np.reshape(test_x[:, 1:30, :].cpu().numpy(), [-1])
+
+            perf += rmse_error(test_y, test_p)
+
+    # Average over batches so the score does not grow with the test set size
+    perf /= len(test_dataloader)
+
+    return perf
\ No newline at end of file
diff --git a/utils/visualization.py b/utils/visualization.py
new file mode 100644
index 0000000..ef76b81
--- /dev/null
+++ b/utils/visualization.py
@@ -0,0 +1,83 @@
+from sklearn.manifold import TSNE
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def visualization(ori_data, generated_data, analysis):
+    """Use PCA or t-SNE to visualize generated and original data.
+
+    Args:
+        - ori_data: original data
+        - generated_data: generated synthetic data
+        - analysis: 'tsne' or 'pca'
+    """
+    # Analysis sample size (for faster computation)
+    anal_sample_no = min([1000, len(ori_data)])
+    idx = np.random.permutation(len(ori_data))[:anal_sample_no]
+
+    # Data preprocessing
+    ori_data = np.asarray(ori_data)
+    generated_data = np.asarray(generated_data)
+
+    ori_data = ori_data[idx]
+    generated_data = generated_data[idx]
+
+    no, seq_len, dim = ori_data.shape
+
+    # Collapse each sequence to its per-step mean over features: one (1, seq_len) row per sample
+    for i in range(anal_sample_no):
+        if (i == 0):
+            prep_data = np.reshape(np.mean(ori_data[0, :, :], 1), [1, seq_len])
+            prep_data_hat = np.reshape(np.mean(generated_data[0, :, :], 1), [1, seq_len])
+        else:
+            prep_data = np.concatenate((prep_data,
+                                        np.reshape(np.mean(ori_data[i, :, :], 1), [1, seq_len])))
+            prep_data_hat = np.concatenate((prep_data_hat,
+                                            np.reshape(np.mean(generated_data[i, :, :], 1), [1, seq_len])))
+
+    # Visualization parameters
+    colors = ["tab:blue" for i in range(anal_sample_no)] + ["tab:orange" for i in range(anal_sample_no)]
+
+    if analysis == 'pca':
+        # PCA analysis: fit on the original data, project both sets
+        pca = PCA(n_components=2)
+        pca.fit(prep_data)
+        pca_results = pca.transform(prep_data)
+        pca_hat_results = pca.transform(prep_data_hat)
+
+        # Plotting
+        f, ax = plt.subplots(1)
+        plt.scatter(pca_results[:, 0], pca_results[:, 1],
+                    c=colors[:anal_sample_no], alpha=0.2, label="Original")
+        plt.scatter(pca_hat_results[:, 0], pca_hat_results[:, 1],
+                    c=colors[anal_sample_no:], alpha=0.2, label="Synthetic")
+
+        ax.legend()
+        plt.title('PCA plot')
+        plt.xlabel('x_pca')
+        plt.ylabel('y_pca')
+        plt.show()
+
+    elif analysis == 'tsne':
+        # Embed original and synthetic data together so they share one t-SNE space
+        prep_data_final = np.concatenate((prep_data, prep_data_hat), axis=0)
+
+        # t-SNE analysis
+        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
+        tsne_results = tsne.fit_transform(prep_data_final)
+
+        # Plotting
+        f, ax = plt.subplots(1)
+
+        plt.scatter(tsne_results[:anal_sample_no, 0], tsne_results[:anal_sample_no, 1],
+                    c=colors[:anal_sample_no], alpha=0.2, label="Original")
+        plt.scatter(tsne_results[anal_sample_no:, 0], tsne_results[anal_sample_no:, 1],
+                    c=colors[anal_sample_no:], alpha=0.2, label="Synthetic")
+
+        ax.legend()
+
+        plt.title('t-SNE plot')
+        plt.xlabel('x_tsne')
+        plt.ylabel('y_tsne')
+        plt.show()
\ No newline at end of file
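
Usage sketch (not part of the patch): one way the two new utilities could be driven
end to end. Both arrays are assumed to have the (N, 30, 92) shape that TSTR.py
hard-codes; ori_data.npy is a hypothetical stand-in for wherever the real windows
are saved, since only gen_data.npy is actually written by run_timegan.py.

import numpy as np
import torch
from utils.TSTR import one_step_ahead_prediction
from utils.visualization import visualization

# run_timegan.py saves ./gen_data_gan/gen_data.npy; ori_data.npy is assumed here
ori = np.load('./gen_data_gan/ori_data.npy')
gen = np.load('./gen_data_gan/gen_data.npy')

# TSTR (Train on Synthetic, Test on Real): fit the one-step-ahead predictor on
# synthetic windows, evaluate on real ones; lower RMSE = more useful synthetic data
rmse = one_step_ahead_prediction(
    torch.from_numpy(gen).float(),  # train on synthetic
    torch.from_numpy(ori).float(),  # test on real
)
print(f'TSTR one-step-ahead RMSE: {rmse:.4f}')

# 2-D distribution checks of original vs. synthetic windows
visualization(ori, gen, 'pca')
visualization(ori, gen, 'tsne')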