diff --git a/README.md b/README.md
index a344739..a09f4ba 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,6 @@ python run_vrae.py
-- Code author : 박경찬
 - pyTorch implementation for `TimeGAN`
 - Code Reference : https://github.com/d9n13lt4n/timegan-pytorch
@@ -37,7 +36,6 @@ python run_vrae.py
-- Code author : 정의석
 - pyTorch implementation for `VRAE`
 - Code Reference : https://github.com/tejaslodaya/timeseries-clustering-vae
@@ -68,7 +66,10 @@ TimeGAN has 2 Modes, which is used to decide whether to train or generate :
 VRAE has 3 Modes, which is used to decide whether to train or generate(train) or generate(test) :
 1. is_train (default = True) : train model with loaded train data (window_size=30, stride=1)
 2. is_generate_train (default = True) : generate train dataset loaded sequentially (window_size=stride)
-3. is_generate_test (default = True) : generate test dataset loaded sequentially (window_size=stride)
+3. is_generate_test (default = False) : generate test dataset loaded sequentially (window_size=stride)
+
+✨ The train/test split query in this code is currently written for my side project.
+✨ If you want to use a train/test split, go to `utils.custom_dataset` and change the query.
 ```
 # Mode 1 : Train mode
diff --git a/config_vrae.py b/config_vrae.py
index c462cbc..7c6fe31 100644
--- a/config_vrae.py
+++ b/config_vrae.py
@@ -29,7 +29,22 @@ def parser_setting(parser):
     parser.add_argument(
         '--file_name',
         default='netis',
-        type=str)
+        type=str)
+    parser.add_argument(
+        '--cols_to_remove',
+        default='Time',
+        type=str,
+        nargs='*',
+        help = 'Columns to remove')
+    parser.add_argument(
+        "--split",
+        type=str2bool,
+        default=False,
+        help = 'Argument for Train/Test split')
+    parser.add_argument(
+        '--time_gap',
+        default=100,
+        type=int)

     # train/generate argument
     parser.add_argument(
@@ -40,10 +55,12 @@ def parser_setting(parser):
         "--is_generate_train",
         type=str2bool,
         default=True)
+
+    # TODO : For further testing
     parser.add_argument(
         "--is_generate_test",
         type=str2bool,
-        default=True)
+        default=False)

     # rescaling argument
     parser.add_argument(
diff --git a/models/vrae.py b/models/vrae.py
index 4d076fa..ade24df 100644
--- a/models/vrae.py
+++ b/models/vrae.py
@@ -327,7 +327,7 @@ def _train(self, train_loader):
             if (t + 1) % self.print_every == 0:
                 print('Batch %d, loss = %.4f, recon_loss = %.4f, kl_loss = %.4f' % (t + 1, loss.item(), recon_loss.item(), kl_loss.item()))
-
+
         print('Average loss: {:.4f}'.format(epoch_loss / t))
         return epoch_loss / t
diff --git a/run_vrae.py b/run_vrae.py
index 0eeec1c..846ec4d 100644
--- a/run_vrae.py
+++ b/run_vrae.py
@@ -38,21 +38,23 @@
 WINDOW_SIZE = args.window_size # Window size
 scale_type = args.scale_type # scaler type ('Standard' or 'MinMax' or 'Robust')
 undo = args.undo # whether to unscale during reconstruction
+cols_to_remove = args.cols_to_remove # columns to remove
+split = args.split # whether to perform a TRAIN/TEST split
+time_gap = args.time_gap # data collection interval
+
 # Load & Scale data
-TRAIN_DF, TEST_DF, TRAIN_SCALED, TEST_SCALED, TRAIN_Time, TEST_Time, cols, scaler = load_gen_data(file_name = file_name, scale_type = scale_type)
+TRAIN_DF, TEST_DF, TRAIN_SCALED, TEST_SCALED, TRAIN_Time, TEST_Time, cols, scaler = load_gen_data(file_name = file_name, \
+                                                                                                  scale_type = scale_type, \
+                                                                                                  cols_to_remove = cols_to_remove, \
+                                                                                                  split = split)
 # under custom_dataset.py
 ## Train dataset with stride
-train_dataset = NetisDataset(data = TRAIN_SCALED, timestamps = TRAIN_Time, window_size = WINDOW_SIZE, stride =1)
-
-## Test dataset with no window collapse (for generation window size must be WINDOW_SIZE)
-train_gen_dataset = NetisDataset(data = TRAIN_SCALED, timestamps = TRAIN_Time, window_size = WINDOW_SIZE, stride = WINDOW_SIZE)
-
-test_gen_dataset = NetisDataset(data = TEST_SCALED, timestamps = TEST_Time, window_size = WINDOW_SIZE, stride = WINDOW_SIZE)
+train_dataset = TimeSeriesDataset(data = TRAIN_SCALED, timestamps = TRAIN_Time, window_size = WINDOW_SIZE, stride = 1, time_gap = time_gap)

 # SET ARGUMENTS
-args.dload = "./save_model"
+args.dload = "./save_model_test" # CHANGE BACK TO NORMAL AFTER TESTING
 args.sequence_length = WINDOW_SIZE
 args.number_of_features = train_dataset[0].shape[1]
@@ -78,6 +80,10 @@
 # TRAIN dataset reconstruction
 if args.is_generate_train:
+
+    # define train generation data
+    train_gen_dataset = TimeSeriesDataset(data = TRAIN_SCALED, timestamps = TRAIN_Time, window_size = WINDOW_SIZE, stride = WINDOW_SIZE, time_gap = time_gap)
+
     # FOR GENERATION MUST HAVE batch_size 1
     args.batch_size = 1
@@ -128,6 +134,10 @@
 # TEST dataset reconstruction
 if args.is_generate_test:
+
+    # define test generation data
+    test_gen_dataset = TimeSeriesDataset(data = TEST_SCALED, timestamps = TEST_Time, window_size = WINDOW_SIZE, stride = WINDOW_SIZE, time_gap = time_gap)
+
     # FOR GENERATION MUST HAVE batch_size 1
     args.batch_size = 1
diff --git a/save_model_test/VRAE_MinMax_un_False_hidden_1_win_10_ep_10.pth b/save_model_test/VRAE_MinMax_un_False_hidden_1_win_10_ep_10.pth
new file mode 100644
index 0000000..11e027a
Binary files /dev/null and b/save_model_test/VRAE_MinMax_un_False_hidden_1_win_10_ep_10.pth differ
diff --git a/save_model_test/VRAE_MinMax_un_False_hidden_1_win_30_ep_1.pth b/save_model_test/VRAE_MinMax_un_False_hidden_1_win_30_ep_1.pth
new file mode 100644
index 0000000..f62e8b9
Binary files /dev/null and b/save_model_test/VRAE_MinMax_un_False_hidden_1_win_30_ep_1.pth differ
diff --git a/save_model_test/VRAE_MinMax_un_False_hidden_1_win_5_ep_5.pth b/save_model_test/VRAE_MinMax_un_False_hidden_1_win_5_ep_5.pth
new file mode 100644
index 0000000..7de8586
Binary files /dev/null and b/save_model_test/VRAE_MinMax_un_False_hidden_1_win_5_ep_5.pth differ
diff --git a/utils/custom_dataset.py b/utils/custom_dataset.py
index 11917cc..6ecc801 100644
--- a/utils/custom_dataset.py
+++ b/utils/custom_dataset.py
@@ -20,43 +20,50 @@ def fix_seed(seed: int) -> None:
     random.seed(seed)

 # load generation data
-def load_gen_data(file_name, scale_type = 'Standard', cols_to_remove = None):
+def load_gen_data(file_name, scale_type = 'MinMax', \
+                  cols_to_remove = ['Time'], split = False):
     """
     file_name: file_name in data location
+    scale_type : choose scaling type
     """
-    # define path(must be in pkl file)
-    data_loc = f'./data/netis/{file_name}.pkl'
+    # define path (must be under the data folder, as a pkl file)
+    data_loc = f'./data/{file_name}.pkl'

     # get data
     with open(data_loc, 'rb') as f:
         df = pickle.load(f)

-    # if needed remove columns that is not necessary
-    if cols_to_remove != None:
-        df = df_total.drop(cols_to_remove, axis=1)
-
+    # drop missing values
     df = df.dropna()

-    # TRAIN TEST SPLIT
-    # TRAIN
-    TRAIN_DF = df.query('Time < 20211103184400 or Time > 20211106084400 and label==0')
-
-    # TEST (normal data only)
-    TEST_DF = df.query('Time >= 20211103184400 and Time <= 20211106084400 and label==0')
-
-    TOTAL_DF = df.to_numpy()
-
-    # REMOVE TIME & LABEL
+    # TRAIN/TEST SPLIT; if not requested, use the full dataset
+    if split:
+        # written for the Netis data, so adjust the Train/Test split query to fit your domain
+        TOTAL_DF = df
+        TRAIN_DF = df.query('Time < 20211103184400 or Time > 20211106084400 and label==0')
+        TEST_DF = df.query('Time >= 20211103184400 and Time <= 20211106084400 and label==0')
+    else:
+        TOTAL_DF = df
+        TRAIN_DF = df
+        TEST_DF = df
+
+    # store timestamp information
     TRAIN_Time = TRAIN_DF['Time']
     TEST_Time = TEST_DF['Time']

-    # remove time & label
-    TRAIN_DF = TRAIN_DF.iloc[:,1:-1]
-    TEST_DF = TEST_DF.iloc[:,1:-1]
+    # if needed, remove columns that are not necessary
+    # (drop the specified columns)
+    if cols_to_remove is not None:
+        TOTAL_DF = TOTAL_DF.drop(cols_to_remove, axis=1)
+        TRAIN_DF = TRAIN_DF.drop(cols_to_remove, axis=1)
+        TEST_DF = TEST_DF.drop(cols_to_remove, axis=1)

-    cols = TRAIN_DF.columns
+    # get column info
+    cols = TOTAL_DF.columns

+    # to numpy
+    TOTAL_DF = TOTAL_DF.to_numpy()
     TRAIN_DF = TRAIN_DF.to_numpy()
     TEST_DF = TEST_DF.to_numpy()
@@ -74,22 +81,9 @@ def load_gen_data(file_name, scale_type = 'Standard', cols_to_remove = None):

     return TRAIN_DF, TEST_DF, TRAIN_SCALED, TEST_SCALED, TRAIN_Time, TEST_Time, cols, scaler

-# with no window collapsing
-class GenerationDataset(Dataset):
-    def __init__(self, data, window):
-        self.data = torch.Tensor(data)
-        self.window = window
-
-    def __len__(self):
-        return len(self.data) // self.window # -1
-
-    def __getitem__(self, index):
-        x = self.data[index*self.window:(index+1)*(self.window)]
-        return x
-
 # loader with stride
-class NetisDataset(Dataset):
-    def __init__(self, data, timestamps, window_size, stride=1):
+class TimeSeriesDataset(Dataset):
+    def __init__(self, data, timestamps, window_size, stride=1, time_gap=100):
         self.data = torch.from_numpy(np.array(data))
         self.ts = np.array(timestamps)
         self.valid_idxs = []
@@ -100,7 +94,7 @@ def __init__(self, data, timestamps, window_size, stride=1):
             R = L + self.window_size - 1

             # append val indexs
-            if self.ts[R]-self.ts[L] == (self.window_size-1)*100:
+            if self.ts[R]-self.ts[L] == (self.window_size-1)*time_gap:
                 self.valid_idxs.append(L)

         self.valid_idxs = np.array(self.valid_idxs, dtype=np.int32)[::stride]
@@ -116,3 +110,15 @@ def __getitem__(self, index):

         return x.float()

+# with no window collapsing
+class GenerationDataset(Dataset):
+    def __init__(self, data, window):
+        self.data = torch.Tensor(data)
+        self.window = window
+
+    def __len__(self):
+        return len(self.data) // self.window # -1
+
+    def __getitem__(self, index):
+        x = self.data[index*self.window:(index+1)*(self.window)]
+        return x
\ No newline at end of file
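
The main behavioral change in this patch is the `time_gap` parameter of `TimeSeriesDataset`: a window starting at index `L` is kept only when the timestamps at its two ends are exactly `(window_size - 1) * time_gap` apart, so windows that straddle a gap in the recording are skipped. Below is a minimal standalone sketch of that check; the toy timestamps and the loop bounds are illustrative assumptions, not code taken from the repository:

```python
import numpy as np

# toy timestamps sampled every 100 time units, with a gap between 500 and 900 (illustrative)
ts = np.array([100, 200, 300, 400, 500, 900, 1000, 1100, 1200])
window_size = 3
time_gap = 100
stride = 1

valid_idxs = []
for L in range(len(ts) - window_size + 1):
    R = L + window_size - 1
    # keep the window only if it spans exactly (window_size - 1) * time_gap,
    # i.e. it does not straddle a gap in the series
    if ts[R] - ts[L] == (window_size - 1) * time_gap:
        valid_idxs.append(L)

valid_idxs = np.array(valid_idxs, dtype=np.int32)[::stride]
print(valid_idxs)  # [0 1 2 5 6] -- windows crossing the 500 -> 900 gap are dropped
```

With `stride = WINDOW_SIZE`, as used for `train_gen_dataset` and `test_gen_dataset`, the surviving start indices are further subsampled so that consecutive generation windows do not overlap (assuming the valid indices are contiguous).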