diff --git a/baseline_transformer.py b/baseline_transformer.py
new file mode 100644
index 0000000..eace6bd
--- /dev/null
+++ b/baseline_transformer.py
@@ -0,0 +1,235 @@
+# -*- coding: utf-8 -*-
+import sys
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from tensorflow import keras
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import mean_absolute_percentage_error
+from sklearn.metrics import mean_squared_error
+
+from model import Transformer  # custom Transformer defined in model.py
+
+# target energy type (one of 'elec', 'water', 'gas', 'hotwater', 'hot'), passed on the command line
+ENERGY = sys.argv[1]
+
+print('baseline_{}'.format(ENERGY))
+for num in range(0, 5):  # repeat the experiment five times
+    print('{}_running'.format(num))
+
+    # data load: keep the five energy streams as features, the chosen stream as the target
+    def preprocessing(data):
+        feature = data.copy()[['time', 'elec', 'water', 'gas', 'hotwater', 'hot']]
+        feature.time = pd.to_datetime(feature.time)
+        feature.set_index('time', inplace=True)
+        y = data[[ENERGY]]
+        return feature, y
+
+    train = pd.read_csv('train_summer.csv')
+    valid = pd.read_csv('valid_summer.csv')
+    test = pd.read_csv('test_summer.csv')
+
+    X_train, y_train = preprocessing(train)
+    X_valid, y_valid = preprocessing(valid)
+    X_test, y_test = preprocessing(test)
+
+    # scale with statistics fitted on the training split only
+    scaler = MinMaxScaler()
+    X_train = pd.DataFrame(scaler.fit_transform(X_train))
+    X_valid = pd.DataFrame(scaler.transform(X_valid))
+    X_test = pd.DataFrame(scaler.transform(X_test))
+
+    # make time windows: window_size past steps -> target_size future steps
+    def timeseries_data(dataset, target, start_index, end_index, window_size, target_size):
+        data = []
+        labels = []
+
+        y_start_index = start_index + window_size  # first index with a full history window
+        y_end_index = end_index - target_size      # last index with a full target horizon
+
+        for i in range(y_start_index, y_end_index):
+            data.append(dataset.iloc[i - window_size:i, :].values)
+            labels.append(target.iloc[i:i + target_size, :].values)
+        data = np.array(data)
+        labels = np.array(labels)
+        labels = labels.reshape(-1, target_size)
+        return data, labels
+
+    window = 72
+    X_train, y_train = timeseries_data(X_train, y_train, 0, len(X_train), window, 24)
+    X_valid, y_valid = timeseries_data(X_valid, y_valid, 0, len(X_valid), window, 24)
+    X_test, y_test = timeseries_data(X_test, y_test, 0, len(X_test), window, 24)
+
+    batch_size = 32
+    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(X_train.shape[0], seed=42).batch(batch_size, drop_remainder=True).prefetch(1)
+    valid_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(batch_size, drop_remainder=True).prefetch(1)
+    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size, drop_remainder=True).prefetch(1)
+
+    # model architecture: one Transformer per energy stream, fused by dense layers
+    class AttentionModel(keras.Model):
+
+        def __init__(self, name="attentionmodel"):
+            super(AttentionModel, self).__init__(name=name)
+            self.encoder1 = Transformer()
+            self.encoder2 = Transformer()
+            self.encoder3 = Transformer()
+            self.encoder4 = Transformer()
+            self.encoder5 = Transformer()
+            self.flatten = keras.layers.Flatten()
+            self.hidden1 = keras.layers.Dense(512, kernel_initializer='he_normal', activation='relu')
+            self.hidden2 = keras.layers.Dense(256, kernel_initializer='he_normal', activation='relu')
+            self.output_ = keras.layers.Dense(24, kernel_initializer='he_normal')
+
+        def call(self, input1, input2, input3, input4, input5, target, training):
+            out1 = self.encoder1(tf.expand_dims(input1, 2), tf.expand_dims(target, 2), training)
+            out2 = self.encoder2(tf.expand_dims(input2, 2), tf.expand_dims(target, 2), training)
+            out3 = self.encoder3(tf.expand_dims(input3, 2), tf.expand_dims(target, 2), training)
+            out4 = self.encoder4(tf.expand_dims(input4, 2), tf.expand_dims(target, 2), training)
+            out5 = self.encoder5(tf.expand_dims(input5, 2), tf.expand_dims(target, 2), training)
+            out1 = tf.expand_dims(out1, 1)
+            out2 = tf.expand_dims(out2, 1)
+            out3 = tf.expand_dims(out3, 1)
+            out4 = tf.expand_dims(out4, 1)
+            out5 = tf.expand_dims(out5, 1)
+            concat_energy = tf.concat([out1, out2, out3, out4, out5], axis=1)
+            flatten = self.flatten(concat_energy)
+            hidden1 = self.hidden1(flatten)
+            hidden2 = self.hidden2(hidden1)
+            output = self.output_(hidden2)
+
+            return output
+
+    # save best model
+    class EarlyStopping:
+        """Early stops the training if validation loss doesn't improve after a given patience."""
+        def __init__(self, patience=30, verbose=False, delta=0):
+            """
+            Args:
+                patience (int): How long to wait after last time validation loss improved.
+                                Default: 30
+                verbose (bool): If True, prints a message for each validation loss improvement.
+                                Default: False
+                delta (float): Minimum change in the monitored quantity to qualify as an improvement.
+                               Default: 0
+            """
+            self.patience = patience
+            self.verbose = verbose
+            self.counter = 0
+            self.best_score = None
+            self.early_stop = False
+            self.val_loss_min = np.inf
+            self.delta = delta
+
+        def __call__(self, val_loss, model):
+
+            score = -val_loss
+
+            if self.best_score is None:
+                self.best_score = score
+                self.save_checkpoint(val_loss, model)
+            elif score < self.best_score + self.delta:
+                self.counter += 1
+                # print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
+                if self.counter >= self.patience:
+                    self.early_stop = True
+            else:
+                self.best_score = score
+                self.save_checkpoint(val_loss, model)
+                self.counter = 0
+
+            return self.early_stop
+
+        def save_checkpoint(self, val_loss, model):
+            '''Saves model when validation loss decreases.'''
+            print(f'loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).')
+            model.save_weights('./checkpoints/my_checkpoint')
+            self.val_loss_min = val_loss
+
+    # learning rate scheduler: linear warmup followed by inverse square-root decay
+    class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+        def __init__(self, d_model, warmup_steps=4000):
+            super(CustomSchedule, self).__init__()
+            self.d_model = tf.cast(d_model, tf.float32)
+            self.warmup_steps = warmup_steps
+
+        def __call__(self, step):
+            step = tf.cast(step, tf.float32)
+            arg1 = tf.math.rsqrt(step)
+            arg2 = step * (self.warmup_steps ** -1.5)
+            return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
+
+    # training hyperparameters
+    model = AttentionModel()
+    n_epochs = 500
+    learning_rate = CustomSchedule(d_model=32)
+    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
+    loss_fn_1 = keras.losses.MeanSquaredError()
+    loss_fn_test = keras.losses.MeanSquaredError()
+    metric = keras.metrics.MeanAbsoluteError()
+    early = EarlyStopping()
+
+    # training loop
+    print('training start')
+    out1_df = []
+    concat_df = []
+    result_df = pd.DataFrame({'MAPE': [], 'MAE': [], 'RMSE': []})
+    loss_df = []
+    loss_test = []
+    val_loss = []
+    for epoch in range(n_epochs):
+        loss_batch = 0
+        for batch, (features, label) in train_ds.enumerate():
+            with tf.GradientTape() as tape:
+                y_pred = model(features[:, :, 0], features[:, :, 1], features[:, :, 2],
+                               features[:, :, 3], features[:, :, 4], label, training=True)
+                loss = loss_fn_1(label, y_pred)
+            gradients = tape.gradient(loss, model.trainable_variables)
+            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
+            loss_batch += loss
+        loss_df.append(loss_batch)
+
+        # validation: accumulate MAE over the validation batches
+        val_metric = 0
+        sk_metric = 0
+        for batch, (features, label) in valid_ds.enumerate():
+            y_pred = model(features[:, :, 0], features[:, :, 1], features[:, :, 2], features[:, :, 3], features[:, :, 4],
+                           label, training=False)
+            error_mae = mean_absolute_error(label, y_pred)
+            sk_metric += error_mae
+        print('sk', sk_metric, 'epoch', epoch)
+        val_loss.append(sk_metric)
+        stop = early(sk_metric, model)  # checkpoints the weights whenever the validation MAE improves
+        if stop:
+            print(stop)
+            break
+
+    # test model performance with the best (checkpointed) weights
+    model.load_weights('./checkpoints/my_checkpoint')
+    predict_df = []
+    real_df = []
+    loss_ = 0
+    for batch, (features, label) in test_ds.enumerate():
+        y_pred = model(features[:, :, 0], features[:, :, 1], features[:, :, 2], features[:, :, 3], features[:, :, 4],
+                       label, training=False)
+        loss_ += loss_fn_test(label, y_pred)
+        predict_df.append(y_pred)
+        real_df.append(label)
+    loss_test.append(loss_)
+
+    real_new = [a.numpy() for b in real_df for a in b]
+    predict_new = [a.numpy().round(1) for b in predict_df for a in b]
+
+    error_mape = mean_absolute_percentage_error(real_new, predict_new)
+    error_mae = mean_absolute_error(real_new, predict_new)
+    error_rmse = mean_squared_error(real_new, predict_new) ** 0.5
+    result = pd.DataFrame({'MAPE': [error_mape], 'MAE': [error_mae], 'RMSE': [error_rmse]})
+    result_df = pd.concat([result_df, result], axis=0)
+    print(result_df)
+
+    # note: saved with pandas.to_pickle even though the file names end in .csv
+    result_df.to_pickle('./base_{}/result_{}.csv'.format(ENERGY, num))
+    loss_df = [i.numpy() for i in loss_df]
+    pd.DataFrame(loss_df).to_pickle('./base_{}/loss_{}.csv'.format(ENERGY, num))
+    loss_test = [i.numpy() for i in loss_test]
+    pd.DataFrame(loss_test).to_pickle('./base_{}/loss_test_{}.csv'.format(ENERGY, num))
+    pd.DataFrame(val_loss).to_pickle('./base_{}/loss_val_{}.csv'.format(ENERGY, num))
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..a566f7e
--- /dev/null
+++ b/model.py
@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+
+
+class Transformer(tf.keras.Model):
+    def __init__(self, num_layers=2, d_model=32, num_heads=4, dff=256, input_vocab_size=1,
+                 target_vocab_size=1, pe_input=72, pe_target=24, rate=0.1):
+        super().__init__()
+        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
+                               input_vocab_size, pe_input, rate)
+
+        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
+                               target_vocab_size, pe_target, rate)
+
+        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
+
+    def call(self, inp, tar, training):
+        # Keras models prefer if you pass all your inputs in the first argument
+
+        look_ahead_mask = self.create_masks(tar)
+
+        enc_output = self.encoder(inp, training)  # (batch_size, inp_seq_len, d_model)
+
+        # dec_output.shape == (batch_size, tar_seq_len, d_model)
+        dec_output = self.decoder(
+            tar, enc_output, training, look_ahead_mask)
+
+        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
+
+        return tf.squeeze(final_output)  # , attention_weights
+
+    def create_masks(self, tar):
+        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])  # seq_len
+        return look_ahead_mask
+
+
+class Encoder(tf.keras.layers.Layer):
+    def __init__(self, num_layers, d_model, num_heads, d_ff, input_vocab_size=1,
+                 maximum_position_encoding=72, rate=0.1):
+        super(Encoder, self).__init__()
+
+        self.d_model = d_model
+        self.num_layers = num_layers
+
+        # the inputs are continuous values, so a Dense projection replaces the token embedding
+        # self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
+        self.embedding = tf.keras.layers.Dense(d_model)
+        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
+
+        self.enc_layers = [EncoderLayer(d_model, num_heads, d_ff, rate)
+                           for _ in range(num_layers)]
+
+        self.dropout = tf.keras.layers.Dropout(rate)
+
+    def call(self, x, training):
+        seq_len = tf.shape(x)[1]  # x: (batch_size, input_seq_len, 1)
+
+        # adding embedding and position encoding.
+        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
+        # x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+        x += self.pos_encoding[:, :seq_len, :]
+        x = self.dropout(x, training=training)
+        for i in range(self.num_layers):
+            x = self.enc_layers[i](x, training)
+
+        return x  # (batch_size, input_seq_len, d_model)
+
+
+class Decoder(tf.keras.layers.Layer):
+    def __init__(self, num_layers, d_model, num_heads, d_ff, target_vocab_size=1,
+                 maximum_position_encoding=24, rate=0.1):
+        super(Decoder, self).__init__()
+
+        self.d_model = d_model
+        self.num_layers = num_layers
+
+        # self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
+        self.embedding = tf.keras.layers.Dense(d_model)
+        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
+
+        self.dec_layers = [DecoderLayer(d_model, num_heads, d_ff, rate)
+                           for _ in range(num_layers)]
+        self.dropout = tf.keras.layers.Dropout(rate)
+
+    def call(self, x, enc_output, training,
+             look_ahead_mask):
+
+        seq_len = tf.shape(x)[1]
+        attention_weights = {}
+
+        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
+        # x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+        x += self.pos_encoding[:, :seq_len, :]
+
+        x = self.dropout(x, training=training)
+
+        for i in range(self.num_layers):
+            x = self.dec_layers[i](x, enc_output, training, look_ahead_mask)
+
+            # attention_weights[f'decoder_layer{i+1}_block1'] = block1
+            # attention_weights[f'decoder_layer{i+1}_block2'] = block2
+
+        # x.shape == (batch_size, target_seq_len, d_model)
+        return x
+
+
+class EncoderLayer(tf.keras.layers.Layer):
+    def __init__(self, d_model, num_heads, d_ff, rate=0.1):
+        super(EncoderLayer, self).__init__()
+
+        self.mha = MultiHeadAttention(d_model, num_heads)
+        self.ffn = PositionWiseFeedForward(d_model, d_ff)
+
+        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+        self.dropout1 = tf.keras.layers.Dropout(rate)
+        self.dropout2 = tf.keras.layers.Dropout(rate)
+
+    def call(self, x, training):
+        attn_output = self.mha(x, x, x)  # (batch_size, input_seq_len, d_model)
+        attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
+
+        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
+
+        return out2
+
+
+class DecoderLayer(tf.keras.layers.Layer):
+    def __init__(self, d_model, num_heads, dff, rate=0.1):
+        super(DecoderLayer, self).__init__()
+
+        self.mha1 = MultiHeadAttention(d_model, num_heads)  # masked self-attention (look-ahead mask)
+        self.mha2 = MultiHeadAttention(d_model, num_heads)  # cross-attention over the encoder output
+
+        self.ffn = PositionWiseFeedForward(d_model, dff)
+
+        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+        self.dropout1 = tf.keras.layers.Dropout(rate)
+        self.dropout2 = tf.keras.layers.Dropout(rate)
+        self.dropout3 = tf.keras.layers.Dropout(rate)
+
+    def call(self, x, enc_output, training, look_ahead_mask):
+        # enc_output.shape == (batch_size, input_seq_len, d_model)
+
+        attn1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
+        attn1 = self.dropout1(attn1, training=training)
+        out1 = self.layernorm1(attn1 + x)
+
+        # cross-attention: queries come from the decoder (target_seq_len), keys/values from the
+        # encoder output (input_seq_len); the differing lengths are expected and are resolved
+        # inside the attention weights
+        attn2 = self.mha2(
+            enc_output, enc_output, out1)  # (batch_size, target_seq_len, d_model)
+        attn2 = self.dropout2(attn2, training=training)
+        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
+
+        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
+        ffn_output = self.dropout3(ffn_output, training=training)
+        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
+
+        return out3
+
+
+class PositionWiseFeedForward(tf.keras.layers.Layer):
+    def __init__(self, d_model, d_ff):
+        super(PositionWiseFeedForward, self).__init__()
+        self.dense_1 = tf.keras.layers.Dense(d_ff, activation='relu')
+        self.dense_2 = tf.keras.layers.Dense(d_model)
+
+    def call(self, inputs):
+        inputs = self.dense_1(inputs)
+        output = self.dense_2(inputs)
+        return output
+
+
+class MultiHeadAttention(tf.keras.layers.Layer):
+
+    def __init__(self, d_model, num_heads, name="multi_head_attention"):
+        super(MultiHeadAttention, self).__init__(name=name)
+        self.num_heads = num_heads
+        self.d_model = d_model
+
+        assert d_model % self.num_heads == 0
+
+        # depth of each head: d_model divided by num_heads (64 in the original paper)
+        self.depth = d_model // self.num_heads
+
+        # dense layers corresponding to WQ, WK, WV
+        self.query_dense = tf.keras.layers.Dense(units=d_model)
+        self.key_dense = tf.keras.layers.Dense(units=d_model)
+        self.value_dense = tf.keras.layers.Dense(units=d_model)
+
+        # dense layer corresponding to WO
+        self.dense = tf.keras.layers.Dense(units=d_model)
+
+    # split q, k, v into num_heads heads
+    def split_heads(self, inputs, batch_size):
+        inputs = tf.reshape(
+            inputs, shape=(batch_size, -1, self.num_heads, self.depth))
+        return tf.transpose(inputs, perm=[0, 2, 1, 3])
+
+    def call(self, key, value, query, mask=None):
+        batch_size = tf.shape(query)[0]
+        # query/key/value: (batch_size, seq_len, d_model)
+        # 1. pass through the WQ, WK, WV dense layers
+        query = self.query_dense(query)
+        key = self.key_dense(key)
+        value = self.value_dense(value)
+
+        # 2. split into heads
+        query = self.split_heads(query, batch_size)
+        key = self.split_heads(key, batch_size)
+        value = self.split_heads(value, batch_size)
+
+        # 3. scaled dot-product attention (see scaled_dot_product_attention below)
+        # (batch_size, num_heads, query seq_len, d_model/num_heads)
+        # the attention weights could be returned if needed; they are discarded here
+        scaled_attention, _ = scaled_dot_product_attention(query, key, value, mask)
+        # (batch_size, query seq_len, num_heads, d_model/num_heads)
+        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
+
+        # 4. concatenate the heads
+        # (batch_size, query seq_len, d_model)
+        concat_attention = tf.reshape(scaled_attention,
+                                      (batch_size, -1, self.d_model))
+
+        # 5. pass through the WO dense layer
+        # (batch_size, query seq_len, d_model)
+        outputs = self.dense(concat_attention)
+
+        return outputs  # the attention weights could also be returned if needed
+
+
+def scaled_dot_product_attention(query, key, value, mask):
+    # query: (batch_size, num_heads, query seq_len, d_model/num_heads)
+    # key:   (batch_size, num_heads, key seq_len, d_model/num_heads)
+    # value: (batch_size, num_heads, value seq_len, d_model/num_heads)
+    # mask:  broadcastable to (..., query seq_len, key seq_len), e.g. the look-ahead mask
+    matmul_qk = tf.matmul(query, key, transpose_b=True)
+    depth = tf.cast(tf.shape(key)[-1], tf.float32)
+    logits = matmul_qk / tf.math.sqrt(depth)
+
+    # masking: push masked positions towards -inf before the softmax
+    if mask is not None:
+        logits += (mask * -1e9)
+
+    attention_weights = tf.nn.softmax(logits, axis=-1)
+    output = tf.matmul(attention_weights, value)
+
+    return output, attention_weights
+
+
+def create_look_ahead_mask(size):
+    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+    return mask  # (seq_len, seq_len)
+
+
+def get_angles(pos, i, d_model):
+    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
+    return pos * angle_rates
+
+
+def positional_encoding(position, d_model):
+    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
+                            np.arange(d_model)[np.newaxis, :],
+                            d_model)
+
+    # apply sin to even indices in the array; 2i
+    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+
+    # apply cos to odd indices in the array; 2i+1
+    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+
+    pos_encoding = angle_rads[np.newaxis, ...]
+
+    return tf.cast(pos_encoding, dtype=tf.float32)
+
+
+# alternative layer-based positional encoding, kept for reference
+# class PositionalEncoding(tf.keras.layers.Layer):
+#     def __init__(self, position, d_model):
+#         # model hyper parameter variables
+#         super(PositionalEncoding, self).__init__()
+#         self.pos_encoding = self.positional_encoding(position, d_model)
+
+#     def call(self, inputs):
+#         output = inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]
+#         return output
+
+#     def positional_encoding(self, position, d_model):
+#         angle_rads = self.get_angles(
+#             position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
+#             i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
+#             d_model=d_model)
+
+#         sines = tf.math.sin(angle_rads[:, 0::2])
+#         cosines = tf.math.cos(angle_rads[:, 1::2])
+
+#         angle_rads = np.zeros(angle_rads.shape)
+#         angle_rads[:, 0::2] = sines
+#         angle_rads[:, 1::2] = cosines
+#         pos_encoding = tf.constant(angle_rads)
+#         pos_encoding = pos_encoding[tf.newaxis, ...]
+
+#         return tf.cast(pos_encoding, tf.float32)
+
+# def get_angles(pos, i, d_model):
+#     angle_rates = 1 / tf.power(10000, (2 * (i//2)) / tf.float32(d_model))
+#     return pos * angle_rates