-
Notifications
You must be signed in to change notification settings - Fork 2
/
train.py
124 lines (87 loc) · 4.21 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
import os ,datetime
batch_size = 32   # samples per gradient update in model.fit below
seq_len = 128     # rows per input window fed to the model
d_k = 256         # attention key dim — presumably read by model.create_model(); not used in this file — TODO confirm
d_v = 256         # attention value dim — presumably read by model.create_model() — TODO confirm
n_heads = 12      # number of attention heads — presumably read by model.create_model() — TODO confirm
ff_dim = 256      # feed-forward width — presumably read by model.create_model() — TODO confirm
" data download https://finance.yahoo.com/quote/IBM/history?period1=950400&period2=1594512000&interval=1d&filter=history&frequency=1d"
df = pd.read_csv('../input/IBM.csv', delimiter=',', usecols=['Date', 'Open', 'High', 'Low', 'Close', 'Volume'])
# Replace 0 to avoid dividing by 0 later on
df['Volume'].replace(to_replace=0, method='ffill', inplace=True)
df.sort_values('Date', inplace=True)
# Apply moving average with a window of 10 days to all columns
df[['Open', 'High', 'Low', 'Close', 'Volume']] = df[['Open', 'High', 'Low', 'Close', 'Volume']].rolling(10).mean()
# Drop all rows with NaN values
df.dropna(how='any', axis=0, inplace=True)
'''Calculate percentage change'''
df['Open'] = df['Open'].pct_change() # Create arithmetic returns column
df['High'] = df['High'].pct_change() # Create arithmetic returns column
df['Low'] = df['Low'].pct_change() # Create arithmetic returns column
df['Close'] = df['Close'].pct_change() # Create arithmetic returns column
df['Volume'] = df['Volume'].pct_change()
df.dropna(how='any', axis=0, inplace=True) # Drop all rows with NaN values
'''Normalize price columns'''
min_return = min(df[['Open', 'High', 'Low', 'Close']].min(axis=0))
max_return = max(df[['Open', 'High', 'Low', 'Close']].max(axis=0))
# Min-max normalize price columns (0-1 range)
df['Open'] = (df['Open'] - min_return) / (max_return - min_return)
df['High'] = (df['High'] - min_return) / (max_return - min_return)
df['Low'] = (df['Low'] - min_return) / (max_return - min_return)
df['Close'] = (df['Close'] - min_return) / (max_return - min_return)
'''Normalize volume column'''
min_volume = df['Volume'].min(axis=0)
max_volume = df['Volume'].max(axis=0)
# Min-max normalize volume columns (0-1 range)
df['Volume'] = (df['Volume'] - min_volume) / (max_volume - min_volume)
'''Create chronological training (80%) / validation (10%) / test (10%) splits.'''
# NOTE(review): the split is on index LABELS, not positions.  This matches
# the original behaviour and is only a true chronological split if the CSV
# rows were already date-ordered (monotonic index after sort_values) — confirm.
times = sorted(df.index.values)          # sorted once, reused for both cut points
last_10pct = times[-int(0.1 * len(times))]  # label at the 90% boundary
last_20pct = times[-int(0.2 * len(times))]  # label at the 80% boundary

# .copy() so the Date drop below mutates real frames, not views of df
# (the original raised SettingWithCopyWarning and could silently no-op).
df_train = df[df.index < last_20pct].copy()
df_val = df[(df.index >= last_20pct) & (df.index < last_10pct)].copy()
df_test = df[df.index >= last_10pct].copy()

# The Date column is not a model feature.
for split in (df_train, df_val, df_test):
    split.drop(columns=['Date'], inplace=True)

# Plain numpy arrays; columns are [Open, High, Low, Close, Volume].
train_data = df_train.values
val_data = df_val.values
test_data = df_test.values
def make_sequences(data, window=seq_len, target_col=3):
    """Slice a feature matrix into overlapping windows and next-step targets.

    Parameters
    ----------
    data : np.ndarray, shape (n_rows, n_features)
        Chronologically ordered feature matrix.
    window : int
        Number of consecutive rows per input sample (default: ``seq_len``).
    target_col : int
        Column index of the prediction target (3 = Close return).

    Returns
    -------
    X : np.ndarray, shape (n_rows - window, window, n_features)
    y : np.ndarray, shape (n_rows - window,)
        ``y[i]`` is the target value of the row immediately AFTER ``X[i]``,
        i.e. the model predicts the next step from the preceding window.
    """
    X, y = [], []
    for end in range(window, len(data)):
        X.append(data[end - window:end])   # rows end-window .. end-1
        y.append(data[end, target_col])    # direct scalar index — the original
                                           # sliced the whole column each pass
    return np.array(X), np.array(y)

# One shared helper replaces the three copy-pasted loops of the original.
X_train, y_train = make_sequences(train_data)
X_val, y_val = make_sequences(val_data)
X_test, y_test = make_sequences(test_data)
# Local project module that builds the network; import under an alias so
# the module name is not shadowed by the model instance assigned below
# (the original did `import model; model = model.create_model()`, after
# which the module was unreachable).
import model as model_builder

model = model_builder.create_model()

# Checkpoint that keeps only the weights achieving the best validation loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'Transformer+TimeEmbedding_avg.hdf5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1)

history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=35,
                    callbacks=[checkpoint],
                    validation_data=(X_val, y_val))