diff --git a/notebooks/time_series/apple_stock_prices.livemd b/notebooks/time_series/apple_stock_prices.livemd
new file mode 100644
index 00000000..be2bea90
--- /dev/null
+++ b/notebooks/time_series/apple_stock_prices.livemd
@@ -0,0 +1,312 @@
# Time-series regression with RNNs

```elixir
Mix.install([
  {:axon, "~> 0.6.1"},
  {:kino_vega_lite, "~> 0.1.13"},
  {:nx, "~> 0.7.3"},
  {:tucan, "~> 0.3.1"},
  {:polaris, "~> 0.1.0"},
  {:exla, "~> 0.7.3"},
  {:req, "~> 0.5.2"},
  {:vega_lite, "~> 0.1.9"},
  {:nimble_csv, "~> 1.2"},
  {:scholar, "~> 0.3.1"}
])

Nx.global_default_backend(EXLA.Backend)
Nx.Defn.global_default_options(compiler: EXLA)
```

## Introduction

In this project, we will build a predictor for a time series using a Recurrent Neural Network (RNN) regressor. The prediction will be made for Apple's stock price 7 days in advance, based on historical data.

We will use an RNN architecture known as Long Short-Term Memory (LSTM).

First, we load and preprocess the historical data. We will apply *normalization* to the series: activation functions like `tanh` saturate for inputs of large magnitude (whether positive or negative), which squashes their outputs and makes gradients vanish, so rescaling the series to a small range avoids these numerical issues.

```elixir
# The CSV has a single column, one price per line.
NimbleCSV.define(CSVParser, separator: "\n", escape: "\"")

data =
  "https://raw.githubusercontent.com/santiago-imelio/datasets/main/apple_stock_prices.csv"
  |> Req.get!()
  |> Map.get(:body)
  |> CSVParser.parse_string()
  |> Enum.map(fn [price] ->
    price
    |> Float.parse()
    |> elem(0)
  end)
  |> Nx.tensor()
```

```elixir
min = Nx.reduce_min(data) |> Nx.to_number()
max = Nx.reduce_max(data) |> Nx.to_number()

{series_size} = Nx.shape(data)
days = Nx.iota({series_size})

Tucan.lineplot([Prices: data, Days: days], "Days", "Prices")
|> Tucan.set_size(600, 400)
|> Tucan.Scale.set_y_domain(min, max)
```

## Train-test splitting

Here we split the dataset into train and test sets. We will use two thirds of the data for training and the rest for validation. Since the dataset is an ordered time series, we shouldn't shuffle the data before splitting.

```elixir
train_test_split = 2 / 3

{train, test} = Nx.split(data, train_test_split)
```

## Normalization

We normalize the series to the range [-1, 1] using min-max scaling. It's also common to see applications where standardization (scaling by the mean and standard deviation) is used instead.

```elixir
alias Scholar.Preprocessing.MinMaxScaler
```

We fit a min-max scaler on the training data only, and then transform both train and test sets using the fitted scaler. This prevents information from the test set leaking into training.

```elixir
scaler = MinMaxScaler.fit(train, min_bound: -1, max_bound: 1)
train = MinMaxScaler.transform(scaler, train)
test = MinMaxScaler.transform(scaler, test)
```

Let's visualize the normalized data.

```elixir
norm_data = Nx.concatenate([train, test])
new_min = Nx.reduce_min(norm_data) |> Nx.to_number()
new_max = Nx.reduce_max(norm_data) |> Nx.to_number()

Tucan.lineplot([Prices: norm_data, Days: days], "Days", "Prices")
|> Tucan.set_size(600, 400)
|> Tucan.Scale.set_y_domain(new_min, new_max)
```
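To make the transformation concrete, here is the min-max formula applied by hand to a toy tensor. This is a sketch for intuition only (the numbers are made up); the actual pipeline uses the scaler fitted above.

```elixir
# Min-max scaling to [-1, 1]: x' = (x - min) / (max - min) * 2 - 1.
# Toy tensor for illustration; not part of the pipeline.
x = Nx.tensor([10.0, 15.0, 20.0])
mn = Nx.reduce_min(x)
mx = Nx.reduce_max(x)

x
|> Nx.subtract(mn)
|> Nx.divide(Nx.subtract(mx, mn))
|> Nx.multiply(2)
|> Nx.subtract(1)
#=> [-1.0, 0.0, 1.0]
```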
## Sliding window

Generally, a time series is mathematically represented as:

$$
\langle s_1, s_2, s_3, \ldots, s_P \rangle
$$

where $s_p$ is the numerical value of the series at time interval $p$, and $P$ is the total length of the series.

To apply an RNN, the prediction must be framed as a regression problem. This involves using a sliding window to construct a set of input-output pairs on which regression will be applied.

For example, with a window size $T = 3$, the following pairs are produced:

$$
\begin{array}{c|c}
  \text{Input} & \text{Output}\\
  \hline \langle s_{1},s_{2},s_{3}\rangle & s_{4} \\
  \langle s_{2},s_{3},s_{4} \rangle & s_{5} \\
  \vdots & \vdots \\
  \langle s_{P-3},s_{P-2},s_{P-1} \rangle & s_{P}
\end{array}
$$

In each pair, the input is a sequence of $T$ consecutive values from the series, and the output is the value immediately following that sequence. This prepares the data for training the RNN model for time-series prediction.

```elixir
# Builds input-output pairs {x, y} from `series` with a sliding window:
# x has shape {p - win_size, win_size}; y holds the p - win_size targets.
sliding_window = fn series, win_size ->
  {p} = Nx.shape(series)

  # One window start index per series element (0..p-1), vectorized so we
  # can slice one window per index below.
  x_idx =
    series
    |> Nx.shape()
    |> Nx.iota()
    |> Nx.reshape({:auto, 1})
    |> Nx.vectorize(:elements)

  # Each window predicts the value right after it, so target indices run
  # from win_size onward; keep only the p - win_size valid targets.
  y_idx =
    series
    |> Nx.shape()
    |> Nx.iota()
    |> Nx.add(win_size)
    |> Nx.reshape({:auto, 1})
    |> Nx.slice([0, 0], [p - win_size, 1])

  # Slice one window per start index, then drop the windows that have no
  # target (those starting in the last win_size positions).
  x =
    Nx.slice(series, [x_idx[0]], [win_size])
    |> Nx.devectorize(keep_names: false)
    |> Nx.slice([0, 0], [p - win_size, win_size])

  y = Nx.gather(series, y_idx, axes: [0])

  {x, y}
end
```

Next, we can test our sliding window function on a known series: the first 10 numbers of the Fibonacci sequence, defined by the recurrence $F_{n+1} = F_{n - 1} + F_n$. Passing a window size of 2, we should expect input-output pairs like the following:

$$
\begin{array}{c|c}
  \text{Input} & \text{Output}\\
  \hline \langle F_1, F_2\rangle & F_3 \\
  \langle F_{2},F_{3}\rangle & F_{4} \\
  \vdots & \vdots \\
  \langle F_{8},F_{9} \rangle & F_{10} \\
\end{array}
$$

```elixir
test_series = Nx.tensor([0, 1, 1, 2, 3, 5, 8, 13, 21, 34])
sliding_window.(test_series, 2)
```

Now we apply the sliding window to our dataset with a window size of 7. For simplicity, we truncate the train and test sets to a size divisible by the window length.

```elixir
win_size = 7

{train_size} = Nx.shape(train)
{test_size} = Nx.shape(test)

p_train = train_size - rem(train_size, win_size)
p_test = test_size - rem(test_size, win_size)

train = train[0..(p_train - 1)]
test = test[0..(p_test - 1)]

{x_train, y_train} = sliding_window.(train, win_size)
{x_test, y_test} = sliding_window.(test, win_size)
```

```elixir
dataset_size = p_train + p_test
```

## Building and training our RNN regressor

We will use Axon to build a neural network with two hidden layers:

1. The first layer is an LSTM layer with 5 hidden units. Its input shape is `(window_size, 1)`.
2. The second layer is a fully connected (dense) layer with one unit.

For the loss function we will use mean squared error.

```elixir
batch_size = 6
input_shape = {batch_size, win_size, 1}

model =
  Axon.input("input", shape: input_shape)
  |> Axon.lstm(5)
  |> then(fn {out, _} -> out end)
  |> Axon.nx(fn t -> t[[0..-1//1, -1]] end)
  |> Axon.dense(1)

Axon.Display.as_graph(model, Nx.template(input_shape, :f32))
```
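A note on the `Axon.nx` step: `Axon.lstm/2` returns a tuple of the output sequence and the layer's internal state, which is why we unwrap it with `then/2`. The output sequence has shape `{batch, timesteps, units}`; since we want one prediction per window, the slice `t[[0..-1//1, -1]]` keeps only the last timestep. A toy Nx demonstration of that slice (the tensor is made up):

```elixir
# {batch, timesteps, units} = {2, 3, 4}; the slice keeps axis 0 in full
# (0..-1//1) and takes index -1 (the last timestep) on axis 1.
seq = Nx.iota({2, 3, 4})
last = seq[[0..-1//1, -1]]
Nx.shape(last)
#=> {2, 4}
```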
Before training, we split our train and test data into batches of the specified batch size.

```elixir
y_train_batches = Nx.to_batched(y_train, batch_size)

x_train_batches =
  x_train
  |> Nx.reshape({:auto, win_size, 1})
  |> Nx.to_batched(batch_size)

y_test_batches = Nx.to_batched(y_test, batch_size)

x_test_batches =
  x_test
  |> Nx.reshape({:auto, win_size, 1})
  |> Nx.to_batched(batch_size)

train_data = Stream.zip([x_train_batches, y_train_batches])
test_data = Stream.zip([x_test_batches, y_test_batches])
```

We are ready to create our training loop and run it. Since our network is simple, we choose SGD as the optimizer.

```elixir
optimizer = Polaris.Optimizers.sgd(learning_rate: 0.04)
loop = Axon.Loop.trainer(model, :mean_squared_error, optimizer)

trained_model_state = Axon.Loop.run(loop, train_data, %{}, epochs: 100)
```

Finally, we make predictions over the train and test sets so we can evaluate our model.

```elixir
y_pred_train =
  x_train_batches
  |> Enum.map(&Axon.predict(model, trained_model_state, &1))
  |> Nx.concatenate()

y_pred_test =
  x_test_batches
  |> Enum.map(&Axon.predict(model, trained_model_state, &1))
  |> Nx.concatenate()
```

```elixir
y_train
|> Axon.Losses.mean_squared_error(y_pred_train, reduction: :mean)
|> Nx.to_number()
|> IO.inspect(label: "MSE train")

y_test
|> Axon.Losses.mean_squared_error(y_pred_test, reduction: :mean)
|> Nx.to_number()
|> IO.inspect(label: "MSE test")

:ok
```

## Visualizing predictions

Let's visualize the predictions against the real trend of the prices. First we build the plot data required for each layer.

```elixir
interval = Nx.iota({dataset_size})

plt_data = [
  Prices: norm_data[0..(dataset_size - 1)],
  Day: interval[0..(dataset_size - 1)],
  Eval: List.duplicate("Real prices", dataset_size)
]

{train_size} = Nx.shape(y_train)
{test_size} = Nx.shape(y_test)

# The i-th train prediction targets series position i + win_size.
plt_data_train = [
  Prices: Nx.to_flat_list(y_pred_train),
  Day: interval[win_size..(train_size + win_size - 1)],
  Eval: List.duplicate("Predictions train", train_size)
]

plt_data_test = [
  Prices: Nx.to_flat_list(y_pred_test),
  Day: interval[(dataset_size - test_size)..(dataset_size - 1)],
  Eval: List.duplicate("Predictions test", test_size)
]

opts = [stroke_width: 2]
```

```elixir
Tucan.layers([
  Tucan.lineplot(plt_data, "Day", "Prices", Keyword.put(opts, :stroke_dash, [2])),
  Tucan.lineplot(plt_data_train, "Day", "Prices", opts),
  Tucan.lineplot(plt_data_test, "Day", "Prices", opts)
])
|> Tucan.color_by("Eval")
|> Tucan.set_size(600, 400)
|> Tucan.Scale.set_x_domain(0, dataset_size - 1)
|> VegaLite.encode(:color, field: "Eval", scale: [scheme: "set1"])
```
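Since the model predicts in the normalized [-1, 1] space, we may want prices back in dollars. Below is a minimal sketch that inverts the min-max formula by hand, assuming we can recompute the scaler's statistics from the same 2/3 split of the raw `data` tensor used earlier; the `denormalize` helper is illustrative, not a Scholar API.

```elixir
# Invert x' = (x - mn) / (mx - mn) * 2 - 1  =>  x = (x' + 1) / 2 * (mx - mn) + mn.
# `data` is still the raw price tensor, so we recompute the train statistics
# the scaler was fitted on (assumes the same 2/3 split as above).
{raw_train, _raw_test} = Nx.split(data, 2 / 3)
mn = Nx.reduce_min(raw_train)
mx = Nx.reduce_max(raw_train)

denormalize = fn t ->
  t
  |> Nx.add(1)
  |> Nx.divide(2)
  |> Nx.multiply(Nx.subtract(mx, mn))
  |> Nx.add(mn)
end

denormalize.(y_pred_test)
```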