From 4f6ebe901416a8316039ed7bddfa410df50442c6 Mon Sep 17 00:00:00 2001 From: Vishal Date: Thu, 30 May 2024 20:59:57 +0200 Subject: [PATCH] feat: add working train and predict recipes --- src/dataset.py | 61 +++++++++++++++++++++++++++++++---- src/models.py | 58 ++++++++++++++++++++++++++++++++++ src/predict.py | 44 ++++++++++++++++++++++++++ src/train.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/utils.py | 13 ++++++++ 5 files changed, 255 insertions(+), 7 deletions(-) create mode 100644 src/predict.py create mode 100644 src/train.py diff --git a/src/dataset.py b/src/dataset.py index 9caa07c..a6450e4 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,11 +1,13 @@ +from abc import ABC, abstractmethod +import random from itertools import groupby from operator import itemgetter from typing import List, Tuple import numpy as np import pandas as pd -from abc import ABC, abstractmethod -import random +import joblib +from sklearn.preprocessing import StandardScaler class Dataset: @@ -54,7 +56,28 @@ def split_data(self): return train, test, val -class TimeSeriesScaler: +class DataScaler(ABC): + def __init__(self) -> None: + super().__init__() + + @abstractmethod + def scaler_fit(self): + pass + + @abstractmethod + def scaler_transform(self): + pass + + @abstractmethod + def save_scaler(self): + pass + + @abstractmethod + def load_scaler(self): + pass + + +class TimeSeriesScaler(DataScaler): def __init__( self, @@ -69,6 +92,7 @@ def __init__( self.other_columns = other_columns self.original_target_column = original_target_column self.duplicated_target_column = duplicated_target_column + super().__init__() def copy_target_column(self, _df): _df.loc[_df.index, self.duplicated_target_column] = _df[ @@ -76,8 +100,12 @@ def copy_target_column(self, _df): ] return _df - def scaler_fit(self, scaler, X): - self.scaler = scaler.fit(X[self.continous_features].values) + def scaler_fit(self, scaler_type, X): + if scaler_type == "minmax": + self.scaler = StandardScaler() + else: + raise NotImplementedError + self.scaler.fit(X[self.continous_features].values) return self.scaler def scaler_transform(self, X): @@ -93,6 +121,13 @@ def scaler_transform(self, X): ] return scaled_features_df + def save_scaler(self, path): + joblib.dump(self.scaler, path) + + def load_scaler(self, path): + scaler = joblib.load(path) + self.scaler = scaler + class TimeSeriesFormatter: @@ -112,6 +147,13 @@ def __init__( self.auto_regressive = auto_regressive self.inference = inference + @staticmethod + def reshape_x(X, W=None, use_static=False): + X_reshaped = np.reshape(X, (X.shape[0], -1)) + if use_static: + X_reshaped = np.hstack((W, X_reshaped)) + return X_reshaped + def split_sequences( self, sequences: np.ndarray, @@ -175,6 +217,11 @@ def format_data(self, df): y_list.extend(y) z_list.extend(z) if self.inference: - return W_list, X_list, z_list + return np.array(W_list), np.array(X_list), np.array(z_list) else: - return W_list, X_list, y_list, z_list + return ( + np.array(W_list), + np.array(X_list), + np.array(y_list), + np.array(z_list), + ) diff --git a/src/models.py b/src/models.py index 2a8fbd0..e5bf75e 100644 --- a/src/models.py +++ b/src/models.py @@ -1,7 +1,13 @@ from __future__ import annotations from abc import ABC, abstractmethod +from typing import Any + +import numpy as np +import pickle from sklearn.neighbors import KNeighborsRegressor +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import cross_val_score class Model(ABC): @@ -14,6 +20,18 @@ def train_model(self): def predict_model(self): pass + @abstractmethod + def cross_validation(self): + pass + + @abstractmethod + def save_model(self): + pass + + @abstractmethod + def load_model(self): + pass + class KNNModel(Model): @@ -26,3 +44,43 @@ def train_model(self, X, y): def predict_model(self, X): return self.model.predict(X) + + def cross_validation(self, X, y, lower_k, upper_k): + k_values = list(range(lower_k, upper_k)) + cv_scores = [ + np.mean( + cross_val_score( + KNeighborsRegressor( + n_neighbors=k, weights="distance", algorithm="auto", p=2 + ), + X, + y, + cv=5, + verbose=3, + scoring="neg_root_mean_squared_error", + ) + ) + for k in k_values + ] + optimal_k = k_values[np.argmax(cv_scores)] + return optimal_k + + def save_model(self, path): + saved_model = open(path, "wb") + pickle.dump(self.model, saved_model) + saved_model.close() + + def load_model(self, path): + self.model = pickle.load(open(path, "rb")) + + +class EvaluationMetrics: + def __init__(self, y, y_hat) -> None: + self.y = y + self.y_hat = y_hat + + def mse(self): + return mean_squared_error(self.y, self.y_hat) + + def rmse(self): + return np.sqrt(self.mse()) diff --git a/src/predict.py b/src/predict.py new file mode 100644 index 0000000..ca97199 --- /dev/null +++ b/src/predict.py @@ -0,0 +1,44 @@ +from pathlib import Path +from config_model import * +from models import KNNModel +from dataset import DataSplitter, TimeSeriesScaler, TimeSeriesFormatter +from utils import setup_logging, predicitons_to_df + +inference_date = INFERENCE_INPUT_DATE +inference_file = file_processed_input + +lb, ph = (MODEL_PARAMS["lb"], MODEL_PARAMS["ph"]) +# Set up logging +logging = setup_logging("predict.log") + +if __name__ == "__main__": + + data_object = DataSplitter(inference_file) + X_formatted = data_object.df + + time_series_object = TimeSeriesScaler( + continous_features, + categorical_features, + other_columns, + target_as_autoregressive_feature, + target_column, + ) + scaler = time_series_object.load_scaler("artifacts/minmax_scaler.gz") + scaled_test = time_series_object.scaler_transform(X_formatted) + + series_formatter_obj = TimeSeriesFormatter( + lb, ph, static_features, dynamic_features, True, True + ) + + W_test, X_test, z_test = series_formatter_obj.format_data(scaled_test) + X_test = TimeSeriesFormatter.reshape_x(X_test) + + traffic_model = KNNModel() + traffic_model.load_model("artifacts/knn_model") + + y_test_hat = traffic_model.predict_model(X_test) + + df_test = predicitons_to_df(ph, z_test, y_test_hat) + df_test.to_csv( + Path("..") / "predictions" / f"knn_{INFERENCE_PREDICTION_DATE}.csv", index=False + ) diff --git a/src/train.py b/src/train.py new file mode 100644 index 0000000..5b3537a --- /dev/null +++ b/src/train.py @@ -0,0 +1,86 @@ +from config_model import * +from utils import setup_logging +from models import KNNModel, EvaluationMetrics +from dataset import DataSplitter, TimeSeriesScaler, TimeSeriesFormatter + +# Set up logging +logging = setup_logging("train.log") + + +if __name__ == "__main__": + + data_object = DataSplitter(data_path) + X_formatted = data_object.df + + for lb, ph in [(MODEL_PARAMS["lb"], MODEL_PARAMS["ph"])]: + det_ids = data_object.get_groups + + seed = CONFIG["seed"] + validation_prop = CONFIG["validation_proportion"] + test_prop = CONFIG["test_proportion"] + + data_object.split_groups(seed, validation_prop, test_prop) + + X_formatted_train, X_formatted_val, X_formatted_test = data_object.split_data() + + time_series_object = TimeSeriesScaler( + continous_features, + categorical_features, + other_columns, + target_as_autoregressive_feature, + target_column, + ) + + (X_formatted_train, X_formatted_val, X_formatted_test) = [ + time_series_object.copy_target_column(df) + for df in (X_formatted_train, X_formatted_val, X_formatted_test) + ] + + scaler = time_series_object.scaler_fit("minmax", X_formatted_train) + + (scaled_train, scaled_val, scaled_test) = [ + time_series_object.scaler_transform(df) + for df in (X_formatted_train, X_formatted_val, X_formatted_test) + ] + + series_formatter_obj = TimeSeriesFormatter( + lb, ph, static_features, dynamic_features, True, False + ) + + W_train, X_train, y_train, z_train = series_formatter_obj.format_data( + scaled_train + ) + W_val, X_val, y_val, z_val = series_formatter_obj.format_data(scaled_val) + + W_test, X_test, y_test, z_test = series_formatter_obj.format_data(scaled_test) + + logging.info(f"Column order: {scaled_train.columns}") + + lookback_timesteps = lb + prediction_horizon = ph + + X_train = TimeSeriesFormatter.reshape_x(X_train) + X_val = TimeSeriesFormatter.reshape_x(X_val) + X_test = TimeSeriesFormatter.reshape_x(X_test) + + optimal_k = 8 + + traffic_model = KNNModel( + n_neighbors=optimal_k, weights="uniform", algorithm="kd_tree", p=2 + ) + traffic_model.train_model(X_train, y_train) + + y_train_hat = traffic_model.predict_model(X_train) + train_rmse = EvaluationMetrics(y_train, y_train_hat).rmse() + print("RMSE on Train Set:", train_rmse) + + y_val_hat = traffic_model.predict_model(X_val) + val_rmse = EvaluationMetrics(y_val, y_val_hat).rmse() + print("RMSE on Validation Set:", val_rmse) + + y_test_hat = traffic_model.predict_model(X_test) + test_rmse = EvaluationMetrics(y_test, y_test_hat).rmse() + print("RMSE on Test Set:", test_rmse) + + traffic_model.save_model("artifacts/knn_model") + time_series_object.save_scaler("artifacts/minmax_scaler.gz") diff --git a/src/utils.py b/src/utils.py index 8f64c4d..e7af2c1 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,4 +1,8 @@ import logging +from itertools import repeat + +import numpy as np +import pandas as pd def setup_logging(file_name="logfile.log"): @@ -11,3 +15,12 @@ def setup_logging(file_name="logfile.log"): filemode="w", ) return logging + + +def predicitons_to_df(ph, z_test, y_test_hat): + df_test = pd.DataFrame({"paris_id": [x for x in z_test for _ in repeat(None, ph)]}) + + df_test["time_idx"] = np.tile(np.arange(ph), len(z_test)) + df_test["preds"] = np.ravel(y_test_hat) + df_test["preds"] = df_test["preds"].astype(int) + return df_test