Skip to content

Commit

Permalink
feat: add working train and predict recipes
Browse files Browse the repository at this point in the history
  • Loading branch information
vishalmhjn committed May 30, 2024
1 parent 6e62ff8 commit 4f6ebe9
Show file tree
Hide file tree
Showing 5 changed files with 255 additions and 7 deletions.
61 changes: 54 additions & 7 deletions src/dataset.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from abc import ABC, abstractmethod
import random
from itertools import groupby
from operator import itemgetter
from typing import List, Tuple
import numpy as np
import pandas as pd

from abc import ABC, abstractmethod
import random
import joblib
from sklearn.preprocessing import StandardScaler


class Dataset:
Expand Down Expand Up @@ -54,7 +56,28 @@ def split_data(self):
return train, test, val


class TimeSeriesScaler:
class DataScaler(ABC):
    """Abstract interface for feature scalers.

    Concrete implementations must know how to fit a scaler, apply it,
    and persist/restore it from disk.
    """

    def __init__(self) -> None:
        super().__init__()

    @abstractmethod
    def scaler_fit(self):
        """Fit the underlying scaler on training data."""

    @abstractmethod
    def scaler_transform(self):
        """Apply the fitted scaler to a dataset."""

    @abstractmethod
    def save_scaler(self):
        """Persist the fitted scaler to disk."""

    @abstractmethod
    def load_scaler(self):
        """Restore a previously saved scaler from disk."""


class TimeSeriesScaler(DataScaler):

def __init__(
self,
Expand All @@ -69,15 +92,20 @@ def __init__(
self.other_columns = other_columns
self.original_target_column = original_target_column
self.duplicated_target_column = duplicated_target_column
super().__init__()

def copy_target_column(self, _df):
    """Duplicate the original target column under the configured alias.

    Mutates *_df* in place (adds ``self.duplicated_target_column``) and
    returns the same frame for chaining.
    """
    source_values = _df[self.original_target_column]
    _df.loc[_df.index, self.duplicated_target_column] = source_values
    return _df

def scaler_fit(self, scaler, X):
self.scaler = scaler.fit(X[self.continous_features].values)
def scaler_fit(self, scaler_type, X):
    """Fit a scaler on the continuous feature columns of *X*.

    Parameters
    ----------
    scaler_type : str
        Which scaler to fit: ``"minmax"`` or ``"standard"``.
    X : pandas.DataFrame
        Training data containing ``self.continous_features`` columns.

    Returns
    -------
    The fitted scaler (also stored on ``self.scaler``).

    Raises
    ------
    NotImplementedError
        For any unsupported ``scaler_type``.
    """
    if scaler_type == "minmax":
        # BUG FIX: this branch previously constructed a StandardScaler,
        # contradicting both the requested type and the "minmax_scaler.gz"
        # artifact name used by the train/predict recipes.
        from sklearn.preprocessing import MinMaxScaler

        self.scaler = MinMaxScaler()
    elif scaler_type == "standard":
        self.scaler = StandardScaler()
    else:
        raise NotImplementedError(f"Unsupported scaler_type: {scaler_type!r}")
    self.scaler.fit(X[self.continous_features].values)
    return self.scaler

def scaler_transform(self, X):
Expand All @@ -93,6 +121,13 @@ def scaler_transform(self, X):
]
return scaled_features_df

def save_scaler(self, path):
    """Persist the fitted scaler (``self.scaler``) to *path* via joblib."""
    joblib.dump(self.scaler, path)

def load_scaler(self, path):
    """Load a scaler from *path*, store it on ``self.scaler``, and return it.

    Previously this returned None even though the predict recipe binds the
    result (``scaler = obj.load_scaler(...)``); returning the scaler is a
    backward-compatible fix for that caller pattern.
    """
    scaler = joblib.load(path)
    self.scaler = scaler
    return self.scaler


class TimeSeriesFormatter:

Expand All @@ -112,6 +147,13 @@ def __init__(
self.auto_regressive = auto_regressive
self.inference = inference

@staticmethod
def reshape_x(X, W=None, use_static=False):
X_reshaped = np.reshape(X, (X.shape[0], -1))
if use_static:
X_reshaped = np.hstack((W, X_reshaped))
return X_reshaped

def split_sequences(
self,
sequences: np.ndarray,
Expand Down Expand Up @@ -175,6 +217,11 @@ def format_data(self, df):
y_list.extend(y)
z_list.extend(z)
if self.inference:
return W_list, X_list, z_list
return np.array(W_list), np.array(X_list), np.array(z_list)
else:
return W_list, X_list, y_list, z_list
return (
np.array(W_list),
np.array(X_list),
np.array(y_list),
np.array(z_list),
)
58 changes: 58 additions & 0 deletions src/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any

import numpy as np
import pickle

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


class Model(ABC):
Expand All @@ -14,6 +20,18 @@ def train_model(self):
def predict_model(self):
pass

@abstractmethod
def cross_validation(self):
    """Select model hyper-parameters via cross-validation."""
    pass

@abstractmethod
def save_model(self):
    """Persist the trained model to disk."""
    pass

@abstractmethod
def load_model(self):
    """Restore a previously saved model from disk."""
    pass


class KNNModel(Model):

Expand All @@ -26,3 +44,43 @@ def train_model(self, X, y):

def predict_model(self, X):
    """Return predictions of the fitted KNN model for feature matrix *X*."""
    return self.model.predict(X)

def cross_validation(self, X, y, lower_k, upper_k):
    """Find the best number of neighbors k in ``[lower_k, upper_k)``.

    Runs 5-fold cross-validation with negative-RMSE scoring on
    distance-weighted KNN regressors and returns the k with the highest
    mean score (i.e. lowest RMSE).

    Parameters
    ----------
    X, y : array-like
        Training features and targets.
    lower_k, upper_k : int
        Half-open candidate range for ``n_neighbors``.

    Raises
    ------
    ValueError
        If the candidate range is empty; previously this surfaced as an
        opaque "argmax of an empty sequence" error from NumPy.
    """
    k_values = list(range(lower_k, upper_k))
    if not k_values:
        raise ValueError(
            f"Empty candidate range: lower_k={lower_k}, upper_k={upper_k}"
        )
    cv_scores = [
        np.mean(
            cross_val_score(
                KNeighborsRegressor(
                    n_neighbors=k, weights="distance", algorithm="auto", p=2
                ),
                X,
                y,
                cv=5,
                verbose=3,
                scoring="neg_root_mean_squared_error",
            )
        )
        for k in k_values
    ]
    # Scores are negative RMSE, so argmax picks the lowest error.
    optimal_k = k_values[np.argmax(cv_scores)]
    return optimal_k

def save_model(self, path):
    """Pickle the trained model to *path*.

    Uses a context manager so the file handle is closed even if
    ``pickle.dump`` raises (the original left the handle open on error).
    """
    with open(path, "wb") as saved_model:
        pickle.dump(self.model, saved_model)

def load_model(self, path):
    """Unpickle a model from *path* into ``self.model``.

    The original never closed the file object returned by ``open``;
    a context manager guarantees the handle is released.
    """
    with open(path, "rb") as saved_model:
        self.model = pickle.load(saved_model)


class EvaluationMetrics:
    """Regression error metrics between ground truth *y* and predictions *y_hat*."""

    def __init__(self, y, y_hat) -> None:
        # y: true target values; y_hat: model predictions (array-like).
        self.y = y
        self.y_hat = y_hat

    def mse(self):
        """Mean squared error of the predictions."""
        return mean_squared_error(self.y, self.y_hat)

    def rmse(self):
        """Root mean squared error, i.e. ``sqrt(mse())``."""
        return np.sqrt(self.mse())
44 changes: 44 additions & 0 deletions src/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from pathlib import Path
from config_model import *
from models import KNNModel
from dataset import DataSplitter, TimeSeriesScaler, TimeSeriesFormatter
from utils import setup_logging, predicitons_to_df

# Inference configuration (names come from the `config_model` star import).
inference_date = INFERENCE_INPUT_DATE
inference_file = file_processed_input

# Lookback window (lb) and prediction horizon (ph) for sequence formatting.
lb, ph = (MODEL_PARAMS["lb"], MODEL_PARAMS["ph"])
# Set up logging
# NOTE(review): this rebinds the name "logging" to whatever setup_logging
# returns, shadowing the stdlib module name — confirm intended.
logging = setup_logging("predict.log")

if __name__ == "__main__":

    # Load the already-processed inference data.
    data_object = DataSplitter(inference_file)
    X_formatted = data_object.df

    time_series_object = TimeSeriesScaler(
        continous_features,
        categorical_features,
        other_columns,
        target_as_autoregressive_feature,
        target_column,
    )
    # load_scaler stores the scaler on the object and returns None, so the
    # previous `scaler = ...` binding was dead and has been dropped.
    time_series_object.load_scaler("artifacts/minmax_scaler.gz")
    scaled_test = time_series_object.scaler_transform(X_formatted)

    # inference=True -> format_data yields (W, X, z) without target arrays.
    series_formatter_obj = TimeSeriesFormatter(
        lb, ph, static_features, dynamic_features, True, True
    )

    W_test, X_test, z_test = series_formatter_obj.format_data(scaled_test)
    # Flatten each lookback window into one feature row for the KNN model.
    X_test = TimeSeriesFormatter.reshape_x(X_test)

    traffic_model = KNNModel()
    traffic_model.load_model("artifacts/knn_model")

    y_test_hat = traffic_model.predict_model(X_test)

    # Persist predictions in long format, one row per (id, horizon step).
    df_test = predicitons_to_df(ph, z_test, y_test_hat)
    df_test.to_csv(
        Path("..") / "predictions" / f"knn_{INFERENCE_PREDICTION_DATE}.csv", index=False
    )
86 changes: 86 additions & 0 deletions src/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from config_model import *
from utils import setup_logging
from models import KNNModel, EvaluationMetrics
from dataset import DataSplitter, TimeSeriesScaler, TimeSeriesFormatter

# Set up logging
# NOTE(review): this rebinds the name "logging" to whatever setup_logging
# returns, shadowing the stdlib module name — confirm intended.
logging = setup_logging("train.log")


if __name__ == "__main__":

    data_object = DataSplitter(data_path)
    # NOTE(review): X_formatted is not used below (the splits come from
    # split_data()); kept in case reading .df triggers lazy loading.
    X_formatted = data_object.df

    # Single (lookback, horizon) configuration; the loop shape allows
    # sweeping several configurations later.
    for lb, ph in [(MODEL_PARAMS["lb"], MODEL_PARAMS["ph"])]:
        # NOTE(review): det_ids is unused; get_groups may populate internal
        # state needed by split_groups — confirm before removing.
        det_ids = data_object.get_groups

        seed = CONFIG["seed"]
        validation_prop = CONFIG["validation_proportion"]
        test_prop = CONFIG["test_proportion"]

        data_object.split_groups(seed, validation_prop, test_prop)

        X_formatted_train, X_formatted_val, X_formatted_test = data_object.split_data()

        time_series_object = TimeSeriesScaler(
            continous_features,
            categorical_features,
            other_columns,
            target_as_autoregressive_feature,
            target_column,
        )

        # Duplicate the target column so it can double as an autoregressive
        # input feature after scaling.
        (X_formatted_train, X_formatted_val, X_formatted_test) = [
            time_series_object.copy_target_column(df)
            for df in (X_formatted_train, X_formatted_val, X_formatted_test)
        ]

        # Fit on the training split only, to avoid leakage into val/test.
        scaler = time_series_object.scaler_fit("minmax", X_formatted_train)

        (scaled_train, scaled_val, scaled_test) = [
            time_series_object.scaler_transform(df)
            for df in (X_formatted_train, X_formatted_val, X_formatted_test)
        ]

        series_formatter_obj = TimeSeriesFormatter(
            lb, ph, static_features, dynamic_features, True, False
        )

        W_train, X_train, y_train, z_train = series_formatter_obj.format_data(
            scaled_train
        )
        W_val, X_val, y_val, z_val = series_formatter_obj.format_data(scaled_val)

        W_test, X_test, y_test, z_test = series_formatter_obj.format_data(scaled_test)

        logging.info(f"Column order: {scaled_train.columns}")

        # (Removed unused aliases lookback_timesteps/prediction_horizon —
        # lb and ph are used directly.)

        # Flatten each lookback window into one feature row for KNN.
        X_train = TimeSeriesFormatter.reshape_x(X_train)
        X_val = TimeSeriesFormatter.reshape_x(X_val)
        X_test = TimeSeriesFormatter.reshape_x(X_test)

        # Hard-coded k; KNNModel.cross_validation can tune this instead.
        optimal_k = 8

        traffic_model = KNNModel(
            n_neighbors=optimal_k, weights="uniform", algorithm="kd_tree", p=2
        )
        traffic_model.train_model(X_train, y_train)

        y_train_hat = traffic_model.predict_model(X_train)
        train_rmse = EvaluationMetrics(y_train, y_train_hat).rmse()
        print("RMSE on Train Set:", train_rmse)

        y_val_hat = traffic_model.predict_model(X_val)
        val_rmse = EvaluationMetrics(y_val, y_val_hat).rmse()
        print("RMSE on Validation Set:", val_rmse)

        y_test_hat = traffic_model.predict_model(X_test)
        test_rmse = EvaluationMetrics(y_test, y_test_hat).rmse()
        print("RMSE on Test Set:", test_rmse)

        # Persist artifacts consumed by the predict recipe.
        traffic_model.save_model("artifacts/knn_model")
        time_series_object.save_scaler("artifacts/minmax_scaler.gz")
13 changes: 13 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
import logging
from itertools import repeat

import numpy as np
import pandas as pd


def setup_logging(file_name="logfile.log"):
Expand All @@ -11,3 +15,12 @@ def setup_logging(file_name="logfile.log"):
filemode="w",
)
return logging


def predicitons_to_df(ph, z_test, y_test_hat):
    """Assemble predictions into a long-format DataFrame.

    Each id in *z_test* is repeated *ph* times (one row per horizon step),
    paired with a 0..ph-1 ``time_idx`` and the flattened, integer-truncated
    predictions.

    NOTE(review): the misspelled name ("predicitons") is kept because
    callers import it as-is.
    """
    ids_repeated = [sample_id for sample_id in z_test for _ in range(ph)]
    frame = pd.DataFrame({"paris_id": ids_repeated})
    frame["time_idx"] = np.tile(np.arange(ph), len(z_test))
    frame["preds"] = np.ravel(y_test_hat).astype(int)
    return frame

0 comments on commit 4f6ebe9

Please sign in to comment.