Skip to content

Commit

Permalink
Merge pull request #4 from vishalmhjn/dev
Browse files Browse the repository at this point in the history
Minor: Chores and Docs
  • Loading branch information
vishalmhjn authored Jun 16, 2024
2 parents b08e9b4 + 7d41bd4 commit 0c40a8e
Show file tree
Hide file tree
Showing 10 changed files with 55 additions and 61 deletions.
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ test:

run:
source $(VENV) && cd src/ && \
$(PYTHON) main.py
$(PYTHON) main.py -m knn -t

app:
source $(VENV) && cd src/ && \
$(PYTHON) app.py
$(PYTHON) app.py

clean:
rm -rf $(ENV)
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ make install

## Usage

Run the data collection, processing and machine learning pipeline:
Run the data collection, processing and machine learning pipeline (with default options):
```bash
make run
```
Expand Down
Empty file added model_output/.gitkeep
Empty file.
8 changes: 4 additions & 4 deletions src/call_data_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ def merge_data(self):
for i in self.list_links:
df = pd.read_csv(f"{self.read_path}/raw_data_{i}.csv")
df["t_1h"] = pd.to_datetime(df["t_1h"])
assert (
str(df["t_1h"].dt.date.min()) == self.date_formatted
), f"Data for previous day is not available via API yet. For other days, manually set the offsets in API query."

if str(df["t_1h"].dt.date.min()) == self.date_formatted:
full_data_list.append(df)
else:
logging.info("Data for %s detector is not available", i)
full_data_list.append(df)
full_data = pd.concat(full_data_list, axis=0)
full_data.to_csv(self.path, index=False)

Expand Down
2 changes: 1 addition & 1 deletion src/config_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from main import format_dates, func_path_data


previous_date, previous_date_formatted = format_dates(offset_days=1)
previous_date, previous_date_formatted = format_dates(day_delta=1)
current_date, current_date_formatted = format_dates()
file_raw_input = func_path_data(raw_data_folder, previous_date_formatted, "raw_data")
file_processed_input = func_path_data(
Expand Down
2 changes: 1 addition & 1 deletion src/config_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
TRAINING_PARAMS = {
"metric": "smape",
"training": True,
"model_output_dir": "model_output",
"model_output_dir": "../model_output",
"seed": 46,
"test_proportion": 0.15,
"validation_proportion": 0.15,
Expand Down
34 changes: 25 additions & 9 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
import argparse

from utils import format_dates, func_path_data, setup_logging

Expand All @@ -22,17 +23,33 @@
file_hist_trends = processed_data_folder / "link_historical_trends.csv"
file_sample_variance = processed_data_folder / "df_var_2023.csv"

parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model",
help="type of machine learning model",
choices=["knn", "xgboost"],
default="knn",
)
parser.add_argument(
"-t",
"--train",
action="store_true",
help="Whether train the model of not",
)
args = parser.parse_args()

if __name__ == "__main__":

# train model
model_trainer(file_model_train)
if args.train:
model_trainer(file_model_train, arg_model=args.model)

# dates for t-n-1 and t-n days
for days_offset in reversed(range(0, 2)):
previous_date, previous_date_formatted = format_dates(
offset_days=days_offset + 1
)
current_date, current_date_formatted = format_dates(days_offset)
for date_delta in reversed(range(0, 2)):

previous_date, previous_date_formatted = format_dates(day_delta=date_delta + 1)
current_date, current_date_formatted = format_dates(date_delta)
file_raw_input = func_path_data(
raw_data_folder, previous_date_formatted, "raw_data"
)
Expand All @@ -41,9 +58,7 @@
)

# run workflow
data_collector(
raw_data_folder, previous_date_formatted, offset=days_offset * 24
)
data_collector(raw_data_folder, previous_date_formatted, offset=date_delta * 24)
data_processor(
file_hist_trends,
file_static_attr,
Expand All @@ -55,4 +70,5 @@
PATH_PREDICTIONS,
file_processed_input,
current_date_formatted,
args_model=args.model,
)
33 changes: 10 additions & 23 deletions src/predict.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path
import argparse
from models import KNNModel, XGBoostModel
from dataset import DataSplitter, TimeSeriesScaler, TimeSeriesFormatter
from utils import setup_logging, predicitons_to_df

from config_model import FORECASTING_PARAMS, TRAINING_PARAMS
from config_model import (
Expand All @@ -12,28 +13,14 @@
dynamic_features,
)

from models import KNNModel, XGBoostModel
from dataset import DataSplitter, TimeSeriesScaler, TimeSeriesFormatter
from utils import setup_logging, predicitons_to_df

lb, ph = (FORECASTING_PARAMS["lb"], FORECASTING_PARAMS["ph"])
model_output = TRAINING_PARAMS["model_output_dir"]

parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model",
help="type of machine learning model",
choices=["knn", "xgboost"],
default="knn",
)
args = parser.parse_args()

# Set up logging
logging = setup_logging("predict.log")


def predictor(predictions_folder, file_processed_input, date_formatted):
def predictor(predictions_folder, file_processed_input, date_formatted, args_model):
data_object = DataSplitter(file_processed_input)
X_formatted = data_object.df

Expand All @@ -55,19 +42,19 @@ def predictor(predictions_folder, file_processed_input, date_formatted):

W_test, X_test, z_test = series_formatter_obj.format_data(scaled_test)
X_test = TimeSeriesFormatter.reshape_x(X_test)
if args.model == "knn":
if args_model == "knn":
traffic_model = KNNModel()
traffic_model.load_model(f"{model_output}/{args.model}_model")
elif args.model == "xgboost":
traffic_model.load_model(f"{model_output}/{args_model}_model")
elif args_model == "xgboost":
traffic_model = XGBoostModel()
traffic_model.load_model(f"artifacts/{args.model}_model")
logging.info(f"Model {args.model} successfully loaded.")
traffic_model.load_model(f"artifacts/{args_model}_model")
logging.info(f"Model {args_model} successfully loaded.")

y_test_hat = traffic_model.predict_model(X_test)

df_test = predicitons_to_df(ph, z_test, y_test_hat)
df_test.to_csv(
predictions_folder / f"{args.model}_{date_formatted}.csv",
predictions_folder / f"{args_model}_{date_formatted}.csv",
index=False,
)
logging.info(f"Predictions for {date_formatted} successful.")
24 changes: 6 additions & 18 deletions src/train.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import argparse

from config_model import TRAINING_PARAMS, FORECASTING_PARAMS
from config_model import (
continous_features,
Expand All @@ -17,24 +15,14 @@
# Set up logging
logging = setup_logging("train.log")

parser = argparse.ArgumentParser()
parser.add_argument(
"-m",
"--model",
help="type of machine learning model",
choices=["knn", "xgboost"],
default="knn",
)
args = parser.parse_args()


def model_trainer(path_train):
def model_trainer(path_train, arg_model):
"""wrapper for training recipe"""
data_object = DataSplitter(path_train)
X_formatted = data_object.df
_ = data_object.df

for lb, ph in [(FORECASTING_PARAMS["lb"], FORECASTING_PARAMS["ph"])]:
det_ids = data_object.get_groups
_ = data_object.get_groups

seed = TRAINING_PARAMS["seed"]
validation_prop = TRAINING_PARAMS["validation_proportion"]
Expand Down Expand Up @@ -81,12 +69,12 @@ def model_trainer(path_train):
X_val = TimeSeriesFormatter.reshape_x(X_val)
X_test = TimeSeriesFormatter.reshape_x(X_test)

if args.model == "knn":
if arg_model == "knn":
optimal_k = 2
traffic_model = KNNModel(
n_neighbors=optimal_k, weights="uniform", algorithm="kd_tree", p=2
)
elif args.model == "xgboost":
elif arg_model == "xgboost":
traffic_model = XGBoostModel(
n_estimators=300,
max_depth=5,
Expand All @@ -112,5 +100,5 @@ def model_trainer(path_train):
logging.info(f"RMSE on Test Set: {test_rmse}")

model_output = TRAINING_PARAMS["model_output_dir"]
traffic_model.save_model(f"{model_output}/{args.model}_model")
traffic_model.save_model(f"{model_output}/{arg_model}_model")
time_series_object.save_scaler(f"{model_output}/minmax_scaler.gz")
4 changes: 2 additions & 2 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ def predicitons_to_df(ph, z_test, y_test_hat):
return df_test


def format_dates(offset_days=0):
base_date = datetime.today() - timedelta(offset_days)
def format_dates(day_delta=0):
base_date = datetime.today() - timedelta(day_delta)
return base_date, base_date.strftime("%Y-%m-%d")


Expand Down

0 comments on commit 0c40a8e

Please sign in to comment.